migration/ram.c [qemu/armbru.git]
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration-stats.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-types-migration.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/cpu-throttle.h"
57 #include "savevm.h"
58 #include "qemu/iov.h"
59 #include "multifd.h"
60 #include "sysemu/runstate.h"
61 #include "options.h"
63 #include "hw/boards.h" /* for machine_dump_guest_core() */
65 #if defined(__linux__)
66 #include "qemu/userfaultfd.h"
67 #endif /* defined(__linux__) */
69 /***********************************************************/
70 /* ram save/restore */
73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
74 * worked for pages that were filled with the same char. We switched
75 * it to only search for the zero value, and renamed it to avoid
76 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
81 #define RAM_SAVE_FLAG_FULL 0x01
82 #define RAM_SAVE_FLAG_ZERO 0x02
83 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
84 #define RAM_SAVE_FLAG_PAGE 0x08
85 #define RAM_SAVE_FLAG_EOS 0x10
86 #define RAM_SAVE_FLAG_CONTINUE 0x20
87 #define RAM_SAVE_FLAG_XBZRLE 0x40
88 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
89 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
90 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
91 /* We can't use any flag that is bigger than 0x200 */
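/*
 * A sketch of why (assumed from the usage below): these flags are OR'ed
 * into the low bits of the TARGET_PAGE_SIZE-aligned offset that
 * save_page_header() writes, and the smallest target page size QEMU
 * supports (1 KiB) only leaves bits 0..9 of the offset free, so no flag
 * above 0x200 can be encoded.
 */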
93 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
94 uint8_t *, int) = xbzrle_encode_buffer;
95 #if defined(CONFIG_AVX512BW_OPT)
96 #include "qemu/cpuid.h"
97 static void __attribute__((constructor)) init_cpu_flag(void)
99 unsigned max = __get_cpuid_max(0, NULL);
100 int a, b, c, d;
101 if (max >= 1) {
102 __cpuid(1, a, b, c, d);
103 /* We must check that AVX is not just available, but usable. */
104 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
105 int bv;
106 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
107 __cpuid_count(7, 0, a, b, c, d);
108 /* 0xe6:
109 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
110 * and ZMM16-ZMM31 state are enabled by OS)
111 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
113 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
114 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
119 #endif
121 XBZRLECacheStats xbzrle_counters;
123 /* used by the search for pages to send */
124 struct PageSearchStatus {
125 /* The migration channel used for a specific host page */
126 QEMUFile *pss_channel;
127 /* Last block from where we have sent data */
128 RAMBlock *last_sent_block;
129 /* Current block being searched */
130 RAMBlock *block;
131 /* Current page to search from */
132 unsigned long page;
133 /* Set once we wrap around */
134 bool complete_round;
135 /* Whether we're sending a host page */
136 bool host_page_sending;
137 /* The start/end of current host page. Invalid if host_page_sending==false */
138 unsigned long host_page_start;
139 unsigned long host_page_end;
141 typedef struct PageSearchStatus PageSearchStatus;
143 /* This struct contains the XBZRLE cache and a static page
144 used by the compression */
145 static struct {
146 /* buffer used for XBZRLE encoding */
147 uint8_t *encoded_buf;
148 /* buffer for storing page content */
149 uint8_t *current_buf;
150 /* Cache for XBZRLE, Protected by lock. */
151 PageCache *cache;
152 QemuMutex lock;
153 /* it will store a page full of zeros */
154 uint8_t *zero_target_page;
155 /* buffer used for XBZRLE decoding */
156 uint8_t *decoded_buf;
157 } XBZRLE;
159 static void XBZRLE_cache_lock(void)
161 if (migrate_xbzrle()) {
162 qemu_mutex_lock(&XBZRLE.lock);
166 static void XBZRLE_cache_unlock(void)
168 if (migrate_xbzrle()) {
169 qemu_mutex_unlock(&XBZRLE.lock);
174 * xbzrle_cache_resize: resize the xbzrle cache
176 * This function is called from migrate_params_apply in the main
177 * thread, possibly while a migration is in progress. A running
178 * migration may be using the cache and might finish during this call,
179 * hence changes to the cache are protected by the XBZRLE.lock mutex.
181 * Returns 0 for success or -1 for error
183 * @new_size: new cache size
184 * @errp: set *errp if the check failed, with reason
186 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
188 PageCache *new_cache;
189 int64_t ret = 0;
191 /* Check for truncation */
192 if (new_size != (size_t)new_size) {
193 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
194 "exceeding address space");
195 return -1;
198 if (new_size == migrate_xbzrle_cache_size()) {
199 /* nothing to do */
200 return 0;
203 XBZRLE_cache_lock();
205 if (XBZRLE.cache != NULL) {
206 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
207 if (!new_cache) {
208 ret = -1;
209 goto out;
212 cache_fini(XBZRLE.cache);
213 XBZRLE.cache = new_cache;
215 out:
216 XBZRLE_cache_unlock();
217 return ret;
220 static bool postcopy_preempt_active(void)
222 return migrate_postcopy_preempt() && migration_in_postcopy();
225 bool ramblock_is_ignored(RAMBlock *block)
227 return !qemu_ram_is_migratable(block) ||
228 (migrate_ignore_shared() && qemu_ram_is_shared(block));
231 #undef RAMBLOCK_FOREACH
233 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
235 RAMBlock *block;
236 int ret = 0;
238 RCU_READ_LOCK_GUARD();
240 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
241 ret = func(block, opaque);
242 if (ret) {
243 break;
246 return ret;
249 static void ramblock_recv_map_init(void)
251 RAMBlock *rb;
253 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
254 assert(!rb->receivedmap);
255 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
259 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
261 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
262 rb->receivedmap);
265 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
267 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
270 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
272 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
275 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
276 size_t nr)
278 bitmap_set_atomic(rb->receivedmap,
279 ramblock_recv_bitmap_offset(host_addr, rb),
280 nr);
283 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
286 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
288 * Returns >0 if success with sent bytes, or <0 if error.
290 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
291 const char *block_name)
293 RAMBlock *block = qemu_ram_block_by_name(block_name);
294 unsigned long *le_bitmap, nbits;
295 uint64_t size;
297 if (!block) {
298 error_report("%s: invalid block name: %s", __func__, block_name);
299 return -1;
302 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
305 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
306 * machines we may need 4 more bytes for padding (see below
307 * comment). So extend it a bit beforehand.
309 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
312 * Always use little endian when sending the bitmap. This is
313 * required so that it still works when the source and destination VMs
314 * do not use the same endianness. (Note: big endian won't work.)
316 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
318 /* Size of the bitmap, in bytes */
319 size = DIV_ROUND_UP(nbits, 8);
322 * size is always aligned to 8 bytes for 64bit machines, but it
323 * may not be true for 32bit machines. We need this padding to
324 * make sure the migration can survive even between 32bit and
325 * 64bit machines.
327 size = ROUND_UP(size, 8);
329 qemu_put_be64(file, size);
330 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
332 * Mark as an end, in case the middle part is screwed up due to
333 * some "mysterious" reason.
335 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
336 qemu_fflush(file);
338 g_free(le_bitmap);
340 if (qemu_file_get_error(file)) {
341 return qemu_file_get_error(file);
344 return size + sizeof(size);
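/*
 * Rough sketch of the stream layout produced above: an 8 byte big-endian
 * bitmap size, the little-endian bitmap itself padded up to a multiple of
 * 8 bytes, and finally the 8 byte RAMBLOCK_RECV_BITMAP_ENDING marker that
 * lets the peer sanity-check the transfer.
 */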
348 * An outstanding page request, on the source, having been received
349 * and queued
351 struct RAMSrcPageRequest {
352 RAMBlock *rb;
353 hwaddr offset;
354 hwaddr len;
356 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
359 /* State of RAM for migration */
360 struct RAMState {
362 * PageSearchStatus structures for the channels when sending pages.
363 * Protected by the bitmap_mutex.
365 PageSearchStatus pss[RAM_CHANNEL_MAX];
366 /* UFFD file descriptor, used in 'write-tracking' migration */
367 int uffdio_fd;
368 /* total ram size in bytes */
369 uint64_t ram_bytes_total;
370 /* Last block that we have visited searching for dirty pages */
371 RAMBlock *last_seen_block;
372 /* Last dirty target page we have sent */
373 ram_addr_t last_page;
374 /* last ram version we have seen */
375 uint32_t last_version;
376 /* How many times we have dirty too many pages */
377 int dirty_rate_high_cnt;
378 /* these variables are used for bitmap sync */
379 /* last time we did a full bitmap_sync */
380 int64_t time_last_bitmap_sync;
381 /* bytes transferred at start_time */
382 uint64_t bytes_xfer_prev;
383 /* number of dirty pages since start_time */
384 uint64_t num_dirty_pages_period;
385 /* xbzrle misses since the beginning of the period */
386 uint64_t xbzrle_cache_miss_prev;
387 /* Amount of xbzrle pages since the beginning of the period */
388 uint64_t xbzrle_pages_prev;
389 /* Amount of xbzrle encoded bytes since the beginning of the period */
390 uint64_t xbzrle_bytes_prev;
391 /* Are we really using XBZRLE (e.g., after the first round). */
392 bool xbzrle_started;
393 /* Are we on the last stage of migration */
394 bool last_stage;
395 /* compression statistics since the beginning of the period */
396 /* number of times there was no free thread to compress data */
397 uint64_t compress_thread_busy_prev;
398 /* amount of bytes after compression */
399 uint64_t compressed_size_prev;
400 /* amount of compressed pages */
401 uint64_t compress_pages_prev;
403 /* total handled target pages at the beginning of period */
404 uint64_t target_page_count_prev;
405 /* total handled target pages since start */
406 uint64_t target_page_count;
407 /* number of dirty bits in the bitmap */
408 uint64_t migration_dirty_pages;
410 * Protects:
411 * - dirty/clear bitmap
412 * - migration_dirty_pages
413 * - pss structures
415 QemuMutex bitmap_mutex;
416 /* The RAMBlock used in the last src_page_requests */
417 RAMBlock *last_req_rb;
418 /* Queue of outstanding page requests from the destination */
419 QemuMutex src_page_req_mutex;
420 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
422 typedef struct RAMState RAMState;
424 static RAMState *ram_state;
426 static NotifierWithReturnList precopy_notifier_list;
428 /* Whether postcopy has queued requests? */
429 static bool postcopy_has_request(RAMState *rs)
431 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
434 void precopy_infrastructure_init(void)
436 notifier_with_return_list_init(&precopy_notifier_list);
439 void precopy_add_notifier(NotifierWithReturn *n)
441 notifier_with_return_list_add(&precopy_notifier_list, n);
444 void precopy_remove_notifier(NotifierWithReturn *n)
446 notifier_with_return_remove(n);
449 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
451 PrecopyNotifyData pnd;
452 pnd.reason = reason;
453 pnd.errp = errp;
455 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
458 uint64_t ram_bytes_remaining(void)
460 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
464 void ram_transferred_add(uint64_t bytes)
466 if (runstate_is_running()) {
467 stat64_add(&mig_stats.precopy_bytes, bytes);
468 } else if (migration_in_postcopy()) {
469 stat64_add(&mig_stats.postcopy_bytes, bytes);
470 } else {
471 stat64_add(&mig_stats.downtime_bytes, bytes);
473 stat64_add(&mig_stats.transferred, bytes);
476 struct MigrationOps {
477 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
479 typedef struct MigrationOps MigrationOps;
481 MigrationOps *migration_ops;
483 CompressionStats compression_counters;
485 struct CompressParam {
486 bool done;
487 bool quit;
488 bool zero_page;
489 QEMUFile *file;
490 QemuMutex mutex;
491 QemuCond cond;
492 RAMBlock *block;
493 ram_addr_t offset;
495 /* internally used fields */
496 z_stream stream;
497 uint8_t *originbuf;
499 typedef struct CompressParam CompressParam;
501 struct DecompressParam {
502 bool done;
503 bool quit;
504 QemuMutex mutex;
505 QemuCond cond;
506 void *des;
507 uint8_t *compbuf;
508 int len;
509 z_stream stream;
511 typedef struct DecompressParam DecompressParam;
513 static CompressParam *comp_param;
514 static QemuThread *compress_threads;
515 /* comp_done_cond is used to wake up the migration thread when
516 * one of the compression threads has finished the compression.
517 * comp_done_lock is used together with comp_done_cond.
519 static QemuMutex comp_done_lock;
520 static QemuCond comp_done_cond;
522 static QEMUFile *decomp_file;
523 static DecompressParam *decomp_param;
524 static QemuThread *decompress_threads;
525 static QemuMutex decomp_done_lock;
526 static QemuCond decomp_done_cond;
528 static int ram_save_host_page_urgent(PageSearchStatus *pss);
530 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
531 ram_addr_t offset, uint8_t *source_buf);
533 /* NOTE: page is the PFN not real ram_addr_t. */
534 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
536 pss->block = rb;
537 pss->page = page;
538 pss->complete_round = false;
542 * Check whether two PSSs are actively sending the same page. Return true
543 * if it is, false otherwise.
545 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
547 return pss1->host_page_sending && pss2->host_page_sending &&
548 (pss1->host_page_start == pss2->host_page_start);
551 static void *do_data_compress(void *opaque)
553 CompressParam *param = opaque;
554 RAMBlock *block;
555 ram_addr_t offset;
556 bool zero_page;
558 qemu_mutex_lock(&param->mutex);
559 while (!param->quit) {
560 if (param->block) {
561 block = param->block;
562 offset = param->offset;
563 param->block = NULL;
564 qemu_mutex_unlock(&param->mutex);
566 zero_page = do_compress_ram_page(param->file, &param->stream,
567 block, offset, param->originbuf);
569 qemu_mutex_lock(&comp_done_lock);
570 param->done = true;
571 param->zero_page = zero_page;
572 qemu_cond_signal(&comp_done_cond);
573 qemu_mutex_unlock(&comp_done_lock);
575 qemu_mutex_lock(&param->mutex);
576 } else {
577 qemu_cond_wait(&param->cond, &param->mutex);
580 qemu_mutex_unlock(&param->mutex);
582 return NULL;
585 static void compress_threads_save_cleanup(void)
587 int i, thread_count;
589 if (!migrate_compress() || !comp_param) {
590 return;
593 thread_count = migrate_compress_threads();
594 for (i = 0; i < thread_count; i++) {
596 * we use it as an indicator of whether the thread has been
597 * properly initialized or not
599 if (!comp_param[i].file) {
600 break;
603 qemu_mutex_lock(&comp_param[i].mutex);
604 comp_param[i].quit = true;
605 qemu_cond_signal(&comp_param[i].cond);
606 qemu_mutex_unlock(&comp_param[i].mutex);
608 qemu_thread_join(compress_threads + i);
609 qemu_mutex_destroy(&comp_param[i].mutex);
610 qemu_cond_destroy(&comp_param[i].cond);
611 deflateEnd(&comp_param[i].stream);
612 g_free(comp_param[i].originbuf);
613 qemu_fclose(comp_param[i].file);
614 comp_param[i].file = NULL;
616 qemu_mutex_destroy(&comp_done_lock);
617 qemu_cond_destroy(&comp_done_cond);
618 g_free(compress_threads);
619 g_free(comp_param);
620 compress_threads = NULL;
621 comp_param = NULL;
624 static int compress_threads_save_setup(void)
626 int i, thread_count;
628 if (!migrate_compress()) {
629 return 0;
631 thread_count = migrate_compress_threads();
632 compress_threads = g_new0(QemuThread, thread_count);
633 comp_param = g_new0(CompressParam, thread_count);
634 qemu_cond_init(&comp_done_cond);
635 qemu_mutex_init(&comp_done_lock);
636 for (i = 0; i < thread_count; i++) {
637 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
638 if (!comp_param[i].originbuf) {
639 goto exit;
642 if (deflateInit(&comp_param[i].stream,
643 migrate_compress_level()) != Z_OK) {
644 g_free(comp_param[i].originbuf);
645 goto exit;
648 /* comp_param[i].file is just used as a dummy buffer to save data,
649 * so set its ops to empty.
651 comp_param[i].file = qemu_file_new_output(
652 QIO_CHANNEL(qio_channel_null_new()));
653 comp_param[i].done = true;
654 comp_param[i].quit = false;
655 qemu_mutex_init(&comp_param[i].mutex);
656 qemu_cond_init(&comp_param[i].cond);
657 qemu_thread_create(compress_threads + i, "compress",
658 do_data_compress, comp_param + i,
659 QEMU_THREAD_JOINABLE);
661 return 0;
663 exit:
664 compress_threads_save_cleanup();
665 return -1;
669 * save_page_header: write page header to wire
671 * If the block differs from the last block sent, it also writes the block identification
673 * Returns the number of bytes written
675 * @pss: current PSS channel status
676 * @block: block that contains the page we want to send
677 * @offset: offset inside the block for the page;
678 * the lower bits contain flags
680 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
681 RAMBlock *block, ram_addr_t offset)
683 size_t size, len;
684 bool same_block = (block == pss->last_sent_block);
686 if (same_block) {
687 offset |= RAM_SAVE_FLAG_CONTINUE;
689 qemu_put_be64(f, offset);
690 size = 8;
692 if (!same_block) {
693 len = strlen(block->idstr);
694 qemu_put_byte(f, len);
695 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
696 size += 1 + len;
697 pss->last_sent_block = block;
699 return size;
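/*
 * Sketch of the header layout written above: an 8 byte big-endian word
 * holding (offset | flags), optionally followed -- only when the block
 * differs from the last one sent -- by a 1 byte idstr length and the
 * idstr bytes. The returned size is therefore 8, or 8 + 1 + strlen(idstr).
 */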
703 * mig_throttle_guest_down: throttle down the guest
705 * Reduce amount of guest cpu execution to hopefully slow down memory
706 * writes. If guest dirty memory rate is reduced below the rate at
707 * which we can transfer pages to the destination then we should be
708 * able to complete migration. Some workloads dirty memory way too
709 * fast and will not effectively converge, even with auto-converge.
711 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
712 uint64_t bytes_dirty_threshold)
714 uint64_t pct_initial = migrate_cpu_throttle_initial();
715 uint64_t pct_increment = migrate_cpu_throttle_increment();
716 bool pct_tailslow = migrate_cpu_throttle_tailslow();
717 int pct_max = migrate_max_cpu_throttle();
719 uint64_t throttle_now = cpu_throttle_get_percentage();
720 uint64_t cpu_now, cpu_ideal, throttle_inc;
722 /* We have not started throttling yet. Let's start it. */
723 if (!cpu_throttle_active()) {
724 cpu_throttle_set(pct_initial);
725 } else {
726 /* Throttling already on, just increase the rate */
727 if (!pct_tailslow) {
728 throttle_inc = pct_increment;
729 } else {
730 /* Compute the ideal CPU percentage used by Guest, which may
731 * make the dirty rate match the dirty rate threshold. */
732 cpu_now = 100 - throttle_now;
733 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
734 bytes_dirty_period);
735 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
737 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
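/*
 * Worked example with hypothetical numbers: if throttle_now is 20, the
 * guest keeps cpu_now = 80.  With bytes_dirty_threshold half of
 * bytes_dirty_period, cpu_ideal = 80 * 0.5 = 40, so the tailslow path
 * raises the throttle by MIN(80 - 40, pct_increment) rather than by a
 * blind pct_increment step.
 */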
741 void mig_throttle_counter_reset(void)
743 RAMState *rs = ram_state;
745 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
746 rs->num_dirty_pages_period = 0;
747 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
751 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
753 * @rs: current RAM state
754 * @current_addr: address for the zero page
756 * Update the xbzrle cache to reflect a page that's been sent as all 0.
757 * The important thing is that a stale (not-yet-0'd) page be replaced
758 * by the new data.
759 * As a bonus, if the page wasn't in the cache it gets added so that
760 * when a small write is made into the 0'd page it gets XBZRLE sent.
762 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
764 /* We don't care if this fails to allocate a new cache page
765 * as long as it updated an old one */
766 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
767 stat64_get(&mig_stats.dirty_sync_count));
770 #define ENCODING_FLAG_XBZRLE 0x1
773 * save_xbzrle_page: compress and send current page
775 * Returns: 1 means that we wrote the page
776 * 0 means that page is identical to the one already sent
777 * -1 means that xbzrle would be longer than normal
779 * @rs: current RAM state
780 * @pss: current PSS channel
781 * @current_data: pointer to the address of the page contents
782 * @current_addr: addr of the page
783 * @block: block that contains the page we want to send
784 * @offset: offset inside the block for the page
786 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
787 uint8_t **current_data, ram_addr_t current_addr,
788 RAMBlock *block, ram_addr_t offset)
790 int encoded_len = 0, bytes_xbzrle;
791 uint8_t *prev_cached_page;
792 QEMUFile *file = pss->pss_channel;
793 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
795 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
796 xbzrle_counters.cache_miss++;
797 if (!rs->last_stage) {
798 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
799 generation) == -1) {
800 return -1;
801 } else {
802 /* update *current_data when the page has been
803 inserted into cache */
804 *current_data = get_cached_data(XBZRLE.cache, current_addr);
807 return -1;
811 * Reaching here means the page has hit the xbzrle cache, no matter what
812 * encoding result it is (normal encoding, overflow or skipping the page),
813 * count the page as encoded. This is used to calculate the encoding rate.
815 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
816 * 2nd page turns out to be skipped (i.e. no new bytes written to the
817 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
818 * skipped page included. In this way, the encoding rate can tell if the
819 * guest page is good for xbzrle encoding.
821 xbzrle_counters.pages++;
822 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
824 /* save current buffer into memory */
825 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
827 /* XBZRLE encoding (if there is no overflow) */
828 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
829 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
830 TARGET_PAGE_SIZE);
833 * Update the cache contents, so that it corresponds to the data
834 * sent, in all cases except where we skip the page.
836 if (!rs->last_stage && encoded_len != 0) {
837 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
839 * In the case where we couldn't compress, ensure that the caller
840 * sends the data from the cache, since the guest might have
841 * changed the RAM since we copied it.
843 *current_data = prev_cached_page;
846 if (encoded_len == 0) {
847 trace_save_xbzrle_page_skipping();
848 return 0;
849 } else if (encoded_len == -1) {
850 trace_save_xbzrle_page_overflow();
851 xbzrle_counters.overflow++;
852 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
853 return -1;
856 /* Send XBZRLE based compressed page */
857 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
858 offset | RAM_SAVE_FLAG_XBZRLE);
859 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
860 qemu_put_be16(file, encoded_len);
861 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
862 bytes_xbzrle += encoded_len + 1 + 2;
864 * Like compressed_size (please see update_compress_thread_counts),
865 * the xbzrle encoded bytes don't count the 8 byte header with
866 * RAM_SAVE_FLAG_CONTINUE.
868 xbzrle_counters.bytes += bytes_xbzrle - 8;
869 ram_transferred_add(bytes_xbzrle);
871 return 1;
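/*
 * For reference, a page sent by the code above looks like this on the
 * wire: the page header with RAM_SAVE_FLAG_XBZRLE set, one
 * ENCODING_FLAG_XBZRLE byte, a big-endian 16 bit encoded length, then
 * encoded_len bytes of XBZRLE data -- hence the "+ 1 + 2" accounting.
 */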
875 * pss_find_next_dirty: find the next dirty page of current ramblock
877 * This function updates pss->page to point to the next dirty page index
878 * within the ramblock to migrate, or the end of ramblock when nothing
879 * found. Note that when pss->host_page_sending==true it means we're
880 * in the middle of sending a host page, so we won't look for dirty pages
881 * outside the host page boundary.
883 * @pss: the current page search status
885 static void pss_find_next_dirty(PageSearchStatus *pss)
887 RAMBlock *rb = pss->block;
888 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
889 unsigned long *bitmap = rb->bmap;
891 if (ramblock_is_ignored(rb)) {
892 /* Points directly to the end, so we know no dirty page */
893 pss->page = size;
894 return;
898 * If we are sending a host page, only look for dirty pages within the
899 * current host page being sent.
901 if (pss->host_page_sending) {
902 assert(pss->host_page_end);
903 size = MIN(size, pss->host_page_end);
906 pss->page = find_next_bit(bitmap, size, pss->page);
909 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
910 unsigned long page)
912 uint8_t shift;
913 hwaddr size, start;
915 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
916 return;
919 shift = rb->clear_bmap_shift;
921 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
922 * can make things easier sometimes since the start address
923 * of the small chunk will then always be aligned to 64 pages, so the
924 * bitmap will always be aligned to unsigned long. We should
925 * even be able to remove this restriction but I'm simply
926 * keeping it.
928 assert(shift >= 6);
930 size = 1ULL << (TARGET_PAGE_BITS + shift);
931 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
932 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
933 memory_region_clear_dirty_bitmap(rb->mr, start, size);
936 static void
937 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
938 unsigned long start,
939 unsigned long npages)
941 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
942 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
943 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
946 * Clear pages from start to start + npages - 1, so the end boundary is
947 * exclusive.
949 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
950 migration_clear_memory_region_dirty_bitmap(rb, i);
955 * colo_bitmap_find_dirty: find contiguous dirty pages from start
957 * Returns the page offset within the memory region of the start of the
958 * contiguous dirty pages
960 * @rs: current RAM state
961 * @rb: RAMBlock where to search for dirty pages
962 * @start: page where we start the search
963 * @num: the number of contiguous dirty pages
965 static inline
966 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
967 unsigned long start, unsigned long *num)
969 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
970 unsigned long *bitmap = rb->bmap;
971 unsigned long first, next;
973 *num = 0;
975 if (ramblock_is_ignored(rb)) {
976 return size;
979 first = find_next_bit(bitmap, size, start);
980 if (first >= size) {
981 return first;
983 next = find_next_zero_bit(bitmap, size, first + 1);
984 assert(next >= first);
985 *num = next - first;
986 return first;
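/*
 * Example (hypothetical bitmap): with bits 3, 4 and 5 set and start == 0,
 * find_next_bit() returns first = 3, find_next_zero_bit() returns 6, so
 * *num = 3 and the caller gets one run of three contiguous dirty pages.
 */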
989 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
990 RAMBlock *rb,
991 unsigned long page)
993 bool ret;
996 * Clear dirty bitmap if needed. This _must_ be called before we
997 * send any of the pages in the chunk because we need to make sure
998 * we can capture further page content changes when we sync the dirty
999 * log the next time. So as long as we are going to send any of
1000 * the pages in the chunk we clear the remote dirty bitmap for all of them.
1001 * Clearing it earlier won't be a problem, but too late will.
1003 migration_clear_memory_region_dirty_bitmap(rb, page);
1005 ret = test_and_clear_bit(page, rb->bmap);
1006 if (ret) {
1007 rs->migration_dirty_pages--;
1010 return ret;
1013 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1014 void *opaque)
1016 const hwaddr offset = section->offset_within_region;
1017 const hwaddr size = int128_get64(section->size);
1018 const unsigned long start = offset >> TARGET_PAGE_BITS;
1019 const unsigned long npages = size >> TARGET_PAGE_BITS;
1020 RAMBlock *rb = section->mr->ram_block;
1021 uint64_t *cleared_bits = opaque;
1024 * We don't grab ram_state->bitmap_mutex because we expect to run
1025 * only when starting migration or during postcopy recovery where
1026 * we don't have concurrent access.
1028 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1029 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1032 bitmap_clear(rb->bmap, start, npages);
1036 * Exclude all dirty pages from migration that fall into a discarded range as
1037 * managed by a RamDiscardManager responsible for the mapped memory region of
1038 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040 * Discarded pages ("logically unplugged") have undefined content and must
1041 * not get migrated, because even reading these pages for migration might
1042 * result in undesired behavior.
1044 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046 * Note: The result is only stable while migrating (precopy/postcopy).
1048 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050 uint64_t cleared_bits = 0;
1052 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1053 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1054 MemoryRegionSection section = {
1055 .mr = rb->mr,
1056 .offset_within_region = 0,
1057 .size = int128_make64(qemu_ram_get_used_length(rb)),
1060 ram_discard_manager_replay_discarded(rdm, &section,
1061 dirty_bitmap_clear_section,
1062 &cleared_bits);
1064 return cleared_bits;
1068 * Check if a host-page aligned page falls into a discarded range as managed by
1069 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071 * Note: The result is only stable while migrating (precopy/postcopy).
1073 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1076 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1077 MemoryRegionSection section = {
1078 .mr = rb->mr,
1079 .offset_within_region = start,
1080 .size = int128_make64(qemu_ram_pagesize(rb)),
1083 return !ram_discard_manager_is_populated(rdm, &section);
1085 return false;
1088 /* Called with RCU critical section */
1089 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1091 uint64_t new_dirty_pages =
1092 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094 rs->migration_dirty_pages += new_dirty_pages;
1095 rs->num_dirty_pages_period += new_dirty_pages;
1099 * ram_pagesize_summary: calculate all the pagesizes of a VM
1101 * Returns a summary bitmap of the page sizes of all RAMBlocks
1103 * For VMs with just normal pages this is equivalent to the host page
1104 * size. If it's got some huge pages then it's the OR of all the
1105 * different page sizes.
1107 uint64_t ram_pagesize_summary(void)
1109 RAMBlock *block;
1110 uint64_t summary = 0;
1112 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1113 summary |= block->page_size;
1116 return summary;
1119 uint64_t ram_get_total_transferred_pages(void)
1121 return stat64_get(&mig_stats.normal_pages) +
1122 stat64_get(&mig_stats.zero_pages) +
1123 compression_counters.pages + xbzrle_counters.pages;
1126 static void migration_update_rates(RAMState *rs, int64_t end_time)
1128 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1129 double compressed_size;
1131 /* calculate period counters */
1132 stat64_set(&mig_stats.dirty_pages_rate,
1133 rs->num_dirty_pages_period * 1000 /
1134 (end_time - rs->time_last_bitmap_sync));
1136 if (!page_count) {
1137 return;
1140 if (migrate_xbzrle()) {
1141 double encoded_size, unencoded_size;
1143 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1144 rs->xbzrle_cache_miss_prev) / page_count;
1145 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1146 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1147 TARGET_PAGE_SIZE;
1148 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1149 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1150 xbzrle_counters.encoding_rate = 0;
1151 } else {
1152 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1154 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1155 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1158 if (migrate_compress()) {
1159 compression_counters.busy_rate = (double)(compression_counters.busy -
1160 rs->compress_thread_busy_prev) / page_count;
1161 rs->compress_thread_busy_prev = compression_counters.busy;
1163 compressed_size = compression_counters.compressed_size -
1164 rs->compressed_size_prev;
1165 if (compressed_size) {
1166 double uncompressed_size = (compression_counters.pages -
1167 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1169 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1170 compression_counters.compression_rate =
1171 uncompressed_size / compressed_size;
1173 rs->compress_pages_prev = compression_counters.pages;
1174 rs->compressed_size_prev = compression_counters.compressed_size;
1179 static void migration_trigger_throttle(RAMState *rs)
1181 uint64_t threshold = migrate_throttle_trigger_threshold();
1182 uint64_t bytes_xfer_period =
1183 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
1184 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1187 /* During block migration the auto-converge logic incorrectly detects
1188 * that ram migration makes no progress. Avoid this by disabling the
1189 * throttling logic during the bulk phase of block migration. */
1190 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191 /* The following detection logic can be refined later. For now:
1192 Check to see if the ratio between dirtied bytes and the approx.
1193 amount of bytes that just got transferred since the last time
1194 we were in this routine reaches the threshold. If that happens
1195 twice, start or increase throttling. */
1197 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198 (++rs->dirty_rate_high_cnt >= 2)) {
1199 trace_migration_throttle();
1200 rs->dirty_rate_high_cnt = 0;
1201 mig_throttle_guest_down(bytes_dirty_period,
1202 bytes_dirty_threshold);
1207 static void migration_bitmap_sync(RAMState *rs)
1209 RAMBlock *block;
1210 int64_t end_time;
1212 stat64_add(&mig_stats.dirty_sync_count, 1);
1214 if (!rs->time_last_bitmap_sync) {
1215 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1218 trace_migration_bitmap_sync_start();
1219 memory_global_dirty_log_sync();
1221 qemu_mutex_lock(&rs->bitmap_mutex);
1222 WITH_RCU_READ_LOCK_GUARD() {
1223 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224 ramblock_sync_dirty_bitmap(rs, block);
1226 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1228 qemu_mutex_unlock(&rs->bitmap_mutex);
1230 memory_global_after_dirty_log_sync();
1231 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1233 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1235 /* more than 1 second = 1000 milliseconds */
1236 if (end_time > rs->time_last_bitmap_sync + 1000) {
1237 migration_trigger_throttle(rs);
1239 migration_update_rates(rs, end_time);
1241 rs->target_page_count_prev = rs->target_page_count;
1243 /* reset period counters */
1244 rs->time_last_bitmap_sync = end_time;
1245 rs->num_dirty_pages_period = 0;
1246 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
1248 if (migrate_events()) {
1249 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1250 qapi_event_send_migration_pass(generation);
1254 static void migration_bitmap_sync_precopy(RAMState *rs)
1256 Error *local_err = NULL;
1259 * The current notifier usage is just an optimization to migration, so we
1260 * don't stop the normal migration process in the error case.
1262 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1263 error_report_err(local_err);
1264 local_err = NULL;
1267 migration_bitmap_sync(rs);
1269 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1270 error_report_err(local_err);
1274 void ram_release_page(const char *rbname, uint64_t offset)
1276 if (!migrate_release_ram() || !migration_in_postcopy()) {
1277 return;
1280 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1284 * save_zero_page_to_file: send the zero page to the file
1286 * Returns the size of data written to the file, 0 means the page is not
1287 * a zero page
1289 * @pss: current PSS channel
1290 * @block: block that contains the page we want to send
1291 * @offset: offset inside the block for the page
1293 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1294 RAMBlock *block, ram_addr_t offset)
1296 uint8_t *p = block->host + offset;
1297 int len = 0;
1299 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1300 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1301 qemu_put_byte(file, 0);
1302 len += 1;
1303 ram_release_page(block->idstr, offset);
1305 return len;
1309 * save_zero_page: send the zero page to the stream
1311 * Returns the number of pages written.
1313 * @pss: current PSS channel
1314 * @block: block that contains the page we want to send
1315 * @offset: offset inside the block for the page
1317 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1318 ram_addr_t offset)
1320 int len = save_zero_page_to_file(pss, f, block, offset);
1322 if (len) {
1323 stat64_add(&mig_stats.zero_pages, 1);
1324 ram_transferred_add(len);
1325 return 1;
1327 return -1;
1331 * @pages: the number of pages written by the control path,
1332 * < 0 - error
1333 * > 0 - number of pages written
1335 * Return true if the page has been saved, otherwise false is returned.
1337 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1338 ram_addr_t offset, int *pages)
1340 uint64_t bytes_xmit = 0;
1341 int ret;
1343 *pages = -1;
1344 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1345 TARGET_PAGE_SIZE, &bytes_xmit);
1346 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1347 return false;
1350 if (bytes_xmit) {
1351 ram_transferred_add(bytes_xmit);
1352 *pages = 1;
1355 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1356 return true;
1359 if (bytes_xmit > 0) {
1360 stat64_add(&mig_stats.normal_pages, 1);
1361 } else if (bytes_xmit == 0) {
1362 stat64_add(&mig_stats.zero_pages, 1);
1365 return true;
1369 * directly send the page to the stream
1371 * Returns the number of pages written.
1373 * @pss: current PSS channel
1374 * @block: block that contains the page we want to send
1375 * @offset: offset inside the block for the page
1376 * @buf: the page to be sent
1377 * @async: send the page asynchronously
1379 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1380 ram_addr_t offset, uint8_t *buf, bool async)
1382 QEMUFile *file = pss->pss_channel;
1384 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1385 offset | RAM_SAVE_FLAG_PAGE));
1386 if (async) {
1387 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1388 migrate_release_ram() &&
1389 migration_in_postcopy());
1390 } else {
1391 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1393 ram_transferred_add(TARGET_PAGE_SIZE);
1394 stat64_add(&mig_stats.normal_pages, 1);
1395 return 1;
1399 * ram_save_page: send the given page to the stream
1401 * Returns the number of pages written.
1402 * < 0 - error
1403 * >=0 - Number of pages written - this might legally be 0
1404 * if xbzrle noticed the page was the same.
1406 * @rs: current RAM state
1407 * @block: block that contains the page we want to send
1408 * @offset: offset inside the block for the page
1410 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1412 int pages = -1;
1413 uint8_t *p;
1414 bool send_async = true;
1415 RAMBlock *block = pss->block;
1416 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1417 ram_addr_t current_addr = block->offset + offset;
1419 p = block->host + offset;
1420 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1422 XBZRLE_cache_lock();
1423 if (rs->xbzrle_started && !migration_in_postcopy()) {
1424 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1425 block, offset);
1426 if (!rs->last_stage) {
1427 /* Can't send this cached data async, since the cache page
1428 * might get updated before it gets to the wire
1430 send_async = false;
1434 /* XBZRLE overflow or normal page */
1435 if (pages == -1) {
1436 pages = save_normal_page(pss, block, offset, p, send_async);
1439 XBZRLE_cache_unlock();
1441 return pages;
1444 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1445 ram_addr_t offset)
1447 if (multifd_queue_page(file, block, offset) < 0) {
1448 return -1;
1450 stat64_add(&mig_stats.normal_pages, 1);
1452 return 1;
1455 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1456 ram_addr_t offset, uint8_t *source_buf)
1458 RAMState *rs = ram_state;
1459 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1460 uint8_t *p = block->host + offset;
1461 int ret;
1463 if (save_zero_page_to_file(pss, f, block, offset)) {
1464 return true;
1467 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1470 * copy it to an internal buffer to avoid it being modified by the VM,
1471 * so that we can catch any error during compression and
1472 * decompression
1474 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1475 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1476 if (ret < 0) {
1477 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1478 error_report("compressed data failed!");
1480 return false;
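/*
 * Sketch of what this emits when the page is not zero: the page header
 * with RAM_SAVE_FLAG_COMPRESS_PAGE set, followed by the zlib-compressed
 * page data produced by qemu_put_compression_data().  Zero pages take the
 * save_zero_page_to_file() path above instead and return true.
 */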
1483 static void
1484 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1486 ram_transferred_add(bytes_xmit);
1488 if (param->zero_page) {
1489 stat64_add(&mig_stats.zero_pages, 1);
1490 return;
1493 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1494 compression_counters.compressed_size += bytes_xmit - 8;
1495 compression_counters.pages++;
1498 static bool save_page_use_compression(RAMState *rs);
1500 static void flush_compressed_data(RAMState *rs)
1502 MigrationState *ms = migrate_get_current();
1503 int idx, len, thread_count;
1505 if (!save_page_use_compression(rs)) {
1506 return;
1508 thread_count = migrate_compress_threads();
1510 qemu_mutex_lock(&comp_done_lock);
1511 for (idx = 0; idx < thread_count; idx++) {
1512 while (!comp_param[idx].done) {
1513 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1516 qemu_mutex_unlock(&comp_done_lock);
1518 for (idx = 0; idx < thread_count; idx++) {
1519 qemu_mutex_lock(&comp_param[idx].mutex);
1520 if (!comp_param[idx].quit) {
1521 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1523 * it's safe to fetch zero_page without holding comp_done_lock
1524 * as there is no further request submitted to the thread,
1525 * i.e., the thread should be waiting for a request at this point.
1527 update_compress_thread_counts(&comp_param[idx], len);
1529 qemu_mutex_unlock(&comp_param[idx].mutex);
1533 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1534 ram_addr_t offset)
1536 param->block = block;
1537 param->offset = offset;
1540 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1542 int idx, thread_count, bytes_xmit = -1, pages = -1;
1543 bool wait = migrate_compress_wait_thread();
1544 MigrationState *ms = migrate_get_current();
1546 thread_count = migrate_compress_threads();
1547 qemu_mutex_lock(&comp_done_lock);
1548 retry:
1549 for (idx = 0; idx < thread_count; idx++) {
1550 if (comp_param[idx].done) {
1551 comp_param[idx].done = false;
1552 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1553 comp_param[idx].file);
1554 qemu_mutex_lock(&comp_param[idx].mutex);
1555 set_compress_params(&comp_param[idx], block, offset);
1556 qemu_cond_signal(&comp_param[idx].cond);
1557 qemu_mutex_unlock(&comp_param[idx].mutex);
1558 pages = 1;
1559 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1560 break;
1565 * wait for a free thread if the user specifies 'compress-wait-thread',
1566 * otherwise we will post the page out in the main thread as a normal page.
1568 if (pages < 0 && wait) {
1569 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1570 goto retry;
1572 qemu_mutex_unlock(&comp_done_lock);
1574 return pages;
1577 #define PAGE_ALL_CLEAN 0
1578 #define PAGE_TRY_AGAIN 1
1579 #define PAGE_DIRTY_FOUND 2
1581 * find_dirty_block: find the next dirty page and update any state
1582 * associated with the search process.
1584 * Returns:
1585 * <0: An error happened
1586 * PAGE_ALL_CLEAN: no dirty page found, give up
1587 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1588 * PAGE_DIRTY_FOUND: dirty page found
1590 * @rs: current RAM state
1591 * @pss: data about the state of the current dirty page scan
1592 * @again: set to false if the search has scanned the whole of RAM
1594 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1596 /* Update pss->page for the next dirty bit in ramblock */
1597 pss_find_next_dirty(pss);
1599 if (pss->complete_round && pss->block == rs->last_seen_block &&
1600 pss->page >= rs->last_page) {
1602 * We've been once around the RAM and haven't found anything.
1603 * Give up.
1605 return PAGE_ALL_CLEAN;
1607 if (!offset_in_ramblock(pss->block,
1608 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1609 /* Didn't find anything in this RAM Block */
1610 pss->page = 0;
1611 pss->block = QLIST_NEXT_RCU(pss->block, next);
1612 if (!pss->block) {
1613 if (!migrate_multifd_flush_after_each_section()) {
1614 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1615 int ret = multifd_send_sync_main(f);
1616 if (ret < 0) {
1617 return ret;
1619 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1620 qemu_fflush(f);
1623 * If memory migration starts over, we will meet a dirtied page
1624 * which may still exist in the compression threads' ring, so we
1625 * should flush the compressed data to make sure the new page
1626 * is not overwritten by the old one in the destination.
1628 * Also, if xbzrle is on, stop using the data compression at this
1629 * point. In theory, xbzrle can do better than compression.
1631 flush_compressed_data(rs);
1633 /* Hit the end of the list */
1634 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1635 /* Flag that we've looped */
1636 pss->complete_round = true;
1637 /* After the first round, enable XBZRLE. */
1638 if (migrate_xbzrle()) {
1639 rs->xbzrle_started = true;
1642 /* Didn't find anything this time, but try again on the new block */
1643 return PAGE_TRY_AGAIN;
1644 } else {
1645 /* We've found something */
1646 return PAGE_DIRTY_FOUND;
1651 * unqueue_page: gets a page off the queue
1653 * Helper for 'get_queued_page' - gets a page off the queue
1655 * Returns the block of the page (or NULL if none available)
1657 * @rs: current RAM state
1658 * @offset: used to return the offset within the RAMBlock
1660 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1662 struct RAMSrcPageRequest *entry;
1663 RAMBlock *block = NULL;
1665 if (!postcopy_has_request(rs)) {
1666 return NULL;
1669 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1672 * This should _never_ change even after we take the lock, because no one
1673 * should be taking anything off the request list other than us.
1675 assert(postcopy_has_request(rs));
1677 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1678 block = entry->rb;
1679 *offset = entry->offset;
1681 if (entry->len > TARGET_PAGE_SIZE) {
1682 entry->len -= TARGET_PAGE_SIZE;
1683 entry->offset += TARGET_PAGE_SIZE;
1684 } else {
1685 memory_region_unref(block->mr);
1686 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1687 g_free(entry);
1688 migration_consume_urgent_request();
1691 return block;
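/*
 * Note: each call above hands back a single target page; a request longer
 * than one page stays at the head of the queue with its offset advanced,
 * and the entry is only freed (with migration_consume_urgent_request()
 * called) once its whole length has been consumed.
 */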
1694 #if defined(__linux__)
1696 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1697 * is found, return RAM block pointer and page offset
1699 * Returns pointer to the RAMBlock containing faulting page,
1700 * NULL if no write faults are pending
1702 * @rs: current RAM state
1703 * @offset: page offset from the beginning of the block
1705 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1707 struct uffd_msg uffd_msg;
1708 void *page_address;
1709 RAMBlock *block;
1710 int res;
1712 if (!migrate_background_snapshot()) {
1713 return NULL;
1716 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1717 if (res <= 0) {
1718 return NULL;
1721 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1722 block = qemu_ram_block_from_host(page_address, false, offset);
1723 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1724 return block;
1728 * ram_save_release_protection: release UFFD write protection after
1729 * a range of pages has been saved
1731 * @rs: current RAM state
1732 * @pss: page-search-status structure
1733 * @start_page: index of the first page in the range relative to pss->block
1735 * Returns 0 on success, negative value in case of an error
1737 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1738 unsigned long start_page)
1740 int res = 0;
1742 /* Check if page is from UFFD-managed region. */
1743 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1744 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1745 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1747 /* Flush async buffers before un-protect. */
1748 qemu_fflush(pss->pss_channel);
1749 /* Un-protect memory range. */
1750 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1751 false, false);
1754 return res;
1757 /* ram_write_tracking_available: check if kernel supports required UFFD features
1759 * Returns true if it does, false otherwise
1761 bool ram_write_tracking_available(void)
1763 uint64_t uffd_features;
1764 int res;
1766 res = uffd_query_features(&uffd_features);
1767 return (res == 0 &&
1768 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1771 /* ram_write_tracking_compatible: check if guest configuration is
1772 * compatible with 'write-tracking'
1774 * Returns true if compatible, false otherwise
1776 bool ram_write_tracking_compatible(void)
1778 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1779 int uffd_fd;
1780 RAMBlock *block;
1781 bool ret = false;
1783 /* Open UFFD file descriptor */
1784 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1785 if (uffd_fd < 0) {
1786 return false;
1789 RCU_READ_LOCK_GUARD();
1791 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1792 uint64_t uffd_ioctls;
1794 /* Nothing to do with read-only and MMIO-writable regions */
1795 if (block->mr->readonly || block->mr->rom_device) {
1796 continue;
1798 /* Try to register block memory via UFFD-IO to track writes */
1799 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1800 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1801 goto out;
1803 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1804 goto out;
1807 ret = true;
1809 out:
1810 uffd_close_fd(uffd_fd);
1811 return ret;
1814 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1815 ram_addr_t size)
1817 const ram_addr_t end = offset + size;
1820 * We read one byte of each page; this will preallocate page tables if
1821 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1822 * where no page was populated yet. This might require adaptation when
1823 * supporting other mappings, like shmem.
1825 for (; offset < end; offset += block->page_size) {
1826 char tmp = *((char *)block->host + offset);
1828 /* Don't optimize the read out */
1829 asm volatile("" : "+r" (tmp));
1833 static inline int populate_read_section(MemoryRegionSection *section,
1834 void *opaque)
1836 const hwaddr size = int128_get64(section->size);
1837 hwaddr offset = section->offset_within_region;
1838 RAMBlock *block = section->mr->ram_block;
1840 populate_read_range(block, offset, size);
1841 return 0;
1845 * ram_block_populate_read: preallocate page tables and populate pages in the
1846 * RAM block by reading a byte of each page.
1848 * Since it's solely used for userfault_fd WP feature, here we just
1849 * hardcode page size to qemu_real_host_page_size.
1851 * @block: RAM block to populate
1853 static void ram_block_populate_read(RAMBlock *rb)
1856 * Skip populating all pages that fall into a discarded range as managed by
1857 * a RamDiscardManager responsible for the mapped memory region of the
1858 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1859 * must not get populated automatically. We don't have to track
1860 * modifications via userfaultfd WP reliably, because these pages will
1861 * not be part of the migration stream either way -- see
1862 * ramblock_dirty_bitmap_exclude_discarded_pages().
1864 * Note: The result is only stable while migrating (precopy/postcopy).
1866 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1867 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1868 MemoryRegionSection section = {
1869 .mr = rb->mr,
1870 .offset_within_region = 0,
1871 .size = rb->mr->size,
1874 ram_discard_manager_replay_populated(rdm, &section,
1875 populate_read_section, NULL);
1876 } else {
1877 populate_read_range(rb, 0, rb->used_length);
1882 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1884 void ram_write_tracking_prepare(void)
1886 RAMBlock *block;
1888 RCU_READ_LOCK_GUARD();
1890 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1891 /* Nothing to do with read-only and MMIO-writable regions */
1892 if (block->mr->readonly || block->mr->rom_device) {
1893 continue;
1897 * Populate pages of the RAM block before enabling userfault_fd
1898 * write protection.
1900 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1901 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1902 * pages with pte_none() entries in page table.
1904 ram_block_populate_read(block);
1908 static inline int uffd_protect_section(MemoryRegionSection *section,
1909 void *opaque)
1911 const hwaddr size = int128_get64(section->size);
1912 const hwaddr offset = section->offset_within_region;
1913 RAMBlock *rb = section->mr->ram_block;
1914 int uffd_fd = (uintptr_t)opaque;
1916 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1917 false);
1920 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1922 assert(rb->flags & RAM_UF_WRITEPROTECT);
1924 /* See ram_block_populate_read() */
1925 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1926 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1927 MemoryRegionSection section = {
1928 .mr = rb->mr,
1929 .offset_within_region = 0,
1930 .size = rb->mr->size,
1933 return ram_discard_manager_replay_populated(rdm, &section,
1934 uffd_protect_section,
1935 (void *)(uintptr_t)uffd_fd);
1937 return uffd_change_protection(uffd_fd, rb->host,
1938 rb->used_length, true, false);
1942 * ram_write_tracking_start: start UFFD-WP memory tracking
1944 * Returns 0 on success or a negative value in case of error
1946 int ram_write_tracking_start(void)
1948 int uffd_fd;
1949 RAMState *rs = ram_state;
1950 RAMBlock *block;
1952 /* Open UFFD file descriptor */
1953 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1954 if (uffd_fd < 0) {
1955 return uffd_fd;
1957 rs->uffdio_fd = uffd_fd;
1959 RCU_READ_LOCK_GUARD();
1961 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1962 /* Nothing to do with read-only and MMIO-writable regions */
1963 if (block->mr->readonly || block->mr->rom_device) {
1964 continue;
1967 /* Register block memory with UFFD to track writes */
1968 if (uffd_register_memory(rs->uffdio_fd, block->host,
1969 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1970 goto fail;
1972 block->flags |= RAM_UF_WRITEPROTECT;
1973 memory_region_ref(block->mr);
1975 /* Apply UFFD write protection to the block memory range */
1976 if (ram_block_uffd_protect(block, uffd_fd)) {
1977 goto fail;
1980 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1981 block->host, block->max_length);
1984 return 0;
1986 fail:
1987 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1989 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1990 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1991 continue;
1993 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1994 /* Cleanup flags and remove reference */
1995 block->flags &= ~RAM_UF_WRITEPROTECT;
1996 memory_region_unref(block->mr);
1999 uffd_close_fd(uffd_fd);
2000 rs->uffdio_fd = -1;
2001 return -1;
2005 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2007 void ram_write_tracking_stop(void)
2009 RAMState *rs = ram_state;
2010 RAMBlock *block;
2012 RCU_READ_LOCK_GUARD();
2014 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2015 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2016 continue;
2018 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2020 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2021 block->host, block->max_length);
2023 /* Cleanup flags and remove reference */
2024 block->flags &= ~RAM_UF_WRITEPROTECT;
2025 memory_region_unref(block->mr);
2028 /* Finally close UFFD file descriptor */
2029 uffd_close_fd(rs->uffdio_fd);
2030 rs->uffdio_fd = -1;
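/*
 * A bare-bones sketch of the raw Linux userfaultfd calls that the helpers
 * above wrap (hypothetical standalone code with error handling trimmed, not
 * QEMU API): create the fd, handshake with UFFDIO_API, register a
 * pre-populated range with UFFDIO_REGISTER_MODE_WP, then arm write
 * protection via UFFDIO_WRITEPROTECT.
 */
#if 0 /* standalone illustration, compiled out */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int wp_track_range(void *addr, uint64_t len)
{
    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (fd < 0) {
        return -1;
    }

    struct uffdio_api api = { .api = UFFD_API,
                              .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP };
    if (ioctl(fd, UFFDIO_API, &api) < 0) {
        goto err;
    }

    struct uffdio_register reg = {
        .range = { .start = (uint64_t)(uintptr_t)addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    if (ioctl(fd, UFFDIO_REGISTER, &reg) < 0) {
        goto err;
    }

    struct uffdio_writeprotect wp = {
        .range = { .start = (uint64_t)(uintptr_t)addr, .len = len },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) < 0) {
        goto err;
    }
    return fd;    /* write faults are now reported on this fd */

err:
    close(fd);
    return -1;
}
#endif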
2033 #else
2034 /* No target OS support, stubs just fail or ignore */
2036 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2038 (void) rs;
2039 (void) offset;
2041 return NULL;
2044 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2045 unsigned long start_page)
2047 (void) rs;
2048 (void) pss;
2049 (void) start_page;
2051 return 0;
2054 bool ram_write_tracking_available(void)
2056 return false;
2059 bool ram_write_tracking_compatible(void)
2061 assert(0);
2062 return false;
2065 int ram_write_tracking_start(void)
2067 assert(0);
2068 return -1;
2071 void ram_write_tracking_stop(void)
2073 assert(0);
2075 #endif /* defined(__linux__) */
2078 * get_queued_page: unqueue a page from the postcopy requests
2080 * Skips pages that are already sent (!dirty)
2082 * Returns true if a queued page is found
2084 * @rs: current RAM state
2085 * @pss: data about the state of the current dirty page scan
2087 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2089 RAMBlock *block;
2090 ram_addr_t offset;
2091 bool dirty;
2093 do {
2094 block = unqueue_page(rs, &offset);
2096 * We're sending this page, and since it's postcopy nothing else
2097 * will dirty it, and we must make sure it doesn't get sent again
2098 * even if this queue request was received after the background
2099 * search already sent it.
2101 if (block) {
2102 unsigned long page;
2104 page = offset >> TARGET_PAGE_BITS;
2105 dirty = test_bit(page, block->bmap);
2106 if (!dirty) {
2107 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2108 page);
2109 } else {
2110 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2114 } while (block && !dirty);
2116 if (!block) {
2118 * Poll write faults too if background snapshot is enabled; that's
2119 * when vCPUs may get blocked by the write-protected pages.
2121 block = poll_fault_page(rs, &offset);
2124 if (block) {
2126 * We want the background search to continue from the queued page
2127 * since the guest is likely to want other pages near to the page
2128 * it just requested.
2130 pss->block = block;
2131 pss->page = offset >> TARGET_PAGE_BITS;
2134 * This unqueued page would break the "one round" check, even if it
2135 * is really rare.
2137 pss->complete_round = false;
2140 return !!block;
2144 * migration_page_queue_free: drop any remaining pages in the ram
2145 * request queue
2147 * It should be empty at the end anyway, but in error cases there may
2148 * be some left. In case any page is left, we drop it.
2151 static void migration_page_queue_free(RAMState *rs)
2153 struct RAMSrcPageRequest *mspr, *next_mspr;
2154 /* This queue generally should be empty - but in the case of a failed
2155 * migration it might have some droppings in.
2157 RCU_READ_LOCK_GUARD();
2158 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2159 memory_region_unref(mspr->rb->mr);
2160 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2161 g_free(mspr);
2166 * ram_save_queue_pages: queue the page for transmission
2168 * A request from the postcopy destination, for example.
2170 * Returns zero on success or negative on error
2172 * @rbname: Name of the RAMBlock of the request. NULL means the
2173 * same as the last one.
2174 * @start: starting address from the start of the RAMBlock
2175 * @len: length (in bytes) to send
2177 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2179 RAMBlock *ramblock;
2180 RAMState *rs = ram_state;
2182 stat64_add(&mig_stats.postcopy_requests, 1);
2183 RCU_READ_LOCK_GUARD();
2185 if (!rbname) {
2186 /* Reuse last RAMBlock */
2187 ramblock = rs->last_req_rb;
2189 if (!ramblock) {
2191 * Shouldn't happen, we can't reuse the last RAMBlock if
2192 * it's the 1st request.
2194 error_report("ram_save_queue_pages no previous block");
2195 return -1;
2197 } else {
2198 ramblock = qemu_ram_block_by_name(rbname);
2200 if (!ramblock) {
2201 /* We shouldn't be asked for a non-existent RAMBlock */
2202 error_report("ram_save_queue_pages no block '%s'", rbname);
2203 return -1;
2205 rs->last_req_rb = ramblock;
2207 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2208 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2209 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2210 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2211 __func__, start, len, ramblock->used_length);
2212 return -1;
2216 * When with postcopy preempt, we send back the page directly in the
2217 * rp-return thread.
2219 if (postcopy_preempt_active()) {
2220 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2221 size_t page_size = qemu_ram_pagesize(ramblock);
2222 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2223 int ret = 0;
2225 qemu_mutex_lock(&rs->bitmap_mutex);
2227 pss_init(pss, ramblock, page_start);
2229 * Always use the preempt channel, and make sure it's there. It's
2230 * safe to access without a lock, because when the rp-thread is running
2231 * we should be the only one operating on the qemufile.
2233 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2234 assert(pss->pss_channel);
2237 * It must be one or a multiple of the host page size. Just
2238 * assert; if something is wrong we're mostly split-brain anyway.
2240 assert(len % page_size == 0);
2241 while (len) {
2242 if (ram_save_host_page_urgent(pss)) {
2243 error_report("%s: ram_save_host_page_urgent() failed: "
2244 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2245 __func__, ramblock->idstr, start);
2246 ret = -1;
2247 break;
2250 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2251 * will automatically be moved and point to the next host page
2252 * we're going to send, so no need to update here.
2254 * Normally QEMU never sends >1 host page in requests, so
2255 * logically we don't even need that as the loop should only
2256 * run once, but just to be consistent.
2258 len -= page_size;
2260 qemu_mutex_unlock(&rs->bitmap_mutex);
2262 return ret;
2265 struct RAMSrcPageRequest *new_entry =
2266 g_new0(struct RAMSrcPageRequest, 1);
2267 new_entry->rb = ramblock;
2268 new_entry->offset = start;
2269 new_entry->len = len;
2271 memory_region_ref(ramblock->mr);
2272 qemu_mutex_lock(&rs->src_page_req_mutex);
2273 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2274 migration_make_urgent_request();
2275 qemu_mutex_unlock(&rs->src_page_req_mutex);
2277 return 0;
2280 static bool save_page_use_compression(RAMState *rs)
2282 if (!migrate_compress()) {
2283 return false;
2287 * If xbzrle is enabled (e.g., after the first round of migration), stop
2288 * using the data compression. In theory, xbzrle can do better than
2289 * compression.
2291 if (rs->xbzrle_started) {
2292 return false;
2295 return true;
2299 * Try to compress the page before sending it out; return true if the page
2300 * has been properly handled by compression, otherwise it needs other
2301 * paths to handle it.
2303 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2304 RAMBlock *block, ram_addr_t offset)
2306 if (!save_page_use_compression(rs)) {
2307 return false;
2311 * When starting the process of a new block, the first page of
2312 * the block should be sent out before other pages in the same
2313 * block, and all the pages in the last block should have been sent
2314 * out. Keeping this order is important, because the 'cont' flag
2315 * is used to avoid resending the block name.
2317 * We post the first page as a normal page as compression will take
2318 * a lot of CPU resources.
2320 if (block != pss->last_sent_block) {
2321 flush_compressed_data(rs);
2322 return false;
2325 if (compress_page_with_multi_thread(block, offset) > 0) {
2326 return true;
2329 compression_counters.busy++;
2330 return false;
2334 * ram_save_target_page_legacy: save one target page
2336 * Returns the number of pages written
2338 * @rs: current RAM state
2339 * @pss: data about the page we want to send
2341 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2343 RAMBlock *block = pss->block;
2344 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2345 int res;
2347 if (control_save_page(pss, block, offset, &res)) {
2348 return res;
2351 if (save_compress_page(rs, pss, block, offset)) {
2352 return 1;
2355 res = save_zero_page(pss, pss->pss_channel, block, offset);
2356 if (res > 0) {
2357 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2358 * page would be stale
2360 if (rs->xbzrle_started) {
2361 XBZRLE_cache_lock();
2362 xbzrle_cache_zero_page(rs, block->offset + offset);
2363 XBZRLE_cache_unlock();
2365 return res;
2369 * Do not use multifd in postcopy as one whole host page should be
2370 * placed. Meanwhile postcopy requires atomic update of pages, so even
2371 * if host page size == guest page size, the destination guest may
2372 * still see partially copied pages at runtime, which is data corruption.
2374 if (migrate_multifd() && !migration_in_postcopy()) {
2375 return ram_save_multifd_page(pss->pss_channel, block, offset);
2378 return ram_save_page(rs, pss);
2381 /* Should be called before sending a host page */
2382 static void pss_host_page_prepare(PageSearchStatus *pss)
2384 /* How many guest pages are there in one host page? */
2385 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2387 pss->host_page_sending = true;
2388 if (guest_pfns <= 1) {
2390 * This covers both when guest psize == host psize and when the guest
2391 * has larger psize than the host (guest_pfns==0).
2393 * For the latter, we always send one whole guest page per
2394 * iteration of the host page (example: an Alpha VM on x86 host
2395 * will have guest psize 8K while host psize 4K).
2397 pss->host_page_start = pss->page;
2398 pss->host_page_end = pss->page + 1;
2399 } else {
2401 * The host page spans over multiple guest pages, we send them
2402 * within the same host page iteration.
2404 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2405 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
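/*
 * Worked example of the rounding above (a hypothetical sketch, not QEMU
 * code): with 2 MiB hugepages backing the block and 4 KiB target pages,
 * guest_pfns is 512, so a dirty target page index of 1000 maps to the
 * host-page range [512, 1024).
 */
#if 0 /* standalone illustration, compiled out */
#include <assert.h>
#include <stddef.h>

static void host_page_range(size_t page, size_t guest_pfns,
                            size_t *start, size_t *end)
{
    *start = page - (page % guest_pfns);    /* ROUND_DOWN(page, guest_pfns) */
    *end = *start + guest_pfns;             /* ROUND_UP(page + 1, guest_pfns) */
}

int main(void)
{
    size_t start, end;

    host_page_range(1000, (2u << 20) / (4u << 10), &start, &end);
    assert(start == 512 && end == 1024);
    return 0;
}
#endif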
2410 * Whether the page pointed by PSS is within the host page being sent.
2411 * Must be called after a previous pss_host_page_prepare().
2413 static bool pss_within_range(PageSearchStatus *pss)
2415 ram_addr_t ram_addr;
2417 assert(pss->host_page_sending);
2419 /* Over host-page boundary? */
2420 if (pss->page >= pss->host_page_end) {
2421 return false;
2424 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2426 return offset_in_ramblock(pss->block, ram_addr);
2429 static void pss_host_page_finish(PageSearchStatus *pss)
2431 pss->host_page_sending = false;
2432 /* This is not needed, but just to reset it */
2433 pss->host_page_start = pss->host_page_end = 0;
2437 * Send an urgent host page specified by `pss'. Needs to be called with
2438 * bitmap_mutex held.
2440 * Returns 0 if saving the host page succeeded, negative otherwise.
2442 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2444 bool page_dirty, sent = false;
2445 RAMState *rs = ram_state;
2446 int ret = 0;
2448 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2449 pss_host_page_prepare(pss);
2452 * If precopy is sending the same page, let it be done in precopy, or
2453 * we could send the same page in two channels and none of them will
2454 * receive the whole page.
2456 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2457 trace_postcopy_preempt_hit(pss->block->idstr,
2458 pss->page << TARGET_PAGE_BITS);
2459 return 0;
2462 do {
2463 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2465 if (page_dirty) {
2466 /* Be strict about the return code; it must be 1, anything else is treated as an error */
2467 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2468 error_report_once("%s: ram_save_target_page failed", __func__);
2469 ret = -1;
2470 goto out;
2472 sent = true;
2474 pss_find_next_dirty(pss);
2475 } while (pss_within_range(pss));
2476 out:
2477 pss_host_page_finish(pss);
2478 /* For urgent requests, flush immediately if sent */
2479 if (sent) {
2480 qemu_fflush(pss->pss_channel);
2482 return ret;
2486 * ram_save_host_page: save a whole host page
2488 * Starting at *offset send pages up to the end of the current host
2489 * page. It's valid for the initial offset to point into the middle of
2490 * a host page in which case the remainder of the hostpage is sent.
2491 * Only dirty target pages are sent. Note that the host page size may
2492 * be a huge page for this block.
2494 * The saving stops at the boundary of the used_length of the block
2495 * if the RAMBlock isn't a multiple of the host page size.
2497 * The caller must hold ram_state.bitmap_mutex when calling this
2498 * function. Note that this function can temporarily release the lock, but
2499 * when the function returns it'll make sure the lock is still held.
2501 * Returns the number of pages written or negative on error
2503 * @rs: current RAM state
2504 * @pss: data about the page we want to send
2506 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2508 bool page_dirty, preempt_active = postcopy_preempt_active();
2509 int tmppages, pages = 0;
2510 size_t pagesize_bits =
2511 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2512 unsigned long start_page = pss->page;
2513 int res;
2515 if (ramblock_is_ignored(pss->block)) {
2516 error_report("block %s should not be migrated !", pss->block->idstr);
2517 return 0;
2520 /* Update host page boundary information */
2521 pss_host_page_prepare(pss);
2523 do {
2524 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2526 /* Check if the page is dirty and if it is, send it */
2527 if (page_dirty) {
2529 * Properly yield the lock only in postcopy preempt mode
2530 * because both migration thread and rp-return thread can
2531 * operate on the bitmaps.
2533 if (preempt_active) {
2534 qemu_mutex_unlock(&rs->bitmap_mutex);
2536 tmppages = migration_ops->ram_save_target_page(rs, pss);
2537 if (tmppages >= 0) {
2538 pages += tmppages;
2540 * Allow rate limiting to happen in the middle of huge pages if
2541 * something is sent in the current iteration.
2543 if (pagesize_bits > 1 && tmppages > 0) {
2544 migration_rate_limit();
2547 if (preempt_active) {
2548 qemu_mutex_lock(&rs->bitmap_mutex);
2550 } else {
2551 tmppages = 0;
2554 if (tmppages < 0) {
2555 pss_host_page_finish(pss);
2556 return tmppages;
2559 pss_find_next_dirty(pss);
2560 } while (pss_within_range(pss));
2562 pss_host_page_finish(pss);
2564 res = ram_save_release_protection(rs, pss, start_page);
2565 return (res < 0 ? res : pages);
2569 * ram_find_and_save_block: finds a dirty page and sends it to f
2571 * Called within an RCU critical section.
2573 * Returns the number of pages written where zero means no dirty pages,
2574 * or negative on error
2576 * @rs: current RAM state
2578 * On systems where host-page-size > target-page-size it will send all the
2579 * pages in a host page that are dirty.
2581 static int ram_find_and_save_block(RAMState *rs)
2583 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2584 int pages = 0;
2586 /* No dirty page as there is zero RAM */
2587 if (!rs->ram_bytes_total) {
2588 return pages;
2592 * Always keep last_seen_block/last_page valid during this procedure,
2593 * because find_dirty_block() relies on these values (e.g., we compare
2594 * last_seen_block with pss.block to see whether we searched all the
2595 * ramblocks) to detect the completion of migration. Having a NULL value
2596 * of last_seen_block can conditionally cause the loop below to run forever.
2598 if (!rs->last_seen_block) {
2599 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2600 rs->last_page = 0;
2603 pss_init(pss, rs->last_seen_block, rs->last_page);
2605 while (true) {
2606 if (!get_queued_page(rs, pss)) {
2607 /* priority queue empty, so just search for something dirty */
2608 int res = find_dirty_block(rs, pss);
2609 if (res != PAGE_DIRTY_FOUND) {
2610 if (res == PAGE_ALL_CLEAN) {
2611 break;
2612 } else if (res == PAGE_TRY_AGAIN) {
2613 continue;
2614 } else if (res < 0) {
2615 pages = res;
2616 break;
2620 pages = ram_save_host_page(rs, pss);
2621 if (pages) {
2622 break;
2626 rs->last_seen_block = pss->block;
2627 rs->last_page = pss->page;
2629 return pages;
2632 static uint64_t ram_bytes_total_with_ignored(void)
2634 RAMBlock *block;
2635 uint64_t total = 0;
2637 RCU_READ_LOCK_GUARD();
2639 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2640 total += block->used_length;
2642 return total;
2645 uint64_t ram_bytes_total(void)
2647 RAMBlock *block;
2648 uint64_t total = 0;
2650 RCU_READ_LOCK_GUARD();
2652 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2653 total += block->used_length;
2655 return total;
2658 static void xbzrle_load_setup(void)
2660 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2663 static void xbzrle_load_cleanup(void)
2665 g_free(XBZRLE.decoded_buf);
2666 XBZRLE.decoded_buf = NULL;
2669 static void ram_state_cleanup(RAMState **rsp)
2671 if (*rsp) {
2672 migration_page_queue_free(*rsp);
2673 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2674 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2675 g_free(*rsp);
2676 *rsp = NULL;
2680 static void xbzrle_cleanup(void)
2682 XBZRLE_cache_lock();
2683 if (XBZRLE.cache) {
2684 cache_fini(XBZRLE.cache);
2685 g_free(XBZRLE.encoded_buf);
2686 g_free(XBZRLE.current_buf);
2687 g_free(XBZRLE.zero_target_page);
2688 XBZRLE.cache = NULL;
2689 XBZRLE.encoded_buf = NULL;
2690 XBZRLE.current_buf = NULL;
2691 XBZRLE.zero_target_page = NULL;
2693 XBZRLE_cache_unlock();
2696 static void ram_save_cleanup(void *opaque)
2698 RAMState **rsp = opaque;
2699 RAMBlock *block;
2701 /* We don't use dirty log with background snapshots */
2702 if (!migrate_background_snapshot()) {
2703 /* The caller holds the iothread lock or is in a bh, so there is
2704 * no write race against the migration bitmap
2706 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2708 * do not stop dirty log without starting it, since
2709 * memory_global_dirty_log_stop will assert that
2710 * memory_global_dirty_log_start/stop are used in pairs
2712 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2716 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717 g_free(block->clear_bmap);
2718 block->clear_bmap = NULL;
2719 g_free(block->bmap);
2720 block->bmap = NULL;
2723 xbzrle_cleanup();
2724 compress_threads_save_cleanup();
2725 ram_state_cleanup(rsp);
2726 g_free(migration_ops);
2727 migration_ops = NULL;
2730 static void ram_state_reset(RAMState *rs)
2732 int i;
2734 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2735 rs->pss[i].last_sent_block = NULL;
2738 rs->last_seen_block = NULL;
2739 rs->last_page = 0;
2740 rs->last_version = ram_list.version;
2741 rs->xbzrle_started = false;
2744 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2746 /* **** functions for postcopy ***** */
2748 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2750 struct RAMBlock *block;
2752 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2753 unsigned long *bitmap = block->bmap;
2754 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2755 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2757 while (run_start < range) {
2758 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2759 ram_discard_range(block->idstr,
2760 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2761 ((ram_addr_t)(run_end - run_start))
2762 << TARGET_PAGE_BITS);
2763 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2769 * postcopy_send_discard_bm_ram: discard a RAMBlock
2771 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2773 * @ms: current migration state
2774 * @block: RAMBlock to discard
2776 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2778 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2779 unsigned long current;
2780 unsigned long *bitmap = block->bmap;
2782 for (current = 0; current < end; ) {
2783 unsigned long one = find_next_bit(bitmap, end, current);
2784 unsigned long zero, discard_length;
2786 if (one >= end) {
2787 break;
2790 zero = find_next_zero_bit(bitmap, end, one + 1);
2792 if (zero >= end) {
2793 discard_length = end - one;
2794 } else {
2795 discard_length = zero - one;
2797 postcopy_discard_send_range(ms, one, discard_length);
2798 current = one + discard_length;
2802 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2805 * postcopy_each_ram_send_discard: discard all RAMBlocks
2807 * Utility for the outgoing postcopy code.
2808 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2809 * passing it bitmap indexes and name.
2810 * (qemu_ram_foreach_block ends up passing unscaled lengths
2811 * which would mean postcopy code would have to deal with target page)
2813 * @ms: current migration state
2815 static void postcopy_each_ram_send_discard(MigrationState *ms)
2817 struct RAMBlock *block;
2819 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2820 postcopy_discard_send_init(ms, block->idstr);
2823 * Deal with TPS != HPS and huge pages. It discards any partially sent
2824 * host-page size chunks and marks any partially dirty host-page size
2825 * chunks as all dirty. In this case the host-page is the host-page
2826 * for the particular RAMBlock, i.e. it might be a huge page.
2828 postcopy_chunk_hostpages_pass(ms, block);
2831 * Postcopy sends chunks of bitmap over the wire, but it
2832 * just needs indexes at this point; this avoids it having
2833 * target-page-specific code.
2835 postcopy_send_discard_bm_ram(ms, block);
2836 postcopy_discard_send_finish(ms);
2841 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2843 * Helper for postcopy_chunk_hostpages; it's called twice to
2844 * canonicalize the two bitmaps, which are similar, but one is
2845 * inverted.
2847 * Postcopy requires that all target pages in a hostpage are dirty or
2848 * clean, not a mix. This function canonicalizes the bitmaps.
2850 * @ms: current migration state
2851 * @block: block that contains the page we want to canonicalize
2853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2855 RAMState *rs = ram_state;
2856 unsigned long *bitmap = block->bmap;
2857 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2858 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2859 unsigned long run_start;
2861 if (block->page_size == TARGET_PAGE_SIZE) {
2862 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2863 return;
2866 /* Find a dirty page */
2867 run_start = find_next_bit(bitmap, pages, 0);
2869 while (run_start < pages) {
2872 * If the start of this run of pages is in the middle of a host
2873 * page, then we need to fixup this host page.
2875 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2876 /* Find the end of this run */
2877 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2879 * If the end isn't at the start of a host page, then the
2880 * run doesn't finish at the end of a host page
2881 * and we need to discard.
2885 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2886 unsigned long page;
2887 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2888 host_ratio);
2889 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2891 /* Clean up the bitmap */
2892 for (page = fixup_start_addr;
2893 page < fixup_start_addr + host_ratio; page++) {
2895 * Remark them as dirty, updating the count for any pages
2896 * that weren't previously dirty.
2898 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2902 /* Find the next dirty page for the next iteration */
2903 run_start = find_next_bit(bitmap, pages, run_start);
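/*
 * The effect of the pass above, written out as a naive standalone sketch
 * (hypothetical helper, not QEMU code): every group of host_ratio
 * target-page bits is forced to all-ones as soon as any bit in the group is
 * set, so a host page is always either fully dirty or fully clean.
 */
#if 0 /* standalone illustration, compiled out */
#include <stdbool.h>
#include <stddef.h>

static void canonicalize_to_host_pages(bool *dirty, size_t pages,
                                       size_t host_ratio)
{
    for (size_t hp = 0; hp < pages; hp += host_ratio) {
        bool any = false;

        for (size_t i = hp; i < hp + host_ratio && i < pages; i++) {
            any |= dirty[i];
        }
        if (any) {
            for (size_t i = hp; i < hp + host_ratio && i < pages; i++) {
                dirty[i] = true;
            }
        }
    }
}
#endif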
2908 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2910 * Transmit the set of pages to be discarded after precopy to the target;
2911 * these are pages that:
2912 * a) Have been previously transmitted but are now dirty again
2913 * b) Pages that have never been transmitted, this ensures that
2914 * any pages on the destination that have been mapped by background
2915 * tasks get discarded (transparent huge pages are the specific concern).
2916 * Hopefully this is pretty sparse
2918 * @ms: current migration state
2920 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2922 RAMState *rs = ram_state;
2924 RCU_READ_LOCK_GUARD();
2926 /* This should be our last sync, the src is now paused */
2927 migration_bitmap_sync(rs);
2929 /* Easiest way to make sure we don't resume in the middle of a host-page */
2930 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2931 rs->last_seen_block = NULL;
2932 rs->last_page = 0;
2934 postcopy_each_ram_send_discard(ms);
2936 trace_ram_postcopy_send_discard_bitmap();
2940 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2942 * Returns zero on success
2944 * @rbname: name of the RAMBlock of the request. NULL means the
2945 * same as the last one.
2946 * @start: RAMBlock starting page
2947 * @length: RAMBlock size
2949 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2951 trace_ram_discard_range(rbname, start, length);
2953 RCU_READ_LOCK_GUARD();
2954 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2956 if (!rb) {
2957 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2958 return -1;
2962 * On the source VM, we don't need to update the received bitmap since
2963 * we don't even have one.
2965 if (rb->receivedmap) {
2966 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2967 length >> qemu_target_page_bits());
2970 return ram_block_discard_range(rb, start, length);
2974 * For every allocation, we will try not to crash the VM if the
2975 * allocation fails.
2977 static int xbzrle_init(void)
2979 Error *local_err = NULL;
2981 if (!migrate_xbzrle()) {
2982 return 0;
2985 XBZRLE_cache_lock();
2987 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2988 if (!XBZRLE.zero_target_page) {
2989 error_report("%s: Error allocating zero page", __func__);
2990 goto err_out;
2993 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2994 TARGET_PAGE_SIZE, &local_err);
2995 if (!XBZRLE.cache) {
2996 error_report_err(local_err);
2997 goto free_zero_page;
3000 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3001 if (!XBZRLE.encoded_buf) {
3002 error_report("%s: Error allocating encoded_buf", __func__);
3003 goto free_cache;
3006 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3007 if (!XBZRLE.current_buf) {
3008 error_report("%s: Error allocating current_buf", __func__);
3009 goto free_encoded_buf;
3012 /* We are all good */
3013 XBZRLE_cache_unlock();
3014 return 0;
3016 free_encoded_buf:
3017 g_free(XBZRLE.encoded_buf);
3018 XBZRLE.encoded_buf = NULL;
3019 free_cache:
3020 cache_fini(XBZRLE.cache);
3021 XBZRLE.cache = NULL;
3022 free_zero_page:
3023 g_free(XBZRLE.zero_target_page);
3024 XBZRLE.zero_target_page = NULL;
3025 err_out:
3026 XBZRLE_cache_unlock();
3027 return -ENOMEM;
3030 static int ram_state_init(RAMState **rsp)
3032 *rsp = g_try_new0(RAMState, 1);
3034 if (!*rsp) {
3035 error_report("%s: Init ramstate fail", __func__);
3036 return -1;
3039 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3040 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3041 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3042 (*rsp)->ram_bytes_total = ram_bytes_total();
3045 * Count the total number of pages used by ram blocks not including any
3046 * gaps due to alignment or unplugs.
3047 * This must match the initial values of the dirty bitmap.
3049 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3050 ram_state_reset(*rsp);
3052 return 0;
3055 static void ram_list_init_bitmaps(void)
3057 MigrationState *ms = migrate_get_current();
3058 RAMBlock *block;
3059 unsigned long pages;
3060 uint8_t shift;
3062 /* Skip setting bitmap if there is no RAM */
3063 if (ram_bytes_total()) {
3064 shift = ms->clear_bitmap_shift;
3065 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3066 error_report("clear_bitmap_shift (%u) too big, using "
3067 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3068 shift = CLEAR_BITMAP_SHIFT_MAX;
3069 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3070 error_report("clear_bitmap_shift (%u) too small, using "
3071 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3072 shift = CLEAR_BITMAP_SHIFT_MIN;
3075 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3076 pages = block->max_length >> TARGET_PAGE_BITS;
3078 * The initial dirty bitmap for migration must be set with all
3079 * ones to make sure we'll migrate every guest RAM page to
3080 * the destination.
3081 * Here we set RAMBlock.bmap all to 1 because when restarting a
3082 * new migration after a failed migration, ram_list.
3083 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3084 * guest memory.
3086 block->bmap = bitmap_new(pages);
3087 bitmap_set(block->bmap, 0, pages);
3088 block->clear_bmap_shift = shift;
3089 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3094 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3096 unsigned long pages;
3097 RAMBlock *rb;
3099 RCU_READ_LOCK_GUARD();
3101 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3102 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3103 rs->migration_dirty_pages -= pages;
3107 static void ram_init_bitmaps(RAMState *rs)
3109 /* For memory_global_dirty_log_start below. */
3110 qemu_mutex_lock_iothread();
3111 qemu_mutex_lock_ramlist();
3113 WITH_RCU_READ_LOCK_GUARD() {
3114 ram_list_init_bitmaps();
3115 /* We don't use dirty log with background snapshots */
3116 if (!migrate_background_snapshot()) {
3117 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3118 migration_bitmap_sync_precopy(rs);
3121 qemu_mutex_unlock_ramlist();
3122 qemu_mutex_unlock_iothread();
3125 * After an eventual first bitmap sync, fixup the initial bitmap
3126 * containing all 1s to exclude any discarded pages from migration.
3128 migration_bitmap_clear_discarded_pages(rs);
3131 static int ram_init_all(RAMState **rsp)
3133 if (ram_state_init(rsp)) {
3134 return -1;
3137 if (xbzrle_init()) {
3138 ram_state_cleanup(rsp);
3139 return -1;
3142 ram_init_bitmaps(*rsp);
3144 return 0;
3147 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3149 RAMBlock *block;
3150 uint64_t pages = 0;
3153 * Postcopy is not using xbzrle/compression, so no need for that.
3154 * Also, since the source is already halted, we don't need to care
3155 * about dirty page logging either.
3158 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3159 pages += bitmap_count_one(block->bmap,
3160 block->used_length >> TARGET_PAGE_BITS);
3163 /* This may not be aligned with current bitmaps. Recalculate. */
3164 rs->migration_dirty_pages = pages;
3166 ram_state_reset(rs);
3168 /* Update RAMState cache of output QEMUFile */
3169 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3171 trace_ram_state_resume_prepare(pages);
3175 * This function clears bits of the free pages reported by the caller from the
3176 * migration dirty bitmap. @addr is the host address corresponding to the
3177 * start of the contiguous guest free pages, and @len is the total bytes of
3178 * those pages.
3180 void qemu_guest_free_page_hint(void *addr, size_t len)
3182 RAMBlock *block;
3183 ram_addr_t offset;
3184 size_t used_len, start, npages;
3185 MigrationState *s = migrate_get_current();
3187 /* This function is currently expected to be used during live migration */
3188 if (!migration_is_setup_or_active(s->state)) {
3189 return;
3192 for (; len > 0; len -= used_len, addr += used_len) {
3193 block = qemu_ram_block_from_host(addr, false, &offset);
3194 if (unlikely(!block || offset >= block->used_length)) {
3196 * The implementation might not support RAMBlock resize during
3197 * live migration, but it could happen in theory with future
3198 * updates. So we add a check here to capture that case.
3200 error_report_once("%s unexpected error", __func__);
3201 return;
3204 if (len <= block->used_length - offset) {
3205 used_len = len;
3206 } else {
3207 used_len = block->used_length - offset;
3210 start = offset >> TARGET_PAGE_BITS;
3211 npages = used_len >> TARGET_PAGE_BITS;
3213 qemu_mutex_lock(&ram_state->bitmap_mutex);
3215 * The skipped free pages are equivalent to having been sent from clear_bmap's
3216 * perspective, so clear the bits from the memory region bitmap which
3217 * are initially set. Otherwise those skipped pages will be sent in
3218 * the next round after syncing from the memory region bitmap.
3220 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3221 ram_state->migration_dirty_pages -=
3222 bitmap_count_one_with_offset(block->bmap, start, npages);
3223 bitmap_clear(block->bmap, start, npages);
3224 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3229 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3230 * a long-running RCU critical section. When rcu-reclaims in the code
3231 * start to become numerous it will be necessary to reduce the
3232 * granularity of these critical sections.
3236 * ram_save_setup: Setup RAM for migration
3238 * Returns zero to indicate success and negative for error
3240 * @f: QEMUFile where to send the data
3241 * @opaque: RAMState pointer
3243 static int ram_save_setup(QEMUFile *f, void *opaque)
3245 RAMState **rsp = opaque;
3246 RAMBlock *block;
3247 int ret;
3249 if (compress_threads_save_setup()) {
3250 return -1;
3253 /* migration has already setup the bitmap, reuse it. */
3254 if (!migration_in_colo_state()) {
3255 if (ram_init_all(rsp) != 0) {
3256 compress_threads_save_cleanup();
3257 return -1;
3260 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3262 WITH_RCU_READ_LOCK_GUARD() {
3263 qemu_put_be64(f, ram_bytes_total_with_ignored()
3264 | RAM_SAVE_FLAG_MEM_SIZE);
3266 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3267 qemu_put_byte(f, strlen(block->idstr));
3268 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3269 qemu_put_be64(f, block->used_length);
3270 if (migrate_postcopy_ram() && block->page_size !=
3271 qemu_host_page_size) {
3272 qemu_put_be64(f, block->page_size);
3274 if (migrate_ignore_shared()) {
3275 qemu_put_be64(f, block->mr->addr);
3280 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3281 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3283 migration_ops = g_malloc0(sizeof(MigrationOps));
3284 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3285 ret = multifd_send_sync_main(f);
3286 if (ret < 0) {
3287 return ret;
3290 if (!migrate_multifd_flush_after_each_section()) {
3291 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3294 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3295 qemu_fflush(f);
3297 return 0;
3301 * ram_save_iterate: iterative stage for migration
3303 * Returns zero to indicate success and negative for error
3305 * @f: QEMUFile where to send the data
3306 * @opaque: RAMState pointer
3308 static int ram_save_iterate(QEMUFile *f, void *opaque)
3310 RAMState **temp = opaque;
3311 RAMState *rs = *temp;
3312 int ret = 0;
3313 int i;
3314 int64_t t0;
3315 int done = 0;
3317 if (blk_mig_bulk_active()) {
3318 /* Avoid transferring ram during bulk phase of block migration as
3319 * the bulk phase will usually take a long time and transferring
3320 * ram updates during that time is pointless. */
3321 goto out;
3325 * We'll hold this lock a little bit long, but it's okay for two reasons.
3326 * Firstly, the only possible other thread to take it is the one that calls
3327 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3328 * MAX_WAIT below (if curious, further see commit 4508bd9ed8053ce), which
3329 * guarantees that we'll at least release it on a regular basis.
3331 qemu_mutex_lock(&rs->bitmap_mutex);
3332 WITH_RCU_READ_LOCK_GUARD() {
3333 if (ram_list.version != rs->last_version) {
3334 ram_state_reset(rs);
3337 /* Read version before ram_list.blocks */
3338 smp_rmb();
3340 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3342 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3343 i = 0;
3344 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3345 postcopy_has_request(rs)) {
3346 int pages;
3348 if (qemu_file_get_error(f)) {
3349 break;
3352 pages = ram_find_and_save_block(rs);
3353 /* no more pages to send */
3354 if (pages == 0) {
3355 done = 1;
3356 break;
3359 if (pages < 0) {
3360 qemu_file_set_error(f, pages);
3361 break;
3364 rs->target_page_count += pages;
3367 * During postcopy, it is necessary to make sure one whole host
3368 * page is sent in one chunk.
3370 if (migrate_postcopy_ram()) {
3371 flush_compressed_data(rs);
3375 * We want to check in the 1st loop, just in case it was the 1st
3376 * time and we had to sync the dirty bitmap.
3377 * qemu_clock_get_ns() is a bit expensive, so we only check once
3378 * every few iterations.
3380 if ((i & 63) == 0) {
3381 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3382 1000000;
3383 if (t1 > MAX_WAIT) {
3384 trace_ram_save_iterate_big_wait(t1, i);
3385 break;
3388 i++;
3391 qemu_mutex_unlock(&rs->bitmap_mutex);
3394 * Must occur before EOS (or any QEMUFile operation)
3395 * because of RDMA protocol.
3397 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3399 out:
3400 if (ret >= 0
3401 && migration_is_setup_or_active(migrate_get_current()->state)) {
3402 if (migrate_multifd_flush_after_each_section()) {
3403 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3404 if (ret < 0) {
3405 return ret;
3409 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3410 qemu_fflush(f);
3411 ram_transferred_add(8);
3413 ret = qemu_file_get_error(f);
3415 if (ret < 0) {
3416 return ret;
3419 return done;
3423 * ram_save_complete: function called to send the remaining amount of ram
3425 * Returns zero to indicate success or negative on error
3427 * Called with iothread lock
3429 * @f: QEMUFile where to send the data
3430 * @opaque: RAMState pointer
3432 static int ram_save_complete(QEMUFile *f, void *opaque)
3434 RAMState **temp = opaque;
3435 RAMState *rs = *temp;
3436 int ret = 0;
3438 rs->last_stage = !migration_in_colo_state();
3440 WITH_RCU_READ_LOCK_GUARD() {
3441 if (!migration_in_postcopy()) {
3442 migration_bitmap_sync_precopy(rs);
3445 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3447 /* try transferring iterative blocks of memory */
3449 /* flush all remaining blocks regardless of rate limiting */
3450 qemu_mutex_lock(&rs->bitmap_mutex);
3451 while (true) {
3452 int pages;
3454 pages = ram_find_and_save_block(rs);
3455 /* no more blocks to send */
3456 if (pages == 0) {
3457 break;
3459 if (pages < 0) {
3460 ret = pages;
3461 break;
3464 qemu_mutex_unlock(&rs->bitmap_mutex);
3466 flush_compressed_data(rs);
3467 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3470 if (ret < 0) {
3471 return ret;
3474 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3475 if (ret < 0) {
3476 return ret;
3479 if (!migrate_multifd_flush_after_each_section()) {
3480 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3482 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3483 qemu_fflush(f);
3485 return 0;
3488 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3489 uint64_t *can_postcopy)
3491 RAMState **temp = opaque;
3492 RAMState *rs = *temp;
3494 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3496 if (migrate_postcopy_ram()) {
3497 /* We can do postcopy, and all the data is postcopiable */
3498 *can_postcopy += remaining_size;
3499 } else {
3500 *must_precopy += remaining_size;
3504 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3505 uint64_t *can_postcopy)
3507 MigrationState *s = migrate_get_current();
3508 RAMState **temp = opaque;
3509 RAMState *rs = *temp;
3511 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3513 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3514 qemu_mutex_lock_iothread();
3515 WITH_RCU_READ_LOCK_GUARD() {
3516 migration_bitmap_sync_precopy(rs);
3518 qemu_mutex_unlock_iothread();
3519 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3522 if (migrate_postcopy_ram()) {
3523 /* We can do postcopy, and all the data is postcopiable */
3524 *can_postcopy += remaining_size;
3525 } else {
3526 *must_precopy += remaining_size;
3530 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3532 unsigned int xh_len;
3533 int xh_flags;
3534 uint8_t *loaded_data;
3536 /* extract RLE header */
3537 xh_flags = qemu_get_byte(f);
3538 xh_len = qemu_get_be16(f);
3540 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3541 error_report("Failed to load XBZRLE page - wrong compression!");
3542 return -1;
3545 if (xh_len > TARGET_PAGE_SIZE) {
3546 error_report("Failed to load XBZRLE page - len overflow!");
3547 return -1;
3549 loaded_data = XBZRLE.decoded_buf;
3550 /* load data and decode */
3551 /* it can change loaded_data to point to an internal buffer */
3552 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3554 /* decode RLE */
3555 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3556 TARGET_PAGE_SIZE) == -1) {
3557 error_report("Failed to load XBZRLE page - decode error!");
3558 return -1;
3561 return 0;
3565 * ram_block_from_stream: read a RAMBlock id from the migration stream
3567 * Must be called from within an RCU critical section.
3569 * Returns a pointer from within the RCU-protected ram_list.
3571 * @mis: the migration incoming state pointer
3572 * @f: QEMUFile where to read the data from
3573 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3574 * @channel: the channel we're using
3576 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3577 QEMUFile *f, int flags,
3578 int channel)
3580 RAMBlock *block = mis->last_recv_block[channel];
3581 char id[256];
3582 uint8_t len;
3584 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3585 if (!block) {
3586 error_report("Ack, bad migration stream!");
3587 return NULL;
3589 return block;
3592 len = qemu_get_byte(f);
3593 qemu_get_buffer(f, (uint8_t *)id, len);
3594 id[len] = 0;
3596 block = qemu_ram_block_by_name(id);
3597 if (!block) {
3598 error_report("Can't find block %s", id);
3599 return NULL;
3602 if (ramblock_is_ignored(block)) {
3603 error_report("block %s should not be migrated !", id);
3604 return NULL;
3607 mis->last_recv_block[channel] = block;
3609 return block;
3612 static inline void *host_from_ram_block_offset(RAMBlock *block,
3613 ram_addr_t offset)
3615 if (!offset_in_ramblock(block, offset)) {
3616 return NULL;
3619 return block->host + offset;
3622 static void *host_page_from_ram_block_offset(RAMBlock *block,
3623 ram_addr_t offset)
3625 /* Note: Explicitly no check against offset_in_ramblock(). */
3626 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3627 block->page_size);
3630 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3631 ram_addr_t offset)
3633 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3636 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3637 ram_addr_t offset, bool record_bitmap)
3639 if (!offset_in_ramblock(block, offset)) {
3640 return NULL;
3642 if (!block->colo_cache) {
3643 error_report("%s: colo_cache is NULL in block :%s",
3644 __func__, block->idstr);
3645 return NULL;
3649 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3650 * It helps us decide which pages in the ram cache should be flushed
3651 * into the VM's RAM later.
3653 if (record_bitmap &&
3654 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3655 ram_state->migration_dirty_pages++;
3657 return block->colo_cache + offset;
3661 * ram_handle_compressed: handle the zero page case
3663 * If a page (or a whole RDMA chunk) has been
3664 * determined to be zero, then zap it.
3666 * @host: host address for the zero page
3667 * @ch: what the page is filled from. We only support zero
3668 * @size: size of the zero page
3670 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3672 if (ch != 0 || !buffer_is_zero(host, size)) {
3673 memset(host, ch, size);
3677 /* return the size after decompression, or negative value on error */
3678 static int
3679 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3680 const uint8_t *source, size_t source_len)
3682 int err;
3684 err = inflateReset(stream);
3685 if (err != Z_OK) {
3686 return -1;
3689 stream->avail_in = source_len;
3690 stream->next_in = (uint8_t *)source;
3691 stream->avail_out = dest_len;
3692 stream->next_out = dest;
3694 err = inflate(stream, Z_NO_FLUSH);
3695 if (err != Z_STREAM_END) {
3696 return -1;
3699 return stream->total_out;
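/*
 * A self-contained zlib round trip showing the same reset-and-reuse pattern
 * as qemu_uncompress_data() above (hypothetical standalone code, not the
 * migration wire format): one z_stream is initialised once and then recycled
 * with inflateReset() for every buffer, which is much cheaper than a full
 * inflateInit()/inflateEnd() per page.
 */
#if 0 /* standalone illustration, compiled out */
#include <assert.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    unsigned char src[4096], comp[8192], out[4096];
    uLongf comp_len = sizeof(comp);
    z_stream stream;

    memset(src, 'A', sizeof(src));
    assert(compress2(comp, &comp_len, src, sizeof(src), Z_BEST_SPEED) == Z_OK);

    memset(&stream, 0, sizeof(stream));
    assert(inflateInit(&stream) == Z_OK);

    /* Per-buffer decompression, reusing the stream via inflateReset() */
    assert(inflateReset(&stream) == Z_OK);
    stream.next_in = comp;
    stream.avail_in = comp_len;
    stream.next_out = out;
    stream.avail_out = sizeof(out);
    assert(inflate(&stream, Z_NO_FLUSH) == Z_STREAM_END);
    assert(stream.total_out == sizeof(src));
    assert(memcmp(src, out, sizeof(src)) == 0);

    inflateEnd(&stream);
    return 0;
}
#endif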
3702 static void *do_data_decompress(void *opaque)
3704 DecompressParam *param = opaque;
3705 unsigned long pagesize;
3706 uint8_t *des;
3707 int len, ret;
3709 qemu_mutex_lock(&param->mutex);
3710 while (!param->quit) {
3711 if (param->des) {
3712 des = param->des;
3713 len = param->len;
3714 param->des = 0;
3715 qemu_mutex_unlock(&param->mutex);
3717 pagesize = TARGET_PAGE_SIZE;
3719 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3720 param->compbuf, len);
3721 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3722 error_report("decompress data failed");
3723 qemu_file_set_error(decomp_file, ret);
3726 qemu_mutex_lock(&decomp_done_lock);
3727 param->done = true;
3728 qemu_cond_signal(&decomp_done_cond);
3729 qemu_mutex_unlock(&decomp_done_lock);
3731 qemu_mutex_lock(&param->mutex);
3732 } else {
3733 qemu_cond_wait(&param->cond, &param->mutex);
3736 qemu_mutex_unlock(&param->mutex);
3738 return NULL;
3741 static int wait_for_decompress_done(void)
3743 int idx, thread_count;
3745 if (!migrate_compress()) {
3746 return 0;
3749 thread_count = migrate_decompress_threads();
3750 qemu_mutex_lock(&decomp_done_lock);
3751 for (idx = 0; idx < thread_count; idx++) {
3752 while (!decomp_param[idx].done) {
3753 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3756 qemu_mutex_unlock(&decomp_done_lock);
3757 return qemu_file_get_error(decomp_file);
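/*
 * The waiting scheme above is the classic condition-variable "done flag"
 * pattern; a minimal pthread sketch of it (hypothetical names, not QEMU
 * code): the worker marks itself done under the lock and signals, while the
 * waiter loops on the predicate to cope with spurious wakeups.
 */
#if 0 /* standalone illustration, compiled out */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static bool done;

static void *worker(void *arg)
{
    (void)arg;
    /* ... perform the decompression work ... */
    pthread_mutex_lock(&done_lock);
    done = true;
    pthread_cond_signal(&done_cond);
    pthread_mutex_unlock(&done_lock);
    return NULL;
}

static void wait_for_worker(void)
{
    pthread_mutex_lock(&done_lock);
    while (!done) {
        pthread_cond_wait(&done_cond, &done_lock);
    }
    pthread_mutex_unlock(&done_lock);
}
#endif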
3760 static void compress_threads_load_cleanup(void)
3762 int i, thread_count;
3764 if (!migrate_compress()) {
3765 return;
3767 thread_count = migrate_decompress_threads();
3768 for (i = 0; i < thread_count; i++) {
3770 * we use it as an indicator which shows whether the thread is
3771 * properly init'd or not
3773 if (!decomp_param[i].compbuf) {
3774 break;
3777 qemu_mutex_lock(&decomp_param[i].mutex);
3778 decomp_param[i].quit = true;
3779 qemu_cond_signal(&decomp_param[i].cond);
3780 qemu_mutex_unlock(&decomp_param[i].mutex);
3782 for (i = 0; i < thread_count; i++) {
3783 if (!decomp_param[i].compbuf) {
3784 break;
3787 qemu_thread_join(decompress_threads + i);
3788 qemu_mutex_destroy(&decomp_param[i].mutex);
3789 qemu_cond_destroy(&decomp_param[i].cond);
3790 inflateEnd(&decomp_param[i].stream);
3791 g_free(decomp_param[i].compbuf);
3792 decomp_param[i].compbuf = NULL;
3794 g_free(decompress_threads);
3795 g_free(decomp_param);
3796 decompress_threads = NULL;
3797 decomp_param = NULL;
3798 decomp_file = NULL;
3801 static int compress_threads_load_setup(QEMUFile *f)
3803 int i, thread_count;
3805 if (!migrate_compress()) {
3806 return 0;
3809 thread_count = migrate_decompress_threads();
3810 decompress_threads = g_new0(QemuThread, thread_count);
3811 decomp_param = g_new0(DecompressParam, thread_count);
3812 qemu_mutex_init(&decomp_done_lock);
3813 qemu_cond_init(&decomp_done_cond);
3814 decomp_file = f;
3815 for (i = 0; i < thread_count; i++) {
3816 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3817 goto exit;
3820 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3821 qemu_mutex_init(&decomp_param[i].mutex);
3822 qemu_cond_init(&decomp_param[i].cond);
3823 decomp_param[i].done = true;
3824 decomp_param[i].quit = false;
3825 qemu_thread_create(decompress_threads + i, "decompress",
3826 do_data_decompress, decomp_param + i,
3827 QEMU_THREAD_JOINABLE);
3829 return 0;
3830 exit:
3831 compress_threads_load_cleanup();
3832 return -1;
3835 static void decompress_data_with_multi_threads(QEMUFile *f,
3836 void *host, int len)
3838 int idx, thread_count;
3840 thread_count = migrate_decompress_threads();
3841 QEMU_LOCK_GUARD(&decomp_done_lock);
3842 while (true) {
3843 for (idx = 0; idx < thread_count; idx++) {
3844 if (decomp_param[idx].done) {
3845 decomp_param[idx].done = false;
3846 qemu_mutex_lock(&decomp_param[idx].mutex);
3847 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3848 decomp_param[idx].des = host;
3849 decomp_param[idx].len = len;
3850 qemu_cond_signal(&decomp_param[idx].cond);
3851 qemu_mutex_unlock(&decomp_param[idx].mutex);
3852 break;
3855 if (idx < thread_count) {
3856 break;
3857 } else {
3858 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3863 static void colo_init_ram_state(void)
3865 ram_state_init(&ram_state);
3869 * colo cache: this is for the secondary VM, we cache the whole
3870 * memory of the secondary VM, and the global lock needs to be held
3871 * to call this helper.
3873 int colo_init_ram_cache(void)
3875 RAMBlock *block;
3877 WITH_RCU_READ_LOCK_GUARD() {
3878 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3879 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3880 NULL, false, false);
3881 if (!block->colo_cache) {
3882 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3883 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3884 block->used_length);
3885 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3886 if (block->colo_cache) {
3887 qemu_anon_ram_free(block->colo_cache, block->used_length);
3888 block->colo_cache = NULL;
3891 return -errno;
3893 if (!machine_dump_guest_core(current_machine)) {
3894 qemu_madvise(block->colo_cache, block->used_length,
3895 QEMU_MADV_DONTDUMP);
3901 * Record the dirty pages that were sent by the PVM. We use this dirty bitmap
3902 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3903 * we use the same name 'ram_bitmap' as for migration.
3905 if (ram_bytes_total()) {
3906 RAMBlock *block;
3908 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3909 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3910 block->bmap = bitmap_new(pages);
3914 colo_init_ram_state();
3915 return 0;
3918 /* TODO: duplicated with ram_init_bitmaps */
3919 void colo_incoming_start_dirty_log(void)
3921 RAMBlock *block = NULL;
3922 /* For memory_global_dirty_log_start below. */
3923 qemu_mutex_lock_iothread();
3924 qemu_mutex_lock_ramlist();
3926 memory_global_dirty_log_sync();
3927 WITH_RCU_READ_LOCK_GUARD() {
3928 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3929 ramblock_sync_dirty_bitmap(ram_state, block);
3930 /* Discard this dirty bitmap record */
3931 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3933 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3935 ram_state->migration_dirty_pages = 0;
3936 qemu_mutex_unlock_ramlist();
3937 qemu_mutex_unlock_iothread();
3940 /* The global lock needs to be held to call this helper */
3941 void colo_release_ram_cache(void)
3943 RAMBlock *block;
3945 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3946 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3947 g_free(block->bmap);
3948 block->bmap = NULL;
3951 WITH_RCU_READ_LOCK_GUARD() {
3952 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3953 if (block->colo_cache) {
3954 qemu_anon_ram_free(block->colo_cache, block->used_length);
3955 block->colo_cache = NULL;
3959 ram_state_cleanup(&ram_state);
3963 * ram_load_setup: Setup RAM for migration incoming side
3965 * Returns zero to indicate success and negative for error
3967 * @f: QEMUFile where to receive the data
3968 * @opaque: RAMState pointer
3970 static int ram_load_setup(QEMUFile *f, void *opaque)
3972 if (compress_threads_load_setup(f)) {
3973 return -1;
3976 xbzrle_load_setup();
3977 ramblock_recv_map_init();
3979 return 0;
3982 static int ram_load_cleanup(void *opaque)
3984 RAMBlock *rb;
3986 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3987 qemu_ram_block_writeback(rb);
3990 xbzrle_load_cleanup();
3991 compress_threads_load_cleanup();
3993 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3994 g_free(rb->receivedmap);
3995 rb->receivedmap = NULL;
3998 return 0;
4002 * ram_postcopy_incoming_init: allocate postcopy data structures
4004 * Returns 0 for success and negative if there was an error
4006 * @mis: current migration incoming state
4008 * Allocate data structures etc needed by incoming migration with
4009 * postcopy-ram. postcopy-ram's similarly named
4010 * postcopy_ram_incoming_init does the work.
4012 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4014 return postcopy_ram_incoming_init(mis);
4018 * ram_load_postcopy: load a page in postcopy case
4020 * Returns 0 for success or -errno in case of error
4022 * Called in postcopy mode by ram_load().
4023 * rcu_read_lock is taken prior to this being called.
4025 * @f: QEMUFile where to send the data
4026 * @channel: the channel to use for loading
4028 int ram_load_postcopy(QEMUFile *f, int channel)
4030 int flags = 0, ret = 0;
4031 bool place_needed = false;
4032 bool matches_target_page_size = false;
4033 MigrationIncomingState *mis = migration_incoming_get_current();
4034 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4036 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4037 ram_addr_t addr;
4038 void *page_buffer = NULL;
4039 void *place_source = NULL;
4040 RAMBlock *block = NULL;
4041 uint8_t ch;
4042 int len;
4044 addr = qemu_get_be64(f);
4047 * If there is a qemu file error, we should stop here, and then "addr"
4048 * may be invalid
4050 ret = qemu_file_get_error(f);
4051 if (ret) {
4052 break;
4055 flags = addr & ~TARGET_PAGE_MASK;
4056 addr &= TARGET_PAGE_MASK;
4058 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4059 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4060 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4061 block = ram_block_from_stream(mis, f, flags, channel);
4062 if (!block) {
4063 ret = -EINVAL;
4064 break;
4068 * Relying on used_length is racy and can result in false positives.
4069 * We might place pages beyond used_length in case RAM was shrunk
4070 * while in postcopy, which is fine - trying to place via
4071 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4073 if (!block->host || addr >= block->postcopy_length) {
4074 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4075 ret = -EINVAL;
4076 break;
4078 tmp_page->target_pages++;
4079 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4081 * Postcopy requires that we place whole host pages atomically;
4082 * these may be huge pages for RAMBlocks that are backed by
4083 * hugetlbfs.
4084 * To make it atomic, the data is read into a temporary page
4085 * that's moved into place later.
4086 * The migration protocol uses (possibly smaller) target pages;
4087 * however, the source ensures it always sends all the components
4088 * of a host page in one chunk.
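* For example, assuming 2 MiB hugetlbfs host pages and a 4 KiB target
* page size, 512 target pages are accumulated in tmp_huge_page and the
* whole host page is placed in one go once the last of them arrives.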
4090 page_buffer = tmp_page->tmp_huge_page +
4091 host_page_offset_from_ram_block_offset(block, addr);
4092 /* If all target pages are zero then we can optimise the placement */
4093 if (tmp_page->target_pages == 1) {
4094 tmp_page->host_addr =
4095 host_page_from_ram_block_offset(block, addr);
4096 } else if (tmp_page->host_addr !=
4097 host_page_from_ram_block_offset(block, addr)) {
4098 /* not the first target page within the host page */
4099 error_report("Non-same host page detected on channel %d: "
4100 "Target host page %p, received host page %p "
4101 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4102 channel, tmp_page->host_addr,
4103 host_page_from_ram_block_offset(block, addr),
4104 block->idstr, addr, tmp_page->target_pages);
4105 ret = -EINVAL;
4106 break;
4110 * If it's the last part of a host page then we place the host
4111 * page
4113 if (tmp_page->target_pages ==
4114 (block->page_size / TARGET_PAGE_SIZE)) {
4115 place_needed = true;
4117 place_source = tmp_page->tmp_huge_page;
4120 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4121 case RAM_SAVE_FLAG_ZERO:
4122 ch = qemu_get_byte(f);
4124 * We can skip setting page_buffer when
4125 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4127 if (ch || !matches_target_page_size) {
4128 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4130 if (ch) {
4131 tmp_page->all_zero = false;
4133 break;
4135 case RAM_SAVE_FLAG_PAGE:
4136 tmp_page->all_zero = false;
4137 if (!matches_target_page_size) {
4138 /* For huge pages, we always use a temporary buffer */
4139 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4140 } else {
4142 * For small pages that match the target page size, we
4143 * avoid the qemu_file copy. Instead we directly use
4144 * the QEMUFile's internal buffer to place the page. Note: we
4145 * must not do any QEMUFile operation before using that
4146 * buffer, to make sure the buffer is still valid when
4147 * placing the page.
4149 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4150 TARGET_PAGE_SIZE);
4152 break;
4153 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4154 tmp_page->all_zero = false;
4155 len = qemu_get_be32(f);
4156 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4157 error_report("Invalid compressed data length: %d", len);
4158 ret = -EINVAL;
4159 break;
4161 decompress_data_with_multi_threads(f, page_buffer, len);
4162 break;
4163 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4164 multifd_recv_sync_main();
4165 break;
4166 case RAM_SAVE_FLAG_EOS:
4167 /* normal exit */
4168 if (migrate_multifd_flush_after_each_section()) {
4169 multifd_recv_sync_main();
4171 break;
4172 default:
4173 error_report("Unknown combination of migration flags: 0x%x"
4174 " (postcopy mode)", flags);
4175 ret = -EINVAL;
4176 break;
4179 /* Got the whole host page, wait for decompress before placing. */
4180 if (place_needed) {
4181 ret |= wait_for_decompress_done();
4184 /* Detect any possible file errors */
4185 if (!ret && qemu_file_get_error(f)) {
4186 ret = qemu_file_get_error(f);
4189 if (!ret && place_needed) {
4190 if (tmp_page->all_zero) {
4191 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4192 } else {
4193 ret = postcopy_place_page(mis, tmp_page->host_addr,
4194 place_source, block);
4196 place_needed = false;
4197 postcopy_temp_page_reset(tmp_page);
4201 return ret;
4204 static bool postcopy_is_running(void)
4206 PostcopyState ps = postcopy_state_get();
4207 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4211 * Flush the content of the RAM cache into the SVM's memory.
4212 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4214 void colo_flush_ram_cache(void)
4216 RAMBlock *block = NULL;
4217 void *dst_host;
4218 void *src_host;
4219 unsigned long offset = 0;
4221 memory_global_dirty_log_sync();
4222 WITH_RCU_READ_LOCK_GUARD() {
4223 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4224 ramblock_sync_dirty_bitmap(ram_state, block);
4228 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
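/*
 * A sketch of the loop below: each run of 'num' dirty pages found by
 * colo_bitmap_find_dirty() has its dirty bits cleared and is then
 * copied from block->colo_cache into the SVM's RAM with a single
 * memcpy of num * TARGET_PAGE_SIZE bytes.
 */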
4229 WITH_RCU_READ_LOCK_GUARD() {
4230 block = QLIST_FIRST_RCU(&ram_list.blocks);
4232 while (block) {
4233 unsigned long num = 0;
4235 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4236 if (!offset_in_ramblock(block,
4237 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4238 offset = 0;
4239 num = 0;
4240 block = QLIST_NEXT_RCU(block, next);
4241 } else {
4242 unsigned long i = 0;
4244 for (i = 0; i < num; i++) {
4245 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4247 dst_host = block->host
4248 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4249 src_host = block->colo_cache
4250 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4251 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4252 offset += num;
4256 trace_colo_flush_ram_cache_end();
4260 * ram_load_precopy: load pages in precopy case
4262 * Returns 0 for success or -errno in case of error
4264 * Called in precopy mode by ram_load().
4265 * rcu_read_lock is taken prior to this being called.
4267 * @f: QEMUFile where to receive the data
4269 static int ram_load_precopy(QEMUFile *f)
4271 MigrationIncomingState *mis = migration_incoming_get_current();
4272 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4273 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
4274 bool postcopy_advised = migration_incoming_postcopy_advised();
4275 if (!migrate_compress()) {
4276 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4279 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4280 ram_addr_t addr, total_ram_bytes;
4281 void *host = NULL, *host_bak = NULL;
4282 uint8_t ch;
4285 * Yield periodically to let the main loop run, but an iteration of
4286 * the main loop is expensive, so only do it once in a while.
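* (With the check below this works out to at most one yield every
* 32768 pages, since (i & 32767) is only zero when i is a multiple
* of 32768.)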
4288 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4289 aio_co_schedule(qemu_get_current_aio_context(),
4290 qemu_coroutine_self());
4291 qemu_coroutine_yield();
4293 i++;
4295 addr = qemu_get_be64(f);
4296 flags = addr & ~TARGET_PAGE_MASK;
4297 addr &= TARGET_PAGE_MASK;
4299 if (flags & invalid_flags) {
4300 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4301 error_report("Received an unexpected compressed page");
4304 ret = -EINVAL;
4305 break;
4308 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4309 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4310 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4311 RAM_CHANNEL_PRECOPY);
4313 host = host_from_ram_block_offset(block, addr);
4315 * After entering the COLO stage, we should not load pages
4316 * into the SVM's memory directly; we put them into colo_cache first.
4317 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4318 * Previously, we copied all of this memory during the COLO preparation
4319 * stage, which requires stopping the VM and is time-consuming.
4320 * Here we optimize it with a trick: back up every page during the
4321 * migration process while COLO is enabled. Although this slows the
4322 * migration down a little, it clearly reduces the downtime compared to
4323 * backing up all of the SVM's memory in the COLO preparation stage.
4325 if (migration_incoming_colo_enabled()) {
4326 if (migration_incoming_in_colo_state()) {
4327 /* In COLO stage, put all pages into cache temporarily */
4328 host = colo_cache_from_block_offset(block, addr, true);
4329 } else {
4331 * In the migration stage but before the COLO stage,
4332 * put all pages into both the cache and the SVM's memory.
4334 host_bak = colo_cache_from_block_offset(block, addr, false);
4337 if (!host) {
4338 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4339 ret = -EINVAL;
4340 break;
4342 if (!migration_incoming_in_colo_state()) {
4343 ramblock_recv_bitmap_set(block, host);
4346 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4349 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4350 case RAM_SAVE_FLAG_MEM_SIZE:
4351 /* Synchronize RAM block list */
4352 total_ram_bytes = addr;
4353 while (!ret && total_ram_bytes) {
4354 RAMBlock *block;
4355 char id[256];
4356 ram_addr_t length;
4358 len = qemu_get_byte(f);
4359 qemu_get_buffer(f, (uint8_t *)id, len);
4360 id[len] = 0;
4361 length = qemu_get_be64(f);
4363 block = qemu_ram_block_by_name(id);
4364 if (block && !qemu_ram_is_migratable(block)) {
4365 error_report("block %s should not be migrated!", id);
4366 ret = -EINVAL;
4367 } else if (block) {
4368 if (length != block->used_length) {
4369 Error *local_err = NULL;
4371 ret = qemu_ram_resize(block, length,
4372 &local_err);
4373 if (local_err) {
4374 error_report_err(local_err);
4377 /* For postcopy we need to check hugepage sizes match */
4378 if (postcopy_advised && migrate_postcopy_ram() &&
4379 block->page_size != qemu_host_page_size) {
4380 uint64_t remote_page_size = qemu_get_be64(f);
4381 if (remote_page_size != block->page_size) {
4382 error_report("Mismatched RAM page size %s "
4383 "(local) %zd != %" PRId64,
4384 id, block->page_size,
4385 remote_page_size);
4386 ret = -EINVAL;
4389 if (migrate_ignore_shared()) {
4390 hwaddr addr = qemu_get_be64(f);
4391 if (ramblock_is_ignored(block) &&
4392 block->mr->addr != addr) {
4393 error_report("Mismatched GPAs for block %s "
4394 "%" PRId64 "!= %" PRId64,
4395 id, (uint64_t)addr,
4396 (uint64_t)block->mr->addr);
4397 ret = -EINVAL;
4400 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4401 block->idstr);
4402 } else {
4403 error_report("Unknown ramblock \"%s\", cannot "
4404 "accept migration", id);
4405 ret = -EINVAL;
4408 total_ram_bytes -= length;
4410 break;
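/*
 * For reference, each per-block record parsed by the MEM_SIZE case
 * above is laid out roughly as follows (a sketch; the optional fields
 * mirror the sender-side conditions):
 *   [idstr length: 1 byte][idstr bytes][used_length: be64]
 *   [page size: be64, only when postcopy was advised and the page
 *    size differs from the host page size]
 *   [GPA: be64, only when the ignore-shared capability is enabled]
 */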
4412 case RAM_SAVE_FLAG_ZERO:
4413 ch = qemu_get_byte(f);
4414 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4415 break;
4417 case RAM_SAVE_FLAG_PAGE:
4418 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4419 break;
4421 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4422 len = qemu_get_be32(f);
4423 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4424 error_report("Invalid compressed data length: %d", len);
4425 ret = -EINVAL;
4426 break;
4428 decompress_data_with_multi_threads(f, host, len);
4429 break;
4431 case RAM_SAVE_FLAG_XBZRLE:
4432 if (load_xbzrle(f, addr, host) < 0) {
4433 error_report("Failed to decompress XBZRLE page at "
4434 RAM_ADDR_FMT, addr);
4435 ret = -EINVAL;
4436 break;
4438 break;
4439 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4440 multifd_recv_sync_main();
4441 break;
4442 case RAM_SAVE_FLAG_EOS:
4443 /* normal exit */
4444 if (migrate_multifd_flush_after_each_section()) {
4445 multifd_recv_sync_main();
4447 break;
4448 case RAM_SAVE_FLAG_HOOK:
4449 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4450 break;
4451 default:
4452 error_report("Unknown combination of migration flags: 0x%x", flags);
4453 ret = -EINVAL;
4455 if (!ret) {
4456 ret = qemu_file_get_error(f);
4458 if (!ret && host_bak) {
4459 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4463 ret |= wait_for_decompress_done();
4464 return ret;
4467 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4469 int ret = 0;
4470 static uint64_t seq_iter;
4472 * If the system is running in postcopy mode, page inserts into host
4473 * memory must be atomic.
4475 bool postcopy_running = postcopy_is_running();
4477 seq_iter++;
4479 if (version_id != 4) {
4480 return -EINVAL;
4484 * This RCU critical section can be very long running.
4485 * If RCU reclamation in this code starts to become frequent,
4486 * it will be necessary to reduce the granularity of this
4487 * critical section.
4489 WITH_RCU_READ_LOCK_GUARD() {
4490 if (postcopy_running) {
4492 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4493 * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4494 * service fast page faults.
4496 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4497 } else {
4498 ret = ram_load_precopy(f);
4501 trace_ram_load_complete(ret, seq_iter);
4503 return ret;
4506 static bool ram_has_postcopy(void *opaque)
4508 RAMBlock *rb;
4509 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4510 if (ramblock_is_pmem(rb)) {
4511 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4512 "is not supported now!", rb->idstr, rb->host);
4513 return false;
4517 return migrate_postcopy_ram();
4520 /* Sync all the dirty bitmaps with the destination VM. */
4521 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4523 RAMBlock *block;
4524 QEMUFile *file = s->to_dst_file;
4525 int ramblock_count = 0;
4527 trace_ram_dirty_bitmap_sync_start();
4529 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4530 qemu_savevm_send_recv_bitmap(file, block->idstr);
4531 trace_ram_dirty_bitmap_request(block->idstr);
4532 ramblock_count++;
4535 trace_ram_dirty_bitmap_sync_wait();
4537 /* Wait until all the ramblocks' dirty bitmaps are synced */
4538 while (ramblock_count--) {
4539 qemu_sem_wait(&s->rp_state.rp_sem);
4542 trace_ram_dirty_bitmap_sync_complete();
4544 return 0;
4547 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4549 qemu_sem_post(&s->rp_state.rp_sem);
4553 * Read the received bitmap and invert it to form the initial dirty bitmap.
4554 * This is only used when a paused postcopy migration wants
4555 * to resume from a middle point.
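* Roughly, the message consumed here is: a be64 bitmap size in bytes
* (the local bitmap size rounded up to 8), that many bytes of the
* bitmap in little-endian layout, and a be64 end mark that must be
* RAMBLOCK_RECV_BITMAP_ENDING.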
4557 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4559 int ret = -EINVAL;
4560 /* from_dst_file is always valid because we're within rp_thread */
4561 QEMUFile *file = s->rp_state.from_dst_file;
4562 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4563 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4564 uint64_t size, end_mark;
4566 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4568 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4569 error_report("%s: incorrect state %s", __func__,
4570 MigrationStatus_str(s->state));
4571 return -EINVAL;
4575 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4576 * need the endianness conversion, and the padding.
4578 local_size = ROUND_UP(local_size, 8);
4580 /* Add padding */
4581 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4583 size = qemu_get_be64(file);
4585 /* The size of the bitmap should match our ramblock's */
4586 if (size != local_size) {
4587 error_report("%s: ramblock '%s' bitmap size mismatch "
4588 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4589 block->idstr, size, local_size);
4590 ret = -EINVAL;
4591 goto out;
4594 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4595 end_mark = qemu_get_be64(file);
4597 ret = qemu_file_get_error(file);
4598 if (ret || size != local_size) {
4599 error_report("%s: read bitmap failed for ramblock '%s': %d"
4600 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4601 __func__, block->idstr, ret, local_size, size);
4602 ret = -EIO;
4603 goto out;
4606 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4607 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4608 __func__, block->idstr, end_mark);
4609 ret = -EINVAL;
4610 goto out;
4614 * Endianness conversion. We are in postcopy (though paused), so the
4615 * dirty bitmap won't change; we can modify it directly.
4617 bitmap_from_le(block->bmap, le_bitmap, nbits);
4620 * What we received is the "received bitmap". Invert it to obtain the
4621 * initial dirty bitmap for this ramblock.
4623 bitmap_complement(block->bmap, block->bmap, nbits);
4625 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4626 ramblock_dirty_bitmap_clear_discarded_pages(block);
4628 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4629 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4632 * We succeeded in syncing the bitmap for the current ramblock. If this
4633 * is the last one to sync, we need to notify the main send thread.
4635 ram_dirty_bitmap_reload_notify(s);
4637 ret = 0;
4638 out:
4639 g_free(le_bitmap);
4640 return ret;
4643 static int ram_resume_prepare(MigrationState *s, void *opaque)
4645 RAMState *rs = *(RAMState **)opaque;
4646 int ret;
4648 ret = ram_dirty_bitmap_sync_all(s, rs);
4649 if (ret) {
4650 return ret;
4653 ram_state_resume_prepare(rs, s->to_dst_file);
4655 return 0;
4658 void postcopy_preempt_shutdown_file(MigrationState *s)
4660 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4661 qemu_fflush(s->postcopy_qemufile_src);
4664 static SaveVMHandlers savevm_ram_handlers = {
4665 .save_setup = ram_save_setup,
4666 .save_live_iterate = ram_save_iterate,
4667 .save_live_complete_postcopy = ram_save_complete,
4668 .save_live_complete_precopy = ram_save_complete,
4669 .has_postcopy = ram_has_postcopy,
4670 .state_pending_exact = ram_state_pending_exact,
4671 .state_pending_estimate = ram_state_pending_estimate,
4672 .load_state = ram_load,
4673 .save_cleanup = ram_save_cleanup,
4674 .load_setup = ram_load_setup,
4675 .load_cleanup = ram_load_cleanup,
4676 .resume_prepare = ram_resume_prepare,
4679 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4680 size_t old_size, size_t new_size)
4682 PostcopyState ps = postcopy_state_get();
4683 ram_addr_t offset;
4684 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4685 Error *err = NULL;
4687 if (ramblock_is_ignored(rb)) {
4688 return;
4691 if (!migration_is_idle()) {
4693 * Precopy code on the source cannot deal with the size of RAM blocks
4694 * changing at random points in time - especially after sending the
4695 * RAM block sizes in the migration stream, they must no longer change.
4696 * Abort and indicate a proper reason.
4698 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4699 migration_cancel(err);
4700 error_free(err);
4703 switch (ps) {
4704 case POSTCOPY_INCOMING_ADVISE:
4706 * Update what ram_postcopy_incoming_init()->init_range() does at the
4707 * time postcopy was advised. Syncing RAM blocks with the source will
4708 * result in RAM resizes.
4710 if (old_size < new_size) {
4711 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4712 error_report("RAM block '%s' discard of resized RAM failed",
4713 rb->idstr);
4716 rb->postcopy_length = new_size;
4717 break;
4718 case POSTCOPY_INCOMING_NONE:
4719 case POSTCOPY_INCOMING_RUNNING:
4720 case POSTCOPY_INCOMING_END:
4722 * Once our guest is running, postcopy no longer cares about
4723 * resizes. When growing, the new memory was not available on the
4724 * source, so no handling is needed.
4726 break;
4727 default:
4728 error_report("RAM block '%s' resized during postcopy state: %d",
4729 rb->idstr, ps);
4730 exit(-1);
4734 static RAMBlockNotifier ram_mig_ram_notifier = {
4735 .ram_block_resized = ram_mig_ram_block_resized,
4738 void ram_mig_init(void)
4740 qemu_mutex_init(&XBZRLE.lock);
4741 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4742 ram_block_notifier_add(&ram_mig_ram_notifier);