migration: Send requested page directly in rp-return thread
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
67 /***********************************************************/
68 /* ram save/restore */
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71  * worked for pages that were filled with the same char.  We switched
72  * it to only search for the zero value, and renamed it to avoid
73  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO 0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE 0x08
80 #define RAM_SAVE_FLAG_EOS 0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE 0x40
83 /* 0x80 is reserved in migration.h start with 0x100 next */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
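/*
 * Illustration (not part of the protocol definition itself): these flags
 * ride in the low bits of the page offset written by save_page_header().
 * With 4 KiB target pages every offset is a multiple of 0x1000, so the
 * low bits are free.  A zero page at offset 0x2000 of the block announced
 * by the previous page would have its page header sent as the be64 value
 * 0x2000 | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE == 0x2022.
 */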
86 XBZRLECacheStats xbzrle_counters;
88 /* used by the search for pages to send */
89 struct PageSearchStatus {
90 /* The migration channel used for a specific host page */
91 QEMUFile *pss_channel;
92 /* Last block from where we have sent data */
93 RAMBlock *last_sent_block;
94 /* Current block being searched */
95 RAMBlock *block;
96 /* Current page to search from */
97 unsigned long page;
98 /* Set once we wrap around */
99 bool complete_round;
101 * [POSTCOPY-ONLY] Whether current page is explicitly requested by
102 * postcopy. When set, the request is "urgent" because the dest QEMU
103 * threads are waiting for us.
105 bool postcopy_requested;
107 * [POSTCOPY-ONLY] The target channel to use to send current page.
109  * Note: This may _not_ match the value of postcopy_requested
110  * above. Imagine the case where the postcopy request is exactly
111  * the page that we're in the middle of sending during precopy. In this
112  * case we'll have postcopy_requested set to true, but the target channel
113  * will be the precopy channel (so that we don't split that specific
114  * page across channels, since the precopy channel already holds part
115  * of that page's data).
117 * Besides that specific use case, postcopy_target_channel should
118 * always be equal to postcopy_requested, because by default we send
119 * postcopy pages via postcopy preempt channel.
121 bool postcopy_target_channel;
122 /* Whether we're sending a host page */
123 bool host_page_sending;
124 /* The start/end of current host page. Invalid if host_page_sending==false */
125 unsigned long host_page_start;
126 unsigned long host_page_end;
128 typedef struct PageSearchStatus PageSearchStatus;
130 /* struct contains XBZRLE cache and a static page
131 used by the compression */
132 static struct {
133 /* buffer used for XBZRLE encoding */
134 uint8_t *encoded_buf;
135 /* buffer for storing page content */
136 uint8_t *current_buf;
137 /* Cache for XBZRLE, Protected by lock. */
138 PageCache *cache;
139 QemuMutex lock;
140 /* it will store a page full of zeros */
141 uint8_t *zero_target_page;
142 /* buffer used for XBZRLE decoding */
143 uint8_t *decoded_buf;
144 } XBZRLE;
146 static void XBZRLE_cache_lock(void)
148 if (migrate_use_xbzrle()) {
149 qemu_mutex_lock(&XBZRLE.lock);
153 static void XBZRLE_cache_unlock(void)
155 if (migrate_use_xbzrle()) {
156 qemu_mutex_unlock(&XBZRLE.lock);
161 * xbzrle_cache_resize: resize the xbzrle cache
163  * This function is called from migrate_params_apply in the main
164  * thread, possibly while a migration is in progress. A running
165  * migration may be using the cache and might finish during this call,
166  * hence changes to the cache are protected by XBZRLE.lock.
168 * Returns 0 for success or -1 for error
170 * @new_size: new cache size
171  * @errp: set to the failure reason if the check fails
173 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
175 PageCache *new_cache;
176 int64_t ret = 0;
178 /* Check for truncation */
179 if (new_size != (size_t)new_size) {
180 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
181 "exceeding address space");
182 return -1;
185 if (new_size == migrate_xbzrle_cache_size()) {
186 /* nothing to do */
187 return 0;
190 XBZRLE_cache_lock();
192 if (XBZRLE.cache != NULL) {
193 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
194 if (!new_cache) {
195 ret = -1;
196 goto out;
199 cache_fini(XBZRLE.cache);
200 XBZRLE.cache = new_cache;
202 out:
203 XBZRLE_cache_unlock();
204 return ret;
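/*
 * Illustrative usage sketch (assumed caller, not code from this file):
 * a parameter-setting path such as migrate_params_apply() would do
 * roughly:
 *
 *     Error *err = NULL;
 *     if (xbzrle_cache_resize(new_size, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * e.g. on a 32-bit build a new_size of 8 GiB fails the (size_t)
 * truncation check above and returns -1 with *errp set.
 */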
207 static bool postcopy_preempt_active(void)
209 return migrate_postcopy_preempt() && migration_in_postcopy();
212 bool ramblock_is_ignored(RAMBlock *block)
214 return !qemu_ram_is_migratable(block) ||
215 (migrate_ignore_shared() && qemu_ram_is_shared(block));
218 #undef RAMBLOCK_FOREACH
220 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
222 RAMBlock *block;
223 int ret = 0;
225 RCU_READ_LOCK_GUARD();
227 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
228 ret = func(block, opaque);
229 if (ret) {
230 break;
233 return ret;
236 static void ramblock_recv_map_init(void)
238 RAMBlock *rb;
240 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
241 assert(!rb->receivedmap);
242 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
246 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
248 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
249 rb->receivedmap);
252 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
254 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
257 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
259 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
262 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
263 size_t nr)
265 bitmap_set_atomic(rb->receivedmap,
266 ramblock_recv_bitmap_offset(host_addr, rb),
267 nr);
270 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
273 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
275  * Returns the number of bytes sent (>0) on success, or <0 on error.
277 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
278 const char *block_name)
280 RAMBlock *block = qemu_ram_block_by_name(block_name);
281 unsigned long *le_bitmap, nbits;
282 uint64_t size;
284 if (!block) {
285 error_report("%s: invalid block name: %s", __func__, block_name);
286 return -1;
289 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
292 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
293 * machines we may need 4 more bytes for padding (see below
294  * comment). So extend it a bit beforehand.
296 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
299 * Always use little endian when sending the bitmap. This is
300  * required when the source and destination VMs are not using the
301 * same endianness. (Note: big endian won't work.)
303 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
305 /* Size of the bitmap, in bytes */
306 size = DIV_ROUND_UP(nbits, 8);
309 * size is always aligned to 8 bytes for 64bit machines, but it
310 * may not be true for 32bit machines. We need this padding to
311 * make sure the migration can survive even between 32bit and
312 * 64bit machines.
314 size = ROUND_UP(size, 8);
316 qemu_put_be64(file, size);
317 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
319 * Mark as an end, in case the middle part is screwed up due to
320 * some "mysterious" reason.
322 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
323 qemu_fflush(file);
325 g_free(le_bitmap);
327 if (qemu_file_get_error(file)) {
328 return qemu_file_get_error(file);
331 return size + sizeof(size);
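/*
 * For illustration, the wire layout produced above is:
 *
 *   be64  size                        (bitmap bytes, rounded up to 8)
 *   u8    le_bitmap[size]             (little-endian receive bitmap)
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING (sanity marker)
 *
 * e.g. a block of 100 target pages needs DIV_ROUND_UP(100, 8) = 13
 * bitmap bytes, padded up to size = 16, and the function reports
 * 16 + sizeof(size) = 24 bytes sent.
 */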
335 * An outstanding page request, on the source, having been received
336 * and queued
338 struct RAMSrcPageRequest {
339 RAMBlock *rb;
340 hwaddr offset;
341 hwaddr len;
343 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
346 typedef struct {
348 * Cached ramblock/offset values if preempted. They're only meaningful if
349 * preempted==true below.
351 RAMBlock *ram_block;
352 unsigned long ram_page;
354 * Whether a postcopy preemption just happened. Will be reset after
355 * precopy recovered to background migration.
357 bool preempted;
358 } PostcopyPreemptState;
360 /* State of RAM for migration */
361 struct RAMState {
362 /* QEMUFile used for this migration */
363 QEMUFile *f;
365  * PageSearchStatus structures for the channels when sending pages.
366 * Protected by the bitmap_mutex.
368 PageSearchStatus pss[RAM_CHANNEL_MAX];
369 /* UFFD file descriptor, used in 'write-tracking' migration */
370 int uffdio_fd;
371 /* Last block that we have visited searching for dirty pages */
372 RAMBlock *last_seen_block;
373 /* Last dirty target page we have sent */
374 ram_addr_t last_page;
375 /* last ram version we have seen */
376 uint32_t last_version;
377 /* How many times we have dirty too many pages */
378 int dirty_rate_high_cnt;
379 /* these variables are used for bitmap sync */
380 /* last time we did a full bitmap_sync */
381 int64_t time_last_bitmap_sync;
382 /* bytes transferred at start_time */
383 uint64_t bytes_xfer_prev;
384 /* number of dirty pages since start_time */
385 uint64_t num_dirty_pages_period;
386 /* xbzrle misses since the beginning of the period */
387 uint64_t xbzrle_cache_miss_prev;
388 /* Amount of xbzrle pages since the beginning of the period */
389 uint64_t xbzrle_pages_prev;
390 /* Amount of xbzrle encoded bytes since the beginning of the period */
391 uint64_t xbzrle_bytes_prev;
392 /* Start using XBZRLE (e.g., after the first round). */
393 bool xbzrle_enabled;
394 /* Are we on the last stage of migration */
395 bool last_stage;
396 /* compression statistics since the beginning of the period */
397     /* number of times there was no free thread to compress data */
398 uint64_t compress_thread_busy_prev;
399     /* amount of bytes after compression */
400 uint64_t compressed_size_prev;
401 /* amount of compressed pages */
402 uint64_t compress_pages_prev;
404 /* total handled target pages at the beginning of period */
405 uint64_t target_page_count_prev;
406 /* total handled target pages since start */
407 uint64_t target_page_count;
408 /* number of dirty bits in the bitmap */
409 uint64_t migration_dirty_pages;
411 * Protects:
412 * - dirty/clear bitmap
413 * - migration_dirty_pages
414 * - pss structures
416 QemuMutex bitmap_mutex;
417 /* The RAMBlock used in the last src_page_requests */
418 RAMBlock *last_req_rb;
419 /* Queue of outstanding page requests from the destination */
420 QemuMutex src_page_req_mutex;
421 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
423     /* Postcopy preemption information */
424 PostcopyPreemptState postcopy_preempt_state;
426 * Current channel we're using on src VM. Only valid if postcopy-preempt
427 * is enabled.
429 unsigned int postcopy_channel;
431 typedef struct RAMState RAMState;
433 static RAMState *ram_state;
435 static NotifierWithReturnList precopy_notifier_list;
437 static void postcopy_preempt_reset(RAMState *rs)
439 memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
442 /* Whether postcopy has queued requests? */
443 static bool postcopy_has_request(RAMState *rs)
445 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
448 void precopy_infrastructure_init(void)
450 notifier_with_return_list_init(&precopy_notifier_list);
453 void precopy_add_notifier(NotifierWithReturn *n)
455 notifier_with_return_list_add(&precopy_notifier_list, n);
458 void precopy_remove_notifier(NotifierWithReturn *n)
460 notifier_with_return_remove(n);
463 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
465 PrecopyNotifyData pnd;
466 pnd.reason = reason;
467 pnd.errp = errp;
469 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
472 uint64_t ram_bytes_remaining(void)
474 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
479 * NOTE: not all stats in ram_counters are used in reality. See comments
480 * for struct MigrationAtomicStats. The ultimate result of ram migration
481 * counters will be a merged version with both ram_counters and the atomic
482 * fields in ram_atomic_counters.
484 MigrationStats ram_counters;
485 MigrationAtomicStats ram_atomic_counters;
487 void ram_transferred_add(uint64_t bytes)
489 if (runstate_is_running()) {
490 ram_counters.precopy_bytes += bytes;
491 } else if (migration_in_postcopy()) {
492 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
493 } else {
494 ram_counters.downtime_bytes += bytes;
496 stat64_add(&ram_atomic_counters.transferred, bytes);
499 void dirty_sync_missed_zero_copy(void)
501 ram_counters.dirty_sync_missed_zero_copy++;
504 CompressionStats compression_counters;
506 struct CompressParam {
507 bool done;
508 bool quit;
509 bool zero_page;
510 QEMUFile *file;
511 QemuMutex mutex;
512 QemuCond cond;
513 RAMBlock *block;
514 ram_addr_t offset;
516 /* internally used fields */
517 z_stream stream;
518 uint8_t *originbuf;
520 typedef struct CompressParam CompressParam;
522 struct DecompressParam {
523 bool done;
524 bool quit;
525 QemuMutex mutex;
526 QemuCond cond;
527 void *des;
528 uint8_t *compbuf;
529 int len;
530 z_stream stream;
532 typedef struct DecompressParam DecompressParam;
534 static CompressParam *comp_param;
535 static QemuThread *compress_threads;
536 /* comp_done_cond is used to wake up the migration thread when
537 * one of the compression threads has finished the compression.
538  * comp_done_lock is used together with comp_done_cond.
540 static QemuMutex comp_done_lock;
541 static QemuCond comp_done_cond;
543 static QEMUFile *decomp_file;
544 static DecompressParam *decomp_param;
545 static QemuThread *decompress_threads;
546 static QemuMutex decomp_done_lock;
547 static QemuCond decomp_done_cond;
549 static int ram_save_host_page_urgent(PageSearchStatus *pss);
551 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
552 ram_addr_t offset, uint8_t *source_buf);
554 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
555 bool postcopy_requested);
557 /* NOTE: page is the PFN not real ram_addr_t. */
558 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
560 pss->block = rb;
561 pss->page = page;
562 pss->complete_round = false;
566 * Check whether two PSSs are actively sending the same page. Return true
567 * if it is, false otherwise.
569 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
571 return pss1->host_page_sending && pss2->host_page_sending &&
572 (pss1->host_page_start == pss2->host_page_start);
575 static void *do_data_compress(void *opaque)
577 CompressParam *param = opaque;
578 RAMBlock *block;
579 ram_addr_t offset;
580 bool zero_page;
582 qemu_mutex_lock(&param->mutex);
583 while (!param->quit) {
584 if (param->block) {
585 block = param->block;
586 offset = param->offset;
587 param->block = NULL;
588 qemu_mutex_unlock(&param->mutex);
590 zero_page = do_compress_ram_page(param->file, &param->stream,
591 block, offset, param->originbuf);
593 qemu_mutex_lock(&comp_done_lock);
594 param->done = true;
595 param->zero_page = zero_page;
596 qemu_cond_signal(&comp_done_cond);
597 qemu_mutex_unlock(&comp_done_lock);
599 qemu_mutex_lock(&param->mutex);
600 } else {
601 qemu_cond_wait(&param->cond, &param->mutex);
604 qemu_mutex_unlock(&param->mutex);
606 return NULL;
609 static void compress_threads_save_cleanup(void)
611 int i, thread_count;
613 if (!migrate_use_compression() || !comp_param) {
614 return;
617 thread_count = migrate_compress_threads();
618 for (i = 0; i < thread_count; i++) {
620          * we use it as an indicator of whether the thread is
621          * properly initialized or not
623 if (!comp_param[i].file) {
624 break;
627 qemu_mutex_lock(&comp_param[i].mutex);
628 comp_param[i].quit = true;
629 qemu_cond_signal(&comp_param[i].cond);
630 qemu_mutex_unlock(&comp_param[i].mutex);
632 qemu_thread_join(compress_threads + i);
633 qemu_mutex_destroy(&comp_param[i].mutex);
634 qemu_cond_destroy(&comp_param[i].cond);
635 deflateEnd(&comp_param[i].stream);
636 g_free(comp_param[i].originbuf);
637 qemu_fclose(comp_param[i].file);
638 comp_param[i].file = NULL;
640 qemu_mutex_destroy(&comp_done_lock);
641 qemu_cond_destroy(&comp_done_cond);
642 g_free(compress_threads);
643 g_free(comp_param);
644 compress_threads = NULL;
645 comp_param = NULL;
648 static int compress_threads_save_setup(void)
650 int i, thread_count;
652 if (!migrate_use_compression()) {
653 return 0;
655 thread_count = migrate_compress_threads();
656 compress_threads = g_new0(QemuThread, thread_count);
657 comp_param = g_new0(CompressParam, thread_count);
658 qemu_cond_init(&comp_done_cond);
659 qemu_mutex_init(&comp_done_lock);
660 for (i = 0; i < thread_count; i++) {
661 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
662 if (!comp_param[i].originbuf) {
663 goto exit;
666 if (deflateInit(&comp_param[i].stream,
667 migrate_compress_level()) != Z_OK) {
668 g_free(comp_param[i].originbuf);
669 goto exit;
672 /* comp_param[i].file is just used as a dummy buffer to save data,
673 * set its ops to empty.
675 comp_param[i].file = qemu_file_new_output(
676 QIO_CHANNEL(qio_channel_null_new()));
677 comp_param[i].done = true;
678 comp_param[i].quit = false;
679 qemu_mutex_init(&comp_param[i].mutex);
680 qemu_cond_init(&comp_param[i].cond);
681 qemu_thread_create(compress_threads + i, "compress",
682 do_data_compress, comp_param + i,
683 QEMU_THREAD_JOINABLE);
685 return 0;
687 exit:
688 compress_threads_save_cleanup();
689 return -1;
693 * save_page_header: write page header to wire
695 * If this is the 1st block, it also writes the block identification
697 * Returns the number of bytes written
699 * @pss: current PSS channel status
700 * @block: block that contains the page we want to send
701  * @offset: offset inside the block for the page;
702  *          the lower bits contain flags
704 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
705 ram_addr_t offset)
707 size_t size, len;
708 bool same_block = (block == pss->last_sent_block);
709 QEMUFile *f = pss->pss_channel;
711 if (same_block) {
712 offset |= RAM_SAVE_FLAG_CONTINUE;
714 qemu_put_be64(f, offset);
715 size = 8;
717 if (!same_block) {
718 len = strlen(block->idstr);
719 qemu_put_byte(f, len);
720 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
721 size += 1 + len;
722 pss->last_sent_block = block;
724 return size;
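/*
 * For illustration, the header written above takes one of two shapes:
 *
 *   same block as the last page:  be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)
 *   new block:                    be64(offset | flags), u8 len, idstr[len]
 *
 * so the returned size is either 8 or 9 + strlen(block->idstr).
 */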
728 * mig_throttle_guest_down: throttle down the guest
730 * Reduce amount of guest cpu execution to hopefully slow down memory
731 * writes. If guest dirty memory rate is reduced below the rate at
732 * which we can transfer pages to the destination then we should be
733 * able to complete migration. Some workloads dirty memory way too
734 * fast and will not effectively converge, even with auto-converge.
736 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
737 uint64_t bytes_dirty_threshold)
739 MigrationState *s = migrate_get_current();
740 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
741 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
742 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
743 int pct_max = s->parameters.max_cpu_throttle;
745 uint64_t throttle_now = cpu_throttle_get_percentage();
746 uint64_t cpu_now, cpu_ideal, throttle_inc;
748 /* We have not started throttling yet. Let's start it. */
749 if (!cpu_throttle_active()) {
750 cpu_throttle_set(pct_initial);
751 } else {
752 /* Throttling already on, just increase the rate */
753 if (!pct_tailslow) {
754 throttle_inc = pct_increment;
755 } else {
756 /* Compute the ideal CPU percentage used by Guest, which may
757 * make the dirty rate match the dirty rate threshold. */
758 cpu_now = 100 - throttle_now;
759 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
760 bytes_dirty_period);
761 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
763 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
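/*
 * Worked example of the tailslow branch above (illustrative numbers):
 * with the throttle currently at 20%, the guest keeps cpu_now = 80%.
 * If bytes_dirty_threshold is half of bytes_dirty_period,
 * cpu_ideal = 80 * 0.5 = 40, the wanted cut is 80 - 40 = 40, clamped
 * to cpu_throttle_increment (say 10), giving a new throttle of 30%
 * (further capped by max_cpu_throttle).
 */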
767 void mig_throttle_counter_reset(void)
769 RAMState *rs = ram_state;
771 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
772 rs->num_dirty_pages_period = 0;
773 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
777 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
779 * @rs: current RAM state
780 * @current_addr: address for the zero page
782 * Update the xbzrle cache to reflect a page that's been sent as all 0.
783 * The important thing is that a stale (not-yet-0'd) page be replaced
784 * by the new data.
785 * As a bonus, if the page wasn't in the cache it gets added so that
786 * when a small write is made into the 0'd page it gets XBZRLE sent.
788 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
790 /* We don't care if this fails to allocate a new cache page
791 * as long as it updated an old one */
792 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
793 ram_counters.dirty_sync_count);
796 #define ENCODING_FLAG_XBZRLE 0x1
799 * save_xbzrle_page: compress and send current page
801 * Returns: 1 means that we wrote the page
802 * 0 means that page is identical to the one already sent
803 * -1 means that xbzrle would be longer than normal
805 * @rs: current RAM state
806 * @pss: current PSS channel
807 * @current_data: pointer to the address of the page contents
808 * @current_addr: addr of the page
809 * @block: block that contains the page we want to send
810 * @offset: offset inside the block for the page
812 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
813 uint8_t **current_data, ram_addr_t current_addr,
814 RAMBlock *block, ram_addr_t offset)
816 int encoded_len = 0, bytes_xbzrle;
817 uint8_t *prev_cached_page;
818 QEMUFile *file = pss->pss_channel;
820 if (!cache_is_cached(XBZRLE.cache, current_addr,
821 ram_counters.dirty_sync_count)) {
822 xbzrle_counters.cache_miss++;
823 if (!rs->last_stage) {
824 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
825 ram_counters.dirty_sync_count) == -1) {
826 return -1;
827 } else {
828 /* update *current_data when the page has been
829 inserted into cache */
830 *current_data = get_cached_data(XBZRLE.cache, current_addr);
833 return -1;
837 * Reaching here means the page has hit the xbzrle cache, no matter what
838 * encoding result it is (normal encoding, overflow or skipping the page),
839 * count the page as encoded. This is used to calculate the encoding rate.
841 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
842 * 2nd page turns out to be skipped (i.e. no new bytes written to the
843 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
844 * skipped page included. In this way, the encoding rate can tell if the
845 * guest page is good for xbzrle encoding.
847 xbzrle_counters.pages++;
848 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
850 /* save current buffer into memory */
851 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
853 /* XBZRLE encoding (if there is no overflow) */
854 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
855 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
856 TARGET_PAGE_SIZE);
859 * Update the cache contents, so that it corresponds to the data
860 * sent, in all cases except where we skip the page.
862 if (!rs->last_stage && encoded_len != 0) {
863 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
865 * In the case where we couldn't compress, ensure that the caller
866 * sends the data from the cache, since the guest might have
867 * changed the RAM since we copied it.
869 *current_data = prev_cached_page;
872 if (encoded_len == 0) {
873 trace_save_xbzrle_page_skipping();
874 return 0;
875 } else if (encoded_len == -1) {
876 trace_save_xbzrle_page_overflow();
877 xbzrle_counters.overflow++;
878 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
879 return -1;
882 /* Send XBZRLE based compressed page */
883 bytes_xbzrle = save_page_header(pss, block,
884 offset | RAM_SAVE_FLAG_XBZRLE);
885 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
886 qemu_put_be16(file, encoded_len);
887 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
888 bytes_xbzrle += encoded_len + 1 + 2;
890 * Like compressed_size (please see update_compress_thread_counts),
891 * the xbzrle encoded bytes don't count the 8 byte header with
892 * RAM_SAVE_FLAG_CONTINUE.
894 xbzrle_counters.bytes += bytes_xbzrle - 8;
895 ram_transferred_add(bytes_xbzrle);
897 return 1;
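/*
 * Byte accounting example for the XBZRLE path above (illustrative): for
 * a RAM_SAVE_FLAG_CONTINUE page whose delta encodes to 100 bytes,
 * bytes_xbzrle = 8 (header) + 1 (ENCODING_FLAG_XBZRLE) + 2 (be16 length)
 * + 100 = 111, and xbzrle_counters.bytes grows by 111 - 8 = 103, i.e.
 * the page header is excluded from the encoding-rate statistics.
 */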
901 * pss_find_next_dirty: find the next dirty page of current ramblock
903 * This function updates pss->page to point to the next dirty page index
904 * within the ramblock to migrate, or the end of ramblock when nothing
905 * found. Note that when pss->host_page_sending==true it means we're
906  * in the middle of sending a host page, so we won't look for dirty pages
907 * outside the host page boundary.
909 * @pss: the current page search status
911 static void pss_find_next_dirty(PageSearchStatus *pss)
913 RAMBlock *rb = pss->block;
914 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
915 unsigned long *bitmap = rb->bmap;
917 if (ramblock_is_ignored(rb)) {
918 /* Points directly to the end, so we know no dirty page */
919 pss->page = size;
920 return;
924      * If we are sending a host page, only look for dirty pages within the
925      * current host page being sent.
927 if (pss->host_page_sending) {
928 assert(pss->host_page_end);
929 size = MIN(size, pss->host_page_end);
932 pss->page = find_next_bit(bitmap, size, pss->page);
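/*
 * Example of the clamp above (illustrative, assuming 4 KiB target pages
 * backed by a 2 MiB hugetlb block): one host page spans 512 target
 * pages, so while host_page_sending is set the bitmap search never
 * walks past host_page_end, i.e. it stays within the 512 target pages
 * of the host page currently being sent.
 */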
935 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
936 unsigned long page)
938 uint8_t shift;
939 hwaddr size, start;
941 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
942 return;
945 shift = rb->clear_bmap_shift;
947 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
948      * can make things easier sometimes since the start address
949      * of the small chunk will always be aligned to 64 pages, so the
950 * bitmap will always be aligned to unsigned long. We should
951 * even be able to remove this restriction but I'm simply
952 * keeping it.
954 assert(shift >= 6);
956 size = 1ULL << (TARGET_PAGE_BITS + shift);
957 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
958 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
959 memory_region_clear_dirty_bitmap(rb->mr, start, size);
962 static void
963 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
964 unsigned long start,
965 unsigned long npages)
967 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
968 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
969 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
972 * Clear pages from start to start + npages - 1, so the end boundary is
973 * exclusive.
975 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
976 migration_clear_memory_region_dirty_bitmap(rb, i);
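/*
 * Worked example for the chunk alignment above (illustrative, assuming
 * clear_bmap_shift == 18, i.e. one clear chunk per 262144 target pages):
 * clearing start = 300000, npages = 10 yields chunk_start = 262144 and
 * chunk_end = 524288, so exactly one chunk (the one containing the
 * requested range) has its dirty bitmap lazily cleared.
 */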
981  * colo_bitmap_find_dirty: find contiguous dirty pages from start
983  * Returns the page offset within the memory region of the start of the contiguous
984  * dirty pages
986 * @rs: current RAM state
987 * @rb: RAMBlock where to search for dirty pages
988 * @start: page where we start the search
989 * @num: the number of contiguous dirty pages
991 static inline
992 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
993 unsigned long start, unsigned long *num)
995 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
996 unsigned long *bitmap = rb->bmap;
997 unsigned long first, next;
999 *num = 0;
1001 if (ramblock_is_ignored(rb)) {
1002 return size;
1005 first = find_next_bit(bitmap, size, start);
1006 if (first >= size) {
1007 return first;
1009 next = find_next_zero_bit(bitmap, size, first + 1);
1010 assert(next >= first);
1011 *num = next - first;
1012 return first;
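/*
 * Illustrative example for the lookup above: if pages 5..8 of the block
 * are dirty and @start is 3, find_next_bit() returns first = 5 and
 * find_next_zero_bit() returns 9, so *num = 4 and the caller can treat
 * [5, 9) as one contiguous dirty run.
 */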
1015 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1016 RAMBlock *rb,
1017 unsigned long page)
1019 bool ret;
1022 * Clear dirty bitmap if needed. This _must_ be called before we
1023      * send any of the pages in the chunk, because we need to make sure
1024      * we can capture further page content changes when we sync the dirty
1025      * log the next time. So as long as we are going to send any of
1026      * the pages in the chunk, we clear the remote dirty bitmap for all.
1027 * Clearing it earlier won't be a problem, but too late will.
1029 migration_clear_memory_region_dirty_bitmap(rb, page);
1031 ret = test_and_clear_bit(page, rb->bmap);
1032 if (ret) {
1033 rs->migration_dirty_pages--;
1036 return ret;
1039 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1040 void *opaque)
1042 const hwaddr offset = section->offset_within_region;
1043 const hwaddr size = int128_get64(section->size);
1044 const unsigned long start = offset >> TARGET_PAGE_BITS;
1045 const unsigned long npages = size >> TARGET_PAGE_BITS;
1046 RAMBlock *rb = section->mr->ram_block;
1047 uint64_t *cleared_bits = opaque;
1050 * We don't grab ram_state->bitmap_mutex because we expect to run
1051 * only when starting migration or during postcopy recovery where
1052 * we don't have concurrent access.
1054 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1055 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1057 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1058 bitmap_clear(rb->bmap, start, npages);
1062 * Exclude all dirty pages from migration that fall into a discarded range as
1063 * managed by a RamDiscardManager responsible for the mapped memory region of
1064 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1066 * Discarded pages ("logically unplugged") have undefined content and must
1067 * not get migrated, because even reading these pages for migration might
1068 * result in undesired behavior.
1070 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1072 * Note: The result is only stable while migrating (precopy/postcopy).
1074 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1076 uint64_t cleared_bits = 0;
1078 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1079 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1080 MemoryRegionSection section = {
1081 .mr = rb->mr,
1082 .offset_within_region = 0,
1083 .size = int128_make64(qemu_ram_get_used_length(rb)),
1086 ram_discard_manager_replay_discarded(rdm, &section,
1087 dirty_bitmap_clear_section,
1088 &cleared_bits);
1090 return cleared_bits;
1094 * Check if a host-page aligned page falls into a discarded range as managed by
1095 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1097 * Note: The result is only stable while migrating (precopy/postcopy).
1099 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1101 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1102 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1103 MemoryRegionSection section = {
1104 .mr = rb->mr,
1105 .offset_within_region = start,
1106 .size = int128_make64(qemu_ram_pagesize(rb)),
1109 return !ram_discard_manager_is_populated(rdm, &section);
1111 return false;
1114 /* Called with RCU critical section */
1115 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1117 uint64_t new_dirty_pages =
1118 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1120 rs->migration_dirty_pages += new_dirty_pages;
1121 rs->num_dirty_pages_period += new_dirty_pages;
1125 * ram_pagesize_summary: calculate all the pagesizes of a VM
1127 * Returns a summary bitmap of the page sizes of all RAMBlocks
1129 * For VMs with just normal pages this is equivalent to the host page
1130 * size. If it's got some huge pages then it's the OR of all the
1131 * different page sizes.
1133 uint64_t ram_pagesize_summary(void)
1135 RAMBlock *block;
1136 uint64_t summary = 0;
1138 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1139 summary |= block->page_size;
1142 return summary;
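/*
 * Example (illustrative): a guest backed by ordinary 4 KiB pages plus
 * one 2 MiB hugetlb RAMBlock yields summary = 0x1000 | 0x200000 =
 * 0x201000, i.e. the OR of every distinct page size in use.
 */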
1145 uint64_t ram_get_total_transferred_pages(void)
1147 return stat64_get(&ram_atomic_counters.normal) +
1148 stat64_get(&ram_atomic_counters.duplicate) +
1149 compression_counters.pages + xbzrle_counters.pages;
1152 static void migration_update_rates(RAMState *rs, int64_t end_time)
1154 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1155 double compressed_size;
1157 /* calculate period counters */
1158 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1159 / (end_time - rs->time_last_bitmap_sync);
1161 if (!page_count) {
1162 return;
1165 if (migrate_use_xbzrle()) {
1166 double encoded_size, unencoded_size;
1168 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1169 rs->xbzrle_cache_miss_prev) / page_count;
1170 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1171 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1172 TARGET_PAGE_SIZE;
1173 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1174 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1175 xbzrle_counters.encoding_rate = 0;
1176 } else {
1177 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1179 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1180 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1183 if (migrate_use_compression()) {
1184 compression_counters.busy_rate = (double)(compression_counters.busy -
1185 rs->compress_thread_busy_prev) / page_count;
1186 rs->compress_thread_busy_prev = compression_counters.busy;
1188 compressed_size = compression_counters.compressed_size -
1189 rs->compressed_size_prev;
1190 if (compressed_size) {
1191 double uncompressed_size = (compression_counters.pages -
1192 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1194 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1195 compression_counters.compression_rate =
1196 uncompressed_size / compressed_size;
1198 rs->compress_pages_prev = compression_counters.pages;
1199 rs->compressed_size_prev = compression_counters.compressed_size;
1204 static void migration_trigger_throttle(RAMState *rs)
1206 MigrationState *s = migrate_get_current();
1207 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1208 uint64_t bytes_xfer_period =
1209 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1210 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1211 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1213 /* During block migration the auto-converge logic incorrectly detects
1214 * that ram migration makes no progress. Avoid this by disabling the
1215 * throttling logic during the bulk phase of block migration. */
1216 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1217 /* The following detection logic can be refined later. For now:
1218 Check to see if the ratio between dirtied bytes and the approx.
1219 amount of bytes that just got transferred since the last time
1220 we were in this routine reaches the threshold. If that happens
1221 twice, start or increase throttling. */
1223 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1224 (++rs->dirty_rate_high_cnt >= 2)) {
1225 trace_migration_throttle();
1226 rs->dirty_rate_high_cnt = 0;
1227 mig_throttle_guest_down(bytes_dirty_period,
1228 bytes_dirty_threshold);
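/*
 * Worked example for the trigger above (illustrative): with
 * throttle_trigger_threshold = 50 and 1 GiB transferred during the last
 * sync period, bytes_dirty_threshold = 512 MiB; if the guest dirties
 * more than that in two such periods, dirty_rate_high_cnt hits 2 and
 * mig_throttle_guest_down() starts (or tightens) the throttling.
 */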
1233 static void migration_bitmap_sync(RAMState *rs)
1235 RAMBlock *block;
1236 int64_t end_time;
1238 ram_counters.dirty_sync_count++;
1240 if (!rs->time_last_bitmap_sync) {
1241 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1244 trace_migration_bitmap_sync_start();
1245 memory_global_dirty_log_sync();
1247 qemu_mutex_lock(&rs->bitmap_mutex);
1248 WITH_RCU_READ_LOCK_GUARD() {
1249 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1250 ramblock_sync_dirty_bitmap(rs, block);
1252 ram_counters.remaining = ram_bytes_remaining();
1254 qemu_mutex_unlock(&rs->bitmap_mutex);
1256 memory_global_after_dirty_log_sync();
1257 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1259 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1261     /* more than 1 second = 1000 milliseconds */
1262 if (end_time > rs->time_last_bitmap_sync + 1000) {
1263 migration_trigger_throttle(rs);
1265 migration_update_rates(rs, end_time);
1267 rs->target_page_count_prev = rs->target_page_count;
1269 /* reset period counters */
1270 rs->time_last_bitmap_sync = end_time;
1271 rs->num_dirty_pages_period = 0;
1272 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
1274 if (migrate_use_events()) {
1275 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1279 static void migration_bitmap_sync_precopy(RAMState *rs)
1281 Error *local_err = NULL;
1284 * The current notifier usage is just an optimization to migration, so we
1285 * don't stop the normal migration process in the error case.
1287 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1288 error_report_err(local_err);
1289 local_err = NULL;
1292 migration_bitmap_sync(rs);
1294 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1295 error_report_err(local_err);
1299 void ram_release_page(const char *rbname, uint64_t offset)
1301 if (!migrate_release_ram() || !migration_in_postcopy()) {
1302 return;
1305 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1309 * save_zero_page_to_file: send the zero page to the file
1311 * Returns the size of data written to the file, 0 means the page is not
1312 * a zero page
1314 * @pss: current PSS channel
1315 * @block: block that contains the page we want to send
1316 * @offset: offset inside the block for the page
1318 static int save_zero_page_to_file(PageSearchStatus *pss,
1319 RAMBlock *block, ram_addr_t offset)
1321 uint8_t *p = block->host + offset;
1322 QEMUFile *file = pss->pss_channel;
1323 int len = 0;
1325 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1326 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
1327 qemu_put_byte(file, 0);
1328 len += 1;
1329 ram_release_page(block->idstr, offset);
1331 return len;
1335 * save_zero_page: send the zero page to the stream
1337 * Returns the number of pages written.
1339 * @pss: current PSS channel
1340 * @block: block that contains the page we want to send
1341 * @offset: offset inside the block for the page
1343 static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
1344 ram_addr_t offset)
1346 int len = save_zero_page_to_file(pss, block, offset);
1348 if (len) {
1349 stat64_add(&ram_atomic_counters.duplicate, 1);
1350 ram_transferred_add(len);
1351 return 1;
1353 return -1;
1357 * @pages: the number of pages written by the control path,
1358 * < 0 - error
1359 * > 0 - number of pages written
1361  * Return true if the page has been saved, otherwise false is returned.
1363 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1364 ram_addr_t offset, int *pages)
1366 uint64_t bytes_xmit = 0;
1367 int ret;
1369 *pages = -1;
1370 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1371 TARGET_PAGE_SIZE, &bytes_xmit);
1372 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1373 return false;
1376 if (bytes_xmit) {
1377 ram_transferred_add(bytes_xmit);
1378 *pages = 1;
1381 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1382 return true;
1385 if (bytes_xmit > 0) {
1386 stat64_add(&ram_atomic_counters.normal, 1);
1387 } else if (bytes_xmit == 0) {
1388 stat64_add(&ram_atomic_counters.duplicate, 1);
1391 return true;
1395 * directly send the page to the stream
1397 * Returns the number of pages written.
1399 * @pss: current PSS channel
1400 * @block: block that contains the page we want to send
1401 * @offset: offset inside the block for the page
1402 * @buf: the page to be sent
1403  * @async: send the page asynchronously
1405 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1406 ram_addr_t offset, uint8_t *buf, bool async)
1408 QEMUFile *file = pss->pss_channel;
1410 ram_transferred_add(save_page_header(pss, block,
1411 offset | RAM_SAVE_FLAG_PAGE));
1412 if (async) {
1413 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1414 migrate_release_ram() &&
1415 migration_in_postcopy());
1416 } else {
1417 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1419 ram_transferred_add(TARGET_PAGE_SIZE);
1420 stat64_add(&ram_atomic_counters.normal, 1);
1421 return 1;
1425 * ram_save_page: send the given page to the stream
1427 * Returns the number of pages written.
1428 * < 0 - error
1429 * >=0 - Number of pages written - this might legally be 0
1430 * if xbzrle noticed the page was the same.
1432 * @rs: current RAM state
1433 * @block: block that contains the page we want to send
1434 * @offset: offset inside the block for the page
1436 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1438 int pages = -1;
1439 uint8_t *p;
1440 bool send_async = true;
1441 RAMBlock *block = pss->block;
1442 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1443 ram_addr_t current_addr = block->offset + offset;
1445 p = block->host + offset;
1446 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1448 XBZRLE_cache_lock();
1449 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1450 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1451 block, offset);
1452 if (!rs->last_stage) {
1453 /* Can't send this cached data async, since the cache page
1454 * might get updated before it gets to the wire
1456 send_async = false;
1460 /* XBZRLE overflow or normal page */
1461 if (pages == -1) {
1462 pages = save_normal_page(pss, block, offset, p, send_async);
1465 XBZRLE_cache_unlock();
1467 return pages;
1470 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1471 ram_addr_t offset)
1473 if (multifd_queue_page(file, block, offset) < 0) {
1474 return -1;
1476 stat64_add(&ram_atomic_counters.normal, 1);
1478 return 1;
1481 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1482 ram_addr_t offset, uint8_t *source_buf)
1484 RAMState *rs = ram_state;
1485 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1486 uint8_t *p = block->host + offset;
1487 int ret;
1489 if (save_zero_page_to_file(pss, block, offset)) {
1490 return true;
1493 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1496      * copy it to an internal buffer to avoid it being modified by the VM,
1497      * so that we can catch errors during compression and
1498 * decompression
1500 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1501 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1502 if (ret < 0) {
1503 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1504 error_report("compressed data failed!");
1506 return false;
1509 static void
1510 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1512 ram_transferred_add(bytes_xmit);
1514 if (param->zero_page) {
1515 stat64_add(&ram_atomic_counters.duplicate, 1);
1516 return;
1519 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1520 compression_counters.compressed_size += bytes_xmit - 8;
1521 compression_counters.pages++;
1524 static bool save_page_use_compression(RAMState *rs);
1526 static void flush_compressed_data(RAMState *rs)
1528 MigrationState *ms = migrate_get_current();
1529 int idx, len, thread_count;
1531 if (!save_page_use_compression(rs)) {
1532 return;
1534 thread_count = migrate_compress_threads();
1536 qemu_mutex_lock(&comp_done_lock);
1537 for (idx = 0; idx < thread_count; idx++) {
1538 while (!comp_param[idx].done) {
1539 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542 qemu_mutex_unlock(&comp_done_lock);
1544 for (idx = 0; idx < thread_count; idx++) {
1545 qemu_mutex_lock(&comp_param[idx].mutex);
1546 if (!comp_param[idx].quit) {
1547 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1549 * it's safe to fetch zero_page without holding comp_done_lock
1550 * as there is no further request submitted to the thread,
1551              * i.e., the thread should be waiting for a request at this point.
1553 update_compress_thread_counts(&comp_param[idx], len);
1555 qemu_mutex_unlock(&comp_param[idx].mutex);
1559 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1560 ram_addr_t offset)
1562 param->block = block;
1563 param->offset = offset;
1566 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1568 int idx, thread_count, bytes_xmit = -1, pages = -1;
1569 bool wait = migrate_compress_wait_thread();
1570 MigrationState *ms = migrate_get_current();
1572 thread_count = migrate_compress_threads();
1573 qemu_mutex_lock(&comp_done_lock);
1574 retry:
1575 for (idx = 0; idx < thread_count; idx++) {
1576 if (comp_param[idx].done) {
1577 comp_param[idx].done = false;
1578 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1579 comp_param[idx].file);
1580 qemu_mutex_lock(&comp_param[idx].mutex);
1581 set_compress_params(&comp_param[idx], block, offset);
1582 qemu_cond_signal(&comp_param[idx].cond);
1583 qemu_mutex_unlock(&comp_param[idx].mutex);
1584 pages = 1;
1585 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1586 break;
1591      * wait for a free thread if the user specifies 'compress-wait-thread',
1592      * otherwise we will post the page out in the main thread as a normal page.
1594 if (pages < 0 && wait) {
1595 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1596 goto retry;
1598 qemu_mutex_unlock(&comp_done_lock);
1600 return pages;
1604 * find_dirty_block: find the next dirty page and update any state
1605 * associated with the search process.
1607 * Returns true if a page is found
1609 * @rs: current RAM state
1610 * @pss: data about the state of the current dirty page scan
1611 * @again: set to false if the search has scanned the whole of RAM
1613 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1616 * This is not a postcopy requested page, mark it "not urgent", and use
1617 * precopy channel to send it.
1619 pss->postcopy_requested = false;
1620 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1622 /* Update pss->page for the next dirty bit in ramblock */
1623 pss_find_next_dirty(pss);
1625 if (pss->complete_round && pss->block == rs->last_seen_block &&
1626 pss->page >= rs->last_page) {
1628 * We've been once around the RAM and haven't found anything.
1629 * Give up.
1631 *again = false;
1632 return false;
1634 if (!offset_in_ramblock(pss->block,
1635 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1636 /* Didn't find anything in this RAM Block */
1637 pss->page = 0;
1638 pss->block = QLIST_NEXT_RCU(pss->block, next);
1639 if (!pss->block) {
1641 * If memory migration starts over, we will meet a dirtied page
1642              * which may still exist in the compression threads' ring, so we
1643 * should flush the compressed data to make sure the new page
1644 * is not overwritten by the old one in the destination.
1646              * Also, if xbzrle is on, stop using the data compression at this
1647 * point. In theory, xbzrle can do better than compression.
1649 flush_compressed_data(rs);
1651 /* Hit the end of the list */
1652 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1653 /* Flag that we've looped */
1654 pss->complete_round = true;
1655 /* After the first round, enable XBZRLE. */
1656 if (migrate_use_xbzrle()) {
1657 rs->xbzrle_enabled = true;
1660 /* Didn't find anything this time, but try again on the new block */
1661 *again = true;
1662 return false;
1663 } else {
1664 /* Can go around again, but... */
1665 *again = true;
1666 /* We've found something so probably don't need to */
1667 return true;
1672 * unqueue_page: gets a page of the queue
1674 * Helper for 'get_queued_page' - gets a page off the queue
1676 * Returns the block of the page (or NULL if none available)
1678 * @rs: current RAM state
1679 * @offset: used to return the offset within the RAMBlock
1681 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1683 struct RAMSrcPageRequest *entry;
1684 RAMBlock *block = NULL;
1686 if (!postcopy_has_request(rs)) {
1687 return NULL;
1690 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1693 * This should _never_ change even after we take the lock, because no one
1694 * should be taking anything off the request list other than us.
1696 assert(postcopy_has_request(rs));
1698 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1699 block = entry->rb;
1700 *offset = entry->offset;
1702 if (entry->len > TARGET_PAGE_SIZE) {
1703 entry->len -= TARGET_PAGE_SIZE;
1704 entry->offset += TARGET_PAGE_SIZE;
1705 } else {
1706 memory_region_unref(block->mr);
1707 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1708 g_free(entry);
1709 migration_consume_urgent_request();
1712 return block;
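/*
 * Illustrative example for the queue handling above (assuming 4 KiB
 * target pages): a single postcopy request covering a 2 MiB huge page
 * is queued as one entry with len = 2 MiB; each call hands back the
 * current offset and shrinks the entry by TARGET_PAGE_SIZE, so the
 * entry is only popped after 512 calls, once the whole range has been
 * consumed.
 */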
1715 #if defined(__linux__)
1717 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1718 * is found, return RAM block pointer and page offset
1720 * Returns pointer to the RAMBlock containing faulting page,
1721 * NULL if no write faults are pending
1723 * @rs: current RAM state
1724 * @offset: page offset from the beginning of the block
1726 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1728 struct uffd_msg uffd_msg;
1729 void *page_address;
1730 RAMBlock *block;
1731 int res;
1733 if (!migrate_background_snapshot()) {
1734 return NULL;
1737 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1738 if (res <= 0) {
1739 return NULL;
1742 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1743 block = qemu_ram_block_from_host(page_address, false, offset);
1744 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1745 return block;
1749 * ram_save_release_protection: release UFFD write protection after
1750 * a range of pages has been saved
1752 * @rs: current RAM state
1753 * @pss: page-search-status structure
1754 * @start_page: index of the first page in the range relative to pss->block
1756 * Returns 0 on success, negative value in case of an error
1758 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1759 unsigned long start_page)
1761 int res = 0;
1763 /* Check if page is from UFFD-managed region. */
1764 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1765 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1766 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1768 /* Flush async buffers before un-protect. */
1769 qemu_fflush(pss->pss_channel);
1770 /* Un-protect memory range. */
1771 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1772 false, false);
1775 return res;
1778 /* ram_write_tracking_available: check if kernel supports required UFFD features
1780 * Returns true if supports, false otherwise
1782 bool ram_write_tracking_available(void)
1784 uint64_t uffd_features;
1785 int res;
1787 res = uffd_query_features(&uffd_features);
1788 return (res == 0 &&
1789 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1792 /* ram_write_tracking_compatible: check if guest configuration is
1793 * compatible with 'write-tracking'
1795 * Returns true if compatible, false otherwise
1797 bool ram_write_tracking_compatible(void)
1799 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1800 int uffd_fd;
1801 RAMBlock *block;
1802 bool ret = false;
1804 /* Open UFFD file descriptor */
1805 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1806 if (uffd_fd < 0) {
1807 return false;
1810 RCU_READ_LOCK_GUARD();
1812 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1813 uint64_t uffd_ioctls;
1815 /* Nothing to do with read-only and MMIO-writable regions */
1816 if (block->mr->readonly || block->mr->rom_device) {
1817 continue;
1819 /* Try to register block memory via UFFD-IO to track writes */
1820 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1821 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1822 goto out;
1824 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1825 goto out;
1828 ret = true;
1830 out:
1831 uffd_close_fd(uffd_fd);
1832 return ret;
1835 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1836 ram_addr_t size)
1839 * We read one byte of each page; this will preallocate page tables if
1840 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1841      * where no page was populated yet. This might require adaptation when
1842 * supporting other mappings, like shmem.
1844 for (; offset < size; offset += block->page_size) {
1845 char tmp = *((char *)block->host + offset);
1847 /* Don't optimize the read out */
1848 asm volatile("" : "+r" (tmp));
1852 static inline int populate_read_section(MemoryRegionSection *section,
1853 void *opaque)
1855 const hwaddr size = int128_get64(section->size);
1856 hwaddr offset = section->offset_within_region;
1857 RAMBlock *block = section->mr->ram_block;
1859 populate_read_range(block, offset, size);
1860 return 0;
1864 * ram_block_populate_read: preallocate page tables and populate pages in the
1865 * RAM block by reading a byte of each page.
1867 * Since it's solely used for userfault_fd WP feature, here we just
1868 * hardcode page size to qemu_real_host_page_size.
1870 * @block: RAM block to populate
1872 static void ram_block_populate_read(RAMBlock *rb)
1875 * Skip populating all pages that fall into a discarded range as managed by
1876 * a RamDiscardManager responsible for the mapped memory region of the
1877 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1878 * must not get populated automatically. We don't have to track
1879 * modifications via userfaultfd WP reliably, because these pages will
1880 * not be part of the migration stream either way -- see
1881 * ramblock_dirty_bitmap_exclude_discarded_pages().
1883 * Note: The result is only stable while migrating (precopy/postcopy).
1885 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1886 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1887 MemoryRegionSection section = {
1888 .mr = rb->mr,
1889 .offset_within_region = 0,
1890 .size = rb->mr->size,
1893 ram_discard_manager_replay_populated(rdm, &section,
1894 populate_read_section, NULL);
1895 } else {
1896 populate_read_range(rb, 0, rb->used_length);
1901 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1903 void ram_write_tracking_prepare(void)
1905 RAMBlock *block;
1907 RCU_READ_LOCK_GUARD();
1909 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1910 /* Nothing to do with read-only and MMIO-writable regions */
1911 if (block->mr->readonly || block->mr->rom_device) {
1912 continue;
1916 * Populate pages of the RAM block before enabling userfault_fd
1917 * write protection.
1919 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1920 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1921 * pages with pte_none() entries in page table.
1923 ram_block_populate_read(block);
1928 * ram_write_tracking_start: start UFFD-WP memory tracking
1930 * Returns 0 for success or negative value in case of error
1932 int ram_write_tracking_start(void)
1934 int uffd_fd;
1935 RAMState *rs = ram_state;
1936 RAMBlock *block;
1938 /* Open UFFD file descriptor */
1939 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1940 if (uffd_fd < 0) {
1941 return uffd_fd;
1943 rs->uffdio_fd = uffd_fd;
1945 RCU_READ_LOCK_GUARD();
1947 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1948 /* Nothing to do with read-only and MMIO-writable regions */
1949 if (block->mr->readonly || block->mr->rom_device) {
1950 continue;
1953 /* Register block memory with UFFD to track writes */
1954 if (uffd_register_memory(rs->uffdio_fd, block->host,
1955 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1956 goto fail;
1958 /* Apply UFFD write protection to the block memory range */
1959 if (uffd_change_protection(rs->uffdio_fd, block->host,
1960 block->max_length, true, false)) {
1961 goto fail;
1963 block->flags |= RAM_UF_WRITEPROTECT;
1964 memory_region_ref(block->mr);
1966 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1967 block->host, block->max_length);
1970 return 0;
1972 fail:
1973 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1975 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1976 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1977 continue;
1980 * In case some memory block failed to be write-protected,
1981 * remove protection and unregister all RAM blocks that succeeded
1983 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1984 false, false);
1985 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1986 /* Cleanup flags and remove reference */
1987 block->flags &= ~RAM_UF_WRITEPROTECT;
1988 memory_region_unref(block->mr);
1991 uffd_close_fd(uffd_fd);
1992 rs->uffdio_fd = -1;
1993 return -1;
1997 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1999 void ram_write_tracking_stop(void)
2001 RAMState *rs = ram_state;
2002 RAMBlock *block;
2004 RCU_READ_LOCK_GUARD();
2006 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2007 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2008 continue;
2010 /* Remove protection and unregister all affected RAM blocks */
2011 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
2012 false, false);
2013 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2015 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2016 block->host, block->max_length);
2018 /* Cleanup flags and remove reference */
2019 block->flags &= ~RAM_UF_WRITEPROTECT;
2020 memory_region_unref(block->mr);
2023 /* Finally close UFFD file descriptor */
2024 uffd_close_fd(rs->uffdio_fd);
2025 rs->uffdio_fd = -1;
2028 #else
2029 /* No target OS support, stubs just fail or ignore */
2031 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2033 (void) rs;
2034 (void) offset;
2036 return NULL;
2039 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2040 unsigned long start_page)
2042 (void) rs;
2043 (void) pss;
2044 (void) start_page;
2046 return 0;
2049 bool ram_write_tracking_available(void)
2051 return false;
2054 bool ram_write_tracking_compatible(void)
2056 assert(0);
2057 return false;
2060 int ram_write_tracking_start(void)
2062 assert(0);
2063 return -1;
2066 void ram_write_tracking_stop(void)
2068 assert(0);
2070 #endif /* defined(__linux__) */
2073 * Check whether two addresses/offsets of the ramblock fall onto the same
2074 * host huge page. Returns true if so, false otherwise.
2076 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2077 uint64_t addr2)
2079 size_t page_size = qemu_ram_pagesize(rb);
2081 addr1 = ROUND_DOWN(addr1, page_size);
2082 addr2 = ROUND_DOWN(addr2, page_size);
2084 return addr1 == addr2;
2088 * Check whether a previously preempted precopy huge page contains the
2089 * currently requested page. Returns true if so, false otherwise.
2091 * This should really happen very rarely, because it means that while we were
2092 * sending during background migration for postcopy, we were sending exactly
2093 * the page that some vcpu faulted on at the dest node. When it happens, we
2094 * probably don't need to do much but drop the request, because we know that
2095 * right after we restore the precopy stream it'll be serviced. It'll slightly
2096 * affect the order in which postcopy requests are serviced (e.g. it'll be the
2097 * same as if we moved the current request to the end of the queue), but it
2098 * shouldn't be a big deal. The most important thing is that we can _never_
2099 * try to send a partially-sent huge page on the POSTCOPY channel again,
2100 * otherwise that huge page will get "split brain" on two channels (PRECOPY, POSTCOPY).
2102 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2103 ram_addr_t offset)
2105 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2107 /* No preemption at all? */
2108 if (!state->preempted) {
2109 return false;
2112 /* Not even the same ramblock? */
2113 if (state->ram_block != block) {
2114 return false;
2117 return offset_on_same_huge_page(block, offset,
2118 state->ram_page << TARGET_PAGE_BITS);
2122 * get_queued_page: unqueue a page from the postcopy requests
2124 * Skips pages that have already been sent (!dirty)
2126 * Returns true if a queued page is found
2128 * @rs: current RAM state
2129 * @pss: data about the state of the current dirty page scan
2131 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2133 RAMBlock *block;
2134 ram_addr_t offset;
2135 bool dirty;
2137 do {
2138 block = unqueue_page(rs, &offset);
2140 * We're sending this page, and since it's postcopy nothing else
2141 * will dirty it, and we must make sure it doesn't get sent again
2142 * even if this queue request was received after the background
2143 * search already sent it.
2145 if (block) {
2146 unsigned long page;
2148 page = offset >> TARGET_PAGE_BITS;
2149 dirty = test_bit(page, block->bmap);
2150 if (!dirty) {
2151 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2152 page);
2153 } else {
2154 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2158 } while (block && !dirty);
2160 if (block) {
2161 /* See comment above postcopy_preempted_contains() */
2162 if (postcopy_preempted_contains(rs, block, offset)) {
2163 trace_postcopy_preempt_hit(block->idstr, offset);
2165 * If what we preempted previously was exactly what we're
2166 * requesting right now, restore the preempted precopy
2167 * immediately, boosting its priority as it's requested by
2168 * postcopy.
2170 postcopy_preempt_restore(rs, pss, true);
2171 return true;
2173 } else {
2175 * Poll write faults too if background snapshot is enabled; that's
2176 * when we have vcpus blocked by the write-protected pages.
2178 block = poll_fault_page(rs, &offset);
2181 if (block) {
2183 * We want the background search to continue from the queued page
2184 * since the guest is likely to want other pages near to the page
2185 * it just requested.
2187 pss->block = block;
2188 pss->page = offset >> TARGET_PAGE_BITS;
2191 * This unqueued page would break the "one round" check, even if it's
2192 * really rare.
2194 pss->complete_round = false;
2195 /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2196 pss->postcopy_requested = true;
2197 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2200 return !!block;
2204 * migration_page_queue_free: drop any remaining pages in the ram
2205 * request queue
2207 * It should be empty at the end anyway, but in error cases there may
2208 * be some left; in case any page is left, we drop it.
2211 static void migration_page_queue_free(RAMState *rs)
2213 struct RAMSrcPageRequest *mspr, *next_mspr;
2214 /* This queue should generally be empty - but in the case of a failed
2215 * migration it might have some entries left over.
2217 RCU_READ_LOCK_GUARD();
2218 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2219 memory_region_unref(mspr->rb->mr);
2220 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2221 g_free(mspr);
2226 * ram_save_queue_pages: queue the page for transmission
2228 * A request from postcopy destination for example.
2230 * Returns zero on success or negative on error
2232 * @rbname: Name of the RAMBlock of the request. NULL means the
2233 * same as the last one.
2234 * @start: starting address from the start of the RAMBlock
2235 * @len: length (in bytes) to send
2237 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2239 RAMBlock *ramblock;
2240 RAMState *rs = ram_state;
2242 ram_counters.postcopy_requests++;
2243 RCU_READ_LOCK_GUARD();
2245 if (!rbname) {
2246 /* Reuse last RAMBlock */
2247 ramblock = rs->last_req_rb;
2249 if (!ramblock) {
2251 * Shouldn't happen, we can't reuse the last RAMBlock if
2252 * it's the 1st request.
2254 error_report("ram_save_queue_pages no previous block");
2255 return -1;
2257 } else {
2258 ramblock = qemu_ram_block_by_name(rbname);
2260 if (!ramblock) {
2261 /* We shouldn't be asked for a non-existent RAMBlock */
2262 error_report("ram_save_queue_pages no block '%s'", rbname);
2263 return -1;
2265 rs->last_req_rb = ramblock;
2267 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2268 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2269 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2270 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2271 __func__, start, len, ramblock->used_length);
2272 return -1;
2276 * With postcopy preempt, we send back the page directly in the
2277 * rp-return thread.
2279 if (postcopy_preempt_active()) {
2280 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2281 size_t page_size = qemu_ram_pagesize(ramblock);
2282 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2283 int ret = 0;
2285 qemu_mutex_lock(&rs->bitmap_mutex);
2287 pss_init(pss, ramblock, page_start);
2289 * Always use the preempt channel, and make sure it's there. It's
2290 * safe to access without the lock, because when the rp-thread is
2291 * running we should be the only one that operates on the qemufile.
2293 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2294 pss->postcopy_requested = true;
2295 assert(pss->pss_channel);
2298 * It must be one host page or a multiple of the host page size. Just
2299 * assert; if something is wrong we're mostly split-brain anyway.
2301 assert(len % page_size == 0);
2302 while (len) {
2303 if (ram_save_host_page_urgent(pss)) {
2304 error_report("%s: ram_save_host_page_urgent() failed: "
2305 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2306 __func__, ramblock->idstr, start);
2307 ret = -1;
2308 break;
2311 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2312 * will automatically be moved and point to the next host page
2313 * we're going to send, so no need to update here.
2315 * Normally QEMU never sends >1 host page in requests, so
2316 * logically we don't even need that as the loop should only
2317 * run once, but just to be consistent.
2319 len -= page_size;
2321 qemu_mutex_unlock(&rs->bitmap_mutex);
2323 return ret;
2326 struct RAMSrcPageRequest *new_entry =
2327 g_new0(struct RAMSrcPageRequest, 1);
2328 new_entry->rb = ramblock;
2329 new_entry->offset = start;
2330 new_entry->len = len;
2332 memory_region_ref(ramblock->mr);
2333 qemu_mutex_lock(&rs->src_page_req_mutex);
2334 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2335 migration_make_urgent_request();
2336 qemu_mutex_unlock(&rs->src_page_req_mutex);
2338 return 0;
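/* Whether to use the compression path for this page: false if compression
 * is disabled, or once xbzrle has taken over after the first round. */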
2341 static bool save_page_use_compression(RAMState *rs)
2343 if (!migrate_use_compression()) {
2344 return false;
2348 * If xbzrle is enabled (e.g., after the first round of migration), stop
2349 * using data compression. In theory, xbzrle can do better than
2350 * compression.
2352 if (rs->xbzrle_enabled) {
2353 return false;
2356 return true;
2360 * Try to compress the page before posting it out. Return true if the page
2361 * has been properly handled by compression; otherwise other paths need
2362 * to handle it.
2364 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2365 RAMBlock *block, ram_addr_t offset)
2367 if (!save_page_use_compression(rs)) {
2368 return false;
2372 * When starting the process of a new block, the first page of
2373 * the block should be sent out before other pages in the same
2374 * block, and all the pages in the last block should have been sent
2375 * out. Keeping this order is important, because the 'cont' flag
2376 * is used to avoid resending the block name.
2378 * We post the first page as a normal page as compression will take
2379 * a lot of CPU resources.
2381 if (block != pss->last_sent_block) {
2382 flush_compressed_data(rs);
2383 return false;
2386 if (compress_page_with_multi_thread(block, offset) > 0) {
2387 return true;
2390 compression_counters.busy++;
2391 return false;
2395 * ram_save_target_page: save one target page
2397 * Returns the number of pages written
2399 * @rs: current RAM state
2400 * @pss: data about the page we want to send
2402 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2404 RAMBlock *block = pss->block;
2405 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2406 int res;
2408 if (control_save_page(pss, block, offset, &res)) {
2409 return res;
2412 if (save_compress_page(rs, pss, block, offset)) {
2413 return 1;
2416 res = save_zero_page(pss, block, offset);
2417 if (res > 0) {
2418 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2419 * page would be stale
2421 if (rs->xbzrle_enabled) {
2422 XBZRLE_cache_lock();
2423 xbzrle_cache_zero_page(rs, block->offset + offset);
2424 XBZRLE_cache_unlock();
2426 return res;
2430 * Do not use multifd in postcopy, as one whole host page should be
2431 * placed atomically. Postcopy requires atomic updates of pages, so even
2432 * if host page size == guest page size the running dest guest may
2433 * still see partially copied pages, which is data corruption.
2435 if (migrate_use_multifd() && !migration_in_postcopy()) {
2436 return ram_save_multifd_page(pss->pss_channel, block, offset);
2439 return ram_save_page(rs, pss);
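/* Whether the precopy stream should be preempted so a pending postcopy
 * request can be serviced; only relevant when sending a huge-page ramblock
 * during postcopy with preemption enabled. */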
2442 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2444 MigrationState *ms = migrate_get_current();
2446 /* Eager preempt not enabled? Then never do that. */
2447 if (!migrate_postcopy_preempt()) {
2448 return false;
2451 /* If the user explicitly disabled breaking of huge page, skip */
2452 if (!ms->postcopy_preempt_break_huge) {
2453 return false;
2456 /* If the ramblock we're sending uses small pages, never bother. */
2457 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2458 return false;
2461 /* Not in postcopy at all? */
2462 if (!migration_in_postcopy()) {
2463 return false;
2467 * If we're already handling a postcopy request, don't preempt as this page
2468 * has got the same high priority.
2470 if (pss->postcopy_requested) {
2471 return false;
2474 /* If there's postcopy requests, then check it up! */
2475 return postcopy_has_request(rs);
2478 /* Preempt the ongoing precopy send and cache its state for a later restore */
2479 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2481 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2483 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2486 * Time to preempt precopy. Cache the current PSS into the preempt state, so
2487 * that after handling the postcopy pages we can recover to it. We need to
2488 * do so because the dest VM will have part of the precopy huge page kept
2489 * over in its tmp huge page caches; better move on with it when we can.
2491 p_state->ram_block = pss->block;
2492 p_state->ram_page = pss->page;
2493 p_state->preempted = true;
2496 /* Whether we're preempted by a postcopy request during sending a huge page */
2497 static bool postcopy_preempt_triggered(RAMState *rs)
2499 return rs->postcopy_preempt_state.preempted;
2502 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2503 bool postcopy_requested)
2505 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2507 assert(state->preempted);
2509 pss->block = state->ram_block;
2510 pss->page = state->ram_page;
2512 /* Whether this is a postcopy request? */
2513 pss->postcopy_requested = postcopy_requested;
2515 * When restoring a preempted page, the old data resides in PRECOPY
2516 * slow channel, even if postcopy_requested is set. So always use
2517 * PRECOPY channel here.
2519 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2521 trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2523 /* Reset preempt state, most importantly, set preempted==false */
2524 postcopy_preempt_reset(rs);
2527 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2529 MigrationState *s = migrate_get_current();
2530 unsigned int channel = pss->postcopy_target_channel;
2531 QEMUFile *next;
2533 if (channel != rs->postcopy_channel) {
2534 if (channel == RAM_CHANNEL_PRECOPY) {
2535 next = s->to_dst_file;
2536 } else {
2537 next = s->postcopy_qemufile_src;
2539 /* Update and cache the current channel */
2540 rs->f = next;
2541 rs->postcopy_channel = channel;
2544 * If channel switched, reset last_sent_block since the old sent block
2545 * may not be on the same channel.
2547 pss->last_sent_block = NULL;
2549 trace_postcopy_preempt_switch_channel(channel);
2552 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2555 /* We need to make sure rs->f always points to the default channel elsewhere */
2556 static void postcopy_preempt_reset_channel(RAMState *rs)
2558 if (postcopy_preempt_active()) {
2559 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2560 rs->f = migrate_get_current()->to_dst_file;
2561 trace_postcopy_preempt_reset_channel();
2565 /* Should be called before sending a host page */
2566 static void pss_host_page_prepare(PageSearchStatus *pss)
2568 /* How many guest pages are there in one host page? */
2569 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2571 pss->host_page_sending = true;
2572 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2573 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2577 * Whether the page pointed to by PSS is within the host page being sent.
2578 * Must be called after a previous pss_host_page_prepare().
2580 static bool pss_within_range(PageSearchStatus *pss)
2582 ram_addr_t ram_addr;
2584 assert(pss->host_page_sending);
2586 /* Over host-page boundary? */
2587 if (pss->page >= pss->host_page_end) {
2588 return false;
2591 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2593 return offset_in_ramblock(pss->block, ram_addr);
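/* Mark the end of host-page sending started by pss_host_page_prepare() */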
2596 static void pss_host_page_finish(PageSearchStatus *pss)
2598 pss->host_page_sending = false;
2599 /* This is not needed, but just to reset it */
2600 pss->host_page_start = pss->host_page_end = 0;
2604 * Send an urgent host page specified by `pss'. Needs to be called with
2605 * bitmap_mutex held.
2607 * Returns 0 if saving the host page succeeded, negative otherwise.
2609 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2611 bool page_dirty, sent = false;
2612 RAMState *rs = ram_state;
2613 int ret = 0;
2615 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2616 pss_host_page_prepare(pss);
2619 * If precopy is sending the same page, let it be done in precopy, or
2620 * we could send the same page on two channels and neither of them would
2621 * receive the whole page.
2623 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2624 trace_postcopy_preempt_hit(pss->block->idstr,
2625 pss->page << TARGET_PAGE_BITS);
2626 return 0;
2629 do {
2630 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2632 if (page_dirty) {
2633 /* Be strict about the return code; it must be 1 (one page sent) */
2634 if (ram_save_target_page(rs, pss) != 1) {
2635 error_report_once("%s: ram_save_target_page failed", __func__);
2636 ret = -1;
2637 goto out;
2639 sent = true;
2641 pss_find_next_dirty(pss);
2642 } while (pss_within_range(pss));
2643 out:
2644 pss_host_page_finish(pss);
2645 /* For urgent requests, flush immediately if sent */
2646 if (sent) {
2647 qemu_fflush(pss->pss_channel);
2649 return ret;
2653 * ram_save_host_page: save a whole host page
2655 * Starting at *offset send pages up to the end of the current host
2656 * page. It's valid for the initial offset to point into the middle of
2657 * a host page in which case the remainder of the hostpage is sent.
2658 * Only dirty target pages are sent. Note that the host page size may
2659 * be a huge page for this block.
2661 * The saving stops at the boundary of the used_length of the block
2662 * if the RAMBlock isn't a multiple of the host page size.
2664 * The caller must hold ram_state.bitmap_mutex when calling this
2665 * function. Note that this function can temporarily release the lock, but
2666 * when the function returns it makes sure the lock is still held.
2668 * Returns the number of pages written or negative on error
2670 * @rs: current RAM state
2671 * @pss: data about the page we want to send
2673 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2675 bool page_dirty, preempt_active = postcopy_preempt_active();
2676 int tmppages, pages = 0;
2677 size_t pagesize_bits =
2678 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2679 unsigned long start_page = pss->page;
2680 int res;
2682 if (ramblock_is_ignored(pss->block)) {
2683 error_report("block %s should not be migrated !", pss->block->idstr);
2684 return 0;
2687 /* Update host page boundary information */
2688 pss_host_page_prepare(pss);
2690 do {
2691 if (postcopy_needs_preempt(rs, pss)) {
2692 postcopy_do_preempt(rs, pss);
2693 break;
2696 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2698 /* Check if the page is dirty and if so send it */
2699 if (page_dirty) {
2701 * Properly yield the lock only in postcopy preempt mode
2702 * because both migration thread and rp-return thread can
2703 * operate on the bitmaps.
2705 if (preempt_active) {
2706 qemu_mutex_unlock(&rs->bitmap_mutex);
2708 tmppages = ram_save_target_page(rs, pss);
2709 if (tmppages >= 0) {
2710 pages += tmppages;
2712 * Allow rate limiting to happen in the middle of huge pages if
2713 * something is sent in the current iteration.
2715 if (pagesize_bits > 1 && tmppages > 0) {
2716 migration_rate_limit();
2719 if (preempt_active) {
2720 qemu_mutex_lock(&rs->bitmap_mutex);
2722 } else {
2723 tmppages = 0;
2726 if (tmppages < 0) {
2727 pss_host_page_finish(pss);
2728 return tmppages;
2731 pss_find_next_dirty(pss);
2732 } while (pss_within_range(pss));
2734 pss_host_page_finish(pss);
2737 * When in postcopy preempt mode, flush the data as soon as possible for
2738 * postcopy requests, because we've already sent a whole huge page, so the
2739 * dst node should already have enough resources to atomically fill in
2740 * the current missing page.
2742 * More importantly, when using a separate postcopy channel, we must do an
2743 * explicit flush or it won't flush until the buffer is full.
2745 if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2746 qemu_fflush(pss->pss_channel);
2749 res = ram_save_release_protection(rs, pss, start_page);
2750 return (res < 0 ? res : pages);
2754 * ram_find_and_save_block: finds a dirty page and sends it to f
2756 * Called within an RCU critical section.
2758 * Returns the number of pages written where zero means no dirty pages,
2759 * or negative on error
2761 * @rs: current RAM state
2763 * On systems where host-page-size > target-page-size it will send all the
2764 * pages in a host page that are dirty.
2766 static int ram_find_and_save_block(RAMState *rs)
2768 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2769 int pages = 0;
2770 bool again, found;
2772 /* No dirty page as there is zero RAM */
2773 if (!ram_bytes_total()) {
2774 return pages;
2778 * Always keep last_seen_block/last_page valid during this procedure,
2779 * because find_dirty_block() relies on these values (e.g., we compare
2780 * last_seen_block with pss.block to see whether we searched all the
2781 * ramblocks) to detect the completion of migration. Having a NULL value
2782 * of last_seen_block can conditionally cause the loop below to run forever.
2784 if (!rs->last_seen_block) {
2785 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2786 rs->last_page = 0;
2789 pss_init(pss, rs->last_seen_block, rs->last_page);
2791 do {
2792 again = true;
2793 found = get_queued_page(rs, pss);
2795 if (!found) {
2797 * Recover previous precopy ramblock/offset if postcopy has
2798 * preempted precopy. Otherwise find the next dirty bit.
2800 if (postcopy_preempt_triggered(rs)) {
2801 postcopy_preempt_restore(rs, pss, false);
2802 found = true;
2803 } else {
2804 /* priority queue empty, so just search for something dirty */
2805 found = find_dirty_block(rs, pss, &again);
2809 if (found) {
2810 /* Update rs->f with correct channel */
2811 if (postcopy_preempt_active()) {
2812 postcopy_preempt_choose_channel(rs, pss);
2814 /* Cache rs->f in pss_channel (TODO: remove rs->f) */
2815 pss->pss_channel = rs->f;
2816 pages = ram_save_host_page(rs, pss);
2818 } while (!pages && again);
2820 rs->last_seen_block = pss->block;
2821 rs->last_page = pss->page;
2823 return pages;
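/* Account for @size bytes of pages: count them as zero (duplicate) or normal
 * pages; for normal pages also add the transferred bytes and credit @f. */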
2826 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2828 uint64_t pages = size / TARGET_PAGE_SIZE;
2830 if (zero) {
2831 stat64_add(&ram_atomic_counters.duplicate, pages);
2832 } else {
2833 stat64_add(&ram_atomic_counters.normal, pages);
2834 ram_transferred_add(size);
2835 qemu_file_credit_transfer(f, size);
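/* Total bytes of RAM to migrate: all migratable blocks when @count_ignored
 * is true, otherwise only the blocks not ignored by migration. */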
2839 static uint64_t ram_bytes_total_common(bool count_ignored)
2841 RAMBlock *block;
2842 uint64_t total = 0;
2844 RCU_READ_LOCK_GUARD();
2846 if (count_ignored) {
2847 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2848 total += block->used_length;
2850 } else {
2851 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2852 total += block->used_length;
2855 return total;
2858 uint64_t ram_bytes_total(void)
2860 return ram_bytes_total_common(false);
2863 static void xbzrle_load_setup(void)
2865 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2868 static void xbzrle_load_cleanup(void)
2870 g_free(XBZRLE.decoded_buf);
2871 XBZRLE.decoded_buf = NULL;
2874 static void ram_state_cleanup(RAMState **rsp)
2876 if (*rsp) {
2877 migration_page_queue_free(*rsp);
2878 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2879 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2880 g_free(*rsp);
2881 *rsp = NULL;
2885 static void xbzrle_cleanup(void)
2887 XBZRLE_cache_lock();
2888 if (XBZRLE.cache) {
2889 cache_fini(XBZRLE.cache);
2890 g_free(XBZRLE.encoded_buf);
2891 g_free(XBZRLE.current_buf);
2892 g_free(XBZRLE.zero_target_page);
2893 XBZRLE.cache = NULL;
2894 XBZRLE.encoded_buf = NULL;
2895 XBZRLE.current_buf = NULL;
2896 XBZRLE.zero_target_page = NULL;
2898 XBZRLE_cache_unlock();
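/* Tear down the save side: stop dirty logging (unless using a background
 * snapshot), free the per-block bitmaps, and clean up xbzrle/compression
 * and RAMState. */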
2901 static void ram_save_cleanup(void *opaque)
2903 RAMState **rsp = opaque;
2904 RAMBlock *block;
2906 /* We don't use dirty log with background snapshots */
2907 if (!migrate_background_snapshot()) {
2908 /* The caller holds the iothread lock or is in a bh, so there is
2909 * no write race against the migration bitmap.
2911 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2913 * do not stop dirty log without starting it, since
2914 * memory_global_dirty_log_stop will assert that
2915 * memory_global_dirty_log_start/stop are used in pairs
2917 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2921 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2922 g_free(block->clear_bmap);
2923 block->clear_bmap = NULL;
2924 g_free(block->bmap);
2925 block->bmap = NULL;
2928 xbzrle_cleanup();
2929 compress_threads_save_cleanup();
2930 ram_state_cleanup(rsp);
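/* Reset the page-search state so the next round starts from scratch */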
2933 static void ram_state_reset(RAMState *rs)
2935 int i;
2937 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2938 rs->pss[i].last_sent_block = NULL;
2941 rs->last_seen_block = NULL;
2942 rs->last_page = 0;
2943 rs->last_version = ram_list.version;
2944 rs->xbzrle_enabled = false;
2945 postcopy_preempt_reset(rs);
2946 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2949 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2951 /* **** functions for postcopy ***** */
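/* Discard the ranges of each RAMBlock whose dirty bit is already clear
 * (i.e. pages that have already been sent), releasing their backing memory. */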
2953 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2955 struct RAMBlock *block;
2957 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2958 unsigned long *bitmap = block->bmap;
2959 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2960 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2962 while (run_start < range) {
2963 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2964 ram_discard_range(block->idstr,
2965 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2966 ((ram_addr_t)(run_end - run_start))
2967 << TARGET_PAGE_BITS);
2968 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2974 * postcopy_send_discard_bm_ram: discard a RAMBlock
2976 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2978 * @ms: current migration state
2979 * @block: RAMBlock to discard
2981 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2983 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2984 unsigned long current;
2985 unsigned long *bitmap = block->bmap;
2987 for (current = 0; current < end; ) {
2988 unsigned long one = find_next_bit(bitmap, end, current);
2989 unsigned long zero, discard_length;
2991 if (one >= end) {
2992 break;
2995 zero = find_next_zero_bit(bitmap, end, one + 1);
2997 if (zero >= end) {
2998 discard_length = end - one;
2999 } else {
3000 discard_length = zero - one;
3002 postcopy_discard_send_range(ms, one, discard_length);
3003 current = one + discard_length;
3007 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
3010 * postcopy_each_ram_send_discard: discard all RAMBlocks
3012 * Utility for the outgoing postcopy code.
3013 * Calls postcopy_send_discard_bm_ram for each RAMBlock
3014 * passing it bitmap indexes and name.
3015 * (qemu_ram_foreach_block ends up passing unscaled lengths
3016 * which would mean postcopy code would have to deal with target page)
3018 * @ms: current migration state
3020 static void postcopy_each_ram_send_discard(MigrationState *ms)
3022 struct RAMBlock *block;
3024 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3025 postcopy_discard_send_init(ms, block->idstr);
3028 * Deal with TPS != HPS and huge pages. It discards any partially sent
3029 * host-page size chunks and marks any partially dirty host-page size
3030 * chunks as all dirty. In this case the host-page is the host-page
3031 * for the particular RAMBlock, i.e. it might be a huge page.
3033 postcopy_chunk_hostpages_pass(ms, block);
3036 * Postcopy sends chunks of bitmap over the wire, but it
3037 * just needs indexes at this point, avoids it having
3038 * target page specific code.
3040 postcopy_send_discard_bm_ram(ms, block);
3041 postcopy_discard_send_finish(ms);
3046 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3048 * Helper for postcopy_each_ram_send_discard; it canonicalizes the dirty
3049 * bitmap of one RAMBlock so that each host page is either fully dirty
3050 * or fully clean.
3052 * Postcopy requires that all target pages in a hostpage are dirty or
3053 * clean, not a mix. This function canonicalizes the bitmaps.
3055 * @ms: current migration state
3056 * @block: block that contains the page we want to canonicalize
3058 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
3060 RAMState *rs = ram_state;
3061 unsigned long *bitmap = block->bmap;
3062 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
3063 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3064 unsigned long run_start;
3066 if (block->page_size == TARGET_PAGE_SIZE) {
3067 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
3068 return;
3071 /* Find a dirty page */
3072 run_start = find_next_bit(bitmap, pages, 0);
3074 while (run_start < pages) {
3077 * If the start of this run of pages is in the middle of a host
3078 * page, then we need to fixup this host page.
3080 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
3081 /* Find the end of this run */
3082 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
3084 * If the end isn't at the start of a host page, then the
3085 * run doesn't finish at the end of a host page
3086 * and we need to discard.
3090 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
3091 unsigned long page;
3092 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
3093 host_ratio);
3094 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
3096 /* Clean up the bitmap */
3097 for (page = fixup_start_addr;
3098 page < fixup_start_addr + host_ratio; page++) {
3100 * Remark them as dirty, updating the count for any pages
3101 * that weren't previously dirty.
3103 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
3107 /* Find the next dirty page for the next iteration */
3108 run_start = find_next_bit(bitmap, pages, run_start);
3113 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3115 * Transmit the set of pages to be discarded after precopy to the target;
3116 * these are pages that:
3117 * a) have been previously transmitted but are now dirty again
3118 * b) have never been transmitted; this ensures that
3119 * any pages on the destination that have been mapped by background
3120 * tasks get discarded (transparent huge pages are the specific concern)
3121 * Hopefully this is pretty sparse
3123 * @ms: current migration state
3125 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
3127 RAMState *rs = ram_state;
3129 RCU_READ_LOCK_GUARD();
3131 /* This should be our last sync, the src is now paused */
3132 migration_bitmap_sync(rs);
3134 /* Easiest way to make sure we don't resume in the middle of a host-page */
3135 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
3136 rs->last_seen_block = NULL;
3137 rs->last_page = 0;
3139 postcopy_each_ram_send_discard(ms);
3141 trace_ram_postcopy_send_discard_bitmap();
3145 * ram_discard_range: discard dirtied pages at the beginning of postcopy
3147 * Returns zero on success
3149 * @rbname: name of the RAMBlock of the request. NULL means the
3150 * same as the last one.
3151 * @start: starting offset (in bytes) within the RAMBlock
3152 * @length: length (in bytes) to discard
3154 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3156 trace_ram_discard_range(rbname, start, length);
3158 RCU_READ_LOCK_GUARD();
3159 RAMBlock *rb = qemu_ram_block_by_name(rbname);
3161 if (!rb) {
3162 error_report("ram_discard_range: Failed to find block '%s'", rbname);
3163 return -1;
3167 * On source VM, we don't need to update the received bitmap since
3168 * we don't even have one.
3170 if (rb->receivedmap) {
3171 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3172 length >> qemu_target_page_bits());
3175 return ram_block_discard_range(rb, start, length);
3179 * For every allocation, we will try not to crash the VM if the
3180 * allocation fails.
3182 static int xbzrle_init(void)
3184 Error *local_err = NULL;
3186 if (!migrate_use_xbzrle()) {
3187 return 0;
3190 XBZRLE_cache_lock();
3192 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3193 if (!XBZRLE.zero_target_page) {
3194 error_report("%s: Error allocating zero page", __func__);
3195 goto err_out;
3198 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3199 TARGET_PAGE_SIZE, &local_err);
3200 if (!XBZRLE.cache) {
3201 error_report_err(local_err);
3202 goto free_zero_page;
3205 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3206 if (!XBZRLE.encoded_buf) {
3207 error_report("%s: Error allocating encoded_buf", __func__);
3208 goto free_cache;
3211 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3212 if (!XBZRLE.current_buf) {
3213 error_report("%s: Error allocating current_buf", __func__);
3214 goto free_encoded_buf;
3217 /* We are all good */
3218 XBZRLE_cache_unlock();
3219 return 0;
3221 free_encoded_buf:
3222 g_free(XBZRLE.encoded_buf);
3223 XBZRLE.encoded_buf = NULL;
3224 free_cache:
3225 cache_fini(XBZRLE.cache);
3226 XBZRLE.cache = NULL;
3227 free_zero_page:
3228 g_free(XBZRLE.zero_target_page);
3229 XBZRLE.zero_target_page = NULL;
3230 err_out:
3231 XBZRLE_cache_unlock();
3232 return -ENOMEM;
3235 static int ram_state_init(RAMState **rsp)
3237 *rsp = g_try_new0(RAMState, 1);
3239 if (!*rsp) {
3240 error_report("%s: Init ramstate fail", __func__);
3241 return -1;
3244 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3245 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3246 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3249 * Count the total number of pages used by ram blocks not including any
3250 * gaps due to alignment or unplugs.
3251 * This must match the initial values of the dirty bitmap.
3253 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3254 ram_state_reset(*rsp);
3256 return 0;
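/* Allocate the dirty and clear bitmaps for every migratable RAMBlock,
 * starting with every page marked dirty. */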
3259 static void ram_list_init_bitmaps(void)
3261 MigrationState *ms = migrate_get_current();
3262 RAMBlock *block;
3263 unsigned long pages;
3264 uint8_t shift;
3266 /* Skip setting bitmap if there is no RAM */
3267 if (ram_bytes_total()) {
3268 shift = ms->clear_bitmap_shift;
3269 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3270 error_report("clear_bitmap_shift (%u) too big, using "
3271 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3272 shift = CLEAR_BITMAP_SHIFT_MAX;
3273 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3274 error_report("clear_bitmap_shift (%u) too small, using "
3275 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3276 shift = CLEAR_BITMAP_SHIFT_MIN;
3279 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3280 pages = block->max_length >> TARGET_PAGE_BITS;
3282 * The initial dirty bitmap for migration must be set with all
3283 * ones to make sure we'll migrate every guest RAM page to the
3284 * destination.
3285 * Here we set RAMBlock.bmap all to 1 because when we restart a
3286 * new migration after a failed migration,
3287 * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
3288 * guest memory.
3290 block->bmap = bitmap_new(pages);
3291 bitmap_set(block->bmap, 0, pages);
3292 block->clear_bmap_shift = shift;
3293 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
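/* Exclude discarded ("logically unplugged") pages from the dirty bitmaps
 * and adjust the dirty page count accordingly. */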
3298 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3300 unsigned long pages;
3301 RAMBlock *rb;
3303 RCU_READ_LOCK_GUARD();
3305 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3306 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3307 rs->migration_dirty_pages -= pages;
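/* Initialize all migration bitmaps, start dirty logging (unless using a
 * background snapshot) and do the initial bitmap sync. */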
3311 static void ram_init_bitmaps(RAMState *rs)
3313 /* For memory_global_dirty_log_start below. */
3314 qemu_mutex_lock_iothread();
3315 qemu_mutex_lock_ramlist();
3317 WITH_RCU_READ_LOCK_GUARD() {
3318 ram_list_init_bitmaps();
3319 /* We don't use dirty log with background snapshots */
3320 if (!migrate_background_snapshot()) {
3321 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3322 migration_bitmap_sync_precopy(rs);
3325 qemu_mutex_unlock_ramlist();
3326 qemu_mutex_unlock_iothread();
3329 * After an eventual first bitmap sync, fixup the initial bitmap
3330 * containing all 1s to exclude any discarded pages from migration.
3332 migration_bitmap_clear_discarded_pages(rs);
3335 static int ram_init_all(RAMState **rsp)
3337 if (ram_state_init(rsp)) {
3338 return -1;
3341 if (xbzrle_init()) {
3342 ram_state_cleanup(rsp);
3343 return -1;
3346 ram_init_bitmaps(*rsp);
3348 return 0;
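/* Prepare RAMState for a postcopy resume: recount dirty pages from the
 * existing bitmaps and reset the search state. */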
3351 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3353 RAMBlock *block;
3354 uint64_t pages = 0;
3357 * Postcopy is not using xbzrle/compression, so no need for that.
3358 * Also, since the source is already halted, we don't need to care
3359 * about dirty page logging either.
3362 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3363 pages += bitmap_count_one(block->bmap,
3364 block->used_length >> TARGET_PAGE_BITS);
3367 /* This may not be aligned with current bitmaps. Recalculate. */
3368 rs->migration_dirty_pages = pages;
3370 ram_state_reset(rs);
3372 /* Update RAMState cache of output QEMUFile */
3373 rs->f = out;
3375 trace_ram_state_resume_prepare(pages);
3379 * This function clears bits of the free pages reported by the caller from the
3380 * migration dirty bitmap. @addr is the host address corresponding to the
3381 * start of the contiguous guest free pages, and @len is the total bytes of
3382 * those pages.
3384 void qemu_guest_free_page_hint(void *addr, size_t len)
3386 RAMBlock *block;
3387 ram_addr_t offset;
3388 size_t used_len, start, npages;
3389 MigrationState *s = migrate_get_current();
3391 /* This function is currently expected to be used during live migration */
3392 if (!migration_is_setup_or_active(s->state)) {
3393 return;
3396 for (; len > 0; len -= used_len, addr += used_len) {
3397 block = qemu_ram_block_from_host(addr, false, &offset);
3398 if (unlikely(!block || offset >= block->used_length)) {
3400 * The implementation might not support RAMBlock resize during
3401 * live migration, but it could happen in theory with future
3402 * updates. So we add a check here to capture that case.
3404 error_report_once("%s unexpected error", __func__);
3405 return;
3408 if (len <= block->used_length - offset) {
3409 used_len = len;
3410 } else {
3411 used_len = block->used_length - offset;
3414 start = offset >> TARGET_PAGE_BITS;
3415 npages = used_len >> TARGET_PAGE_BITS;
3417 qemu_mutex_lock(&ram_state->bitmap_mutex);
3419 * The skipped free pages are equivalent to having been sent, from
3420 * clear_bmap's perspective, so clear the bits from the memory region
3421 * bitmap which are initially set. Otherwise those skipped pages will be
3422 * sent in the next round after syncing from the memory region bitmap.
3424 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3425 ram_state->migration_dirty_pages -=
3426 bitmap_count_one_with_offset(block->bmap, start, npages);
3427 bitmap_clear(block->bmap, start, npages);
3428 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3433 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3434 * long-running RCU critical section. When rcu-reclaims in the code
3435 * start to become numerous it will be necessary to reduce the
3436 * granularity of these critical sections.
3440 * ram_save_setup: Setup RAM for migration
3442 * Returns zero to indicate success and negative for error
3444 * @f: QEMUFile where to send the data
3445 * @opaque: RAMState pointer
3447 static int ram_save_setup(QEMUFile *f, void *opaque)
3449 RAMState **rsp = opaque;
3450 RAMBlock *block;
3451 int ret;
3453 if (compress_threads_save_setup()) {
3454 return -1;
3457 /* migration has already setup the bitmap, reuse it. */
3458 if (!migration_in_colo_state()) {
3459 if (ram_init_all(rsp) != 0) {
3460 compress_threads_save_cleanup();
3461 return -1;
3464 (*rsp)->f = f;
3466 WITH_RCU_READ_LOCK_GUARD() {
3467 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3469 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3470 qemu_put_byte(f, strlen(block->idstr));
3471 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3472 qemu_put_be64(f, block->used_length);
3473 if (migrate_postcopy_ram() && block->page_size !=
3474 qemu_host_page_size) {
3475 qemu_put_be64(f, block->page_size);
3477 if (migrate_ignore_shared()) {
3478 qemu_put_be64(f, block->mr->addr);
3483 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3484 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3486 ret = multifd_send_sync_main(f);
3487 if (ret < 0) {
3488 return ret;
3491 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3492 qemu_fflush(f);
3494 return 0;
3498 * ram_save_iterate: iterative stage for migration
3500 * Returns zero to indicate success and negative for error
3502 * @f: QEMUFile where to send the data
3503 * @opaque: RAMState pointer
3505 static int ram_save_iterate(QEMUFile *f, void *opaque)
3507 RAMState **temp = opaque;
3508 RAMState *rs = *temp;
3509 int ret = 0;
3510 int i;
3511 int64_t t0;
3512 int done = 0;
3514 if (blk_mig_bulk_active()) {
3515 /* Avoid transferring ram during bulk phase of block migration as
3516 * the bulk phase will usually take a long time and transferring
3517 * ram updates during that time is pointless. */
3518 goto out;
3522 * We'll hold this lock a little bit long, but it's okay for two reasons.
3523 * Firstly, the only other thread that could take it is the one calling
3524 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3525 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3526 * guarantees that we'll at least release it on a regular basis.
3528 qemu_mutex_lock(&rs->bitmap_mutex);
3529 WITH_RCU_READ_LOCK_GUARD() {
3530 if (ram_list.version != rs->last_version) {
3531 ram_state_reset(rs);
3534 /* Read version before ram_list.blocks */
3535 smp_rmb();
3537 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3539 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3540 i = 0;
3541 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3542 postcopy_has_request(rs)) {
3543 int pages;
3545 if (qemu_file_get_error(f)) {
3546 break;
3549 pages = ram_find_and_save_block(rs);
3550 /* no more pages to send */
3551 if (pages == 0) {
3552 done = 1;
3553 break;
3556 if (pages < 0) {
3557 qemu_file_set_error(f, pages);
3558 break;
3561 rs->target_page_count += pages;
3564 * During postcopy, it is necessary to make sure one whole host
3565 * page is sent in one chunk.
3567 if (migrate_postcopy_ram()) {
3568 flush_compressed_data(rs);
3572 * we want to check in the 1st loop, just in case it was the 1st
3573 * time and we had to sync the dirty bitmap.
3574 * qemu_clock_get_ns() is a bit expensive, so we only check once
3575 * every few iterations
3577 if ((i & 63) == 0) {
3578 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3579 1000000;
3580 if (t1 > MAX_WAIT) {
3581 trace_ram_save_iterate_big_wait(t1, i);
3582 break;
3585 i++;
3588 qemu_mutex_unlock(&rs->bitmap_mutex);
3590 postcopy_preempt_reset_channel(rs);
3593 * Must occur before EOS (or any QEMUFile operation)
3594 * because of RDMA protocol.
3596 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3598 out:
3599 if (ret >= 0
3600 && migration_is_setup_or_active(migrate_get_current()->state)) {
3601 ret = multifd_send_sync_main(rs->f);
3602 if (ret < 0) {
3603 return ret;
3606 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3607 qemu_fflush(f);
3608 ram_transferred_add(8);
3610 ret = qemu_file_get_error(f);
3612 if (ret < 0) {
3613 return ret;
3616 return done;
3620 * ram_save_complete: function called to send the remaining amount of ram
3622 * Returns zero to indicate success or negative on error
3624 * Called with iothread lock
3626 * @f: QEMUFile where to send the data
3627 * @opaque: RAMState pointer
3629 static int ram_save_complete(QEMUFile *f, void *opaque)
3631 RAMState **temp = opaque;
3632 RAMState *rs = *temp;
3633 int ret = 0;
3635 rs->last_stage = !migration_in_colo_state();
3637 WITH_RCU_READ_LOCK_GUARD() {
3638 if (!migration_in_postcopy()) {
3639 migration_bitmap_sync_precopy(rs);
3642 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3644 /* try transferring iterative blocks of memory */
3646 /* flush all remaining blocks regardless of rate limiting */
3647 qemu_mutex_lock(&rs->bitmap_mutex);
3648 while (true) {
3649 int pages;
3651 pages = ram_find_and_save_block(rs);
3652 /* no more blocks to send */
3653 if (pages == 0) {
3654 break;
3656 if (pages < 0) {
3657 ret = pages;
3658 break;
3661 qemu_mutex_unlock(&rs->bitmap_mutex);
3663 flush_compressed_data(rs);
3664 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3667 if (ret < 0) {
3668 return ret;
3671 postcopy_preempt_reset_channel(rs);
3673 ret = multifd_send_sync_main(rs->f);
3674 if (ret < 0) {
3675 return ret;
3678 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3679 qemu_fflush(f);
3681 return 0;
3684 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3685 uint64_t *res_precopy_only,
3686 uint64_t *res_compatible,
3687 uint64_t *res_postcopy_only)
3689 RAMState **temp = opaque;
3690 RAMState *rs = *temp;
3691 uint64_t remaining_size;
3693 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3695 if (!migration_in_postcopy() &&
3696 remaining_size < max_size) {
3697 qemu_mutex_lock_iothread();
3698 WITH_RCU_READ_LOCK_GUARD() {
3699 migration_bitmap_sync_precopy(rs);
3701 qemu_mutex_unlock_iothread();
3702 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3705 if (migrate_postcopy_ram()) {
3706 /* We can do postcopy, and all the data is postcopiable */
3707 *res_compatible += remaining_size;
3708 } else {
3709 *res_precopy_only += remaining_size;
3713 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3715 unsigned int xh_len;
3716 int xh_flags;
3717 uint8_t *loaded_data;
3719 /* extract RLE header */
3720 xh_flags = qemu_get_byte(f);
3721 xh_len = qemu_get_be16(f);
3723 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3724 error_report("Failed to load XBZRLE page - wrong compression!");
3725 return -1;
3728 if (xh_len > TARGET_PAGE_SIZE) {
3729 error_report("Failed to load XBZRLE page - len overflow!");
3730 return -1;
3732 loaded_data = XBZRLE.decoded_buf;
3733 /* load data and decode */
3734 /* it can change loaded_data to point to an internal buffer */
3735 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3737 /* decode RLE */
3738 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3739 TARGET_PAGE_SIZE) == -1) {
3740 error_report("Failed to load XBZRLE page - decode error!");
3741 return -1;
3744 return 0;
3748 * ram_block_from_stream: read a RAMBlock id from the migration stream
3750 * Must be called from within an RCU critical section.
3752 * Returns a pointer from within the RCU-protected ram_list.
3754 * @mis: the migration incoming state pointer
3755 * @f: QEMUFile where to read the data from
3756 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3757 * @channel: the channel we're using
3759 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3760 QEMUFile *f, int flags,
3761 int channel)
3763 RAMBlock *block = mis->last_recv_block[channel];
3764 char id[256];
3765 uint8_t len;
3767 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3768 if (!block) {
3769 error_report("Ack, bad migration stream!");
3770 return NULL;
3772 return block;
3775 len = qemu_get_byte(f);
3776 qemu_get_buffer(f, (uint8_t *)id, len);
3777 id[len] = 0;
3779 block = qemu_ram_block_by_name(id);
3780 if (!block) {
3781 error_report("Can't find block %s", id);
3782 return NULL;
3785 if (ramblock_is_ignored(block)) {
3786 error_report("block %s should not be migrated !", id);
3787 return NULL;
3790 mis->last_recv_block[channel] = block;
3792 return block;
3795 static inline void *host_from_ram_block_offset(RAMBlock *block,
3796 ram_addr_t offset)
3798 if (!offset_in_ramblock(block, offset)) {
3799 return NULL;
3802 return block->host + offset;
3805 static void *host_page_from_ram_block_offset(RAMBlock *block,
3806 ram_addr_t offset)
3808 /* Note: Explicitly no check against offset_in_ramblock(). */
3809 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3810 block->page_size);
3813 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3814 ram_addr_t offset)
3816 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
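/* Return the address inside the COLO cache corresponding to @offset in
 * @block, optionally recording the page in the dirty bitmap. */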
3819 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3820 ram_addr_t offset, bool record_bitmap)
3822 if (!offset_in_ramblock(block, offset)) {
3823 return NULL;
3825 if (!block->colo_cache) {
3826 error_report("%s: colo_cache is NULL in block :%s",
3827 __func__, block->idstr);
3828 return NULL;
3832 * During a colo checkpoint, we need a bitmap of these migrated pages.
3833 * It helps us decide which pages in the ram cache should be flushed
3834 * into the VM's RAM later.
3836 if (record_bitmap &&
3837 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3838 ram_state->migration_dirty_pages++;
3840 return block->colo_cache + offset;
3844 * ram_handle_compressed: handle the zero page case
3846 * If a page (or a whole RDMA chunk) has been
3847 * determined to be zero, then zap it.
3849 * @host: host address for the zero page
3850 * @ch: what the page is filled with. We only support zero
3851 * @size: size of the zero page
3853 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3855 if (ch != 0 || !buffer_is_zero(host, size)) {
3856 memset(host, ch, size);
3860 /* return the size after decompression, or negative value on error */
3861 static int
3862 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3863 const uint8_t *source, size_t source_len)
3865 int err;
3867 err = inflateReset(stream);
3868 if (err != Z_OK) {
3869 return -1;
3872 stream->avail_in = source_len;
3873 stream->next_in = (uint8_t *)source;
3874 stream->avail_out = dest_len;
3875 stream->next_out = dest;
3877 err = inflate(stream, Z_NO_FLUSH);
3878 if (err != Z_STREAM_END) {
3879 return -1;
3882 return stream->total_out;
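/* Decompression worker thread: wait for work, inflate the compressed buffer
 * into the destination page, then signal completion. */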
3885 static void *do_data_decompress(void *opaque)
3887 DecompressParam *param = opaque;
3888 unsigned long pagesize;
3889 uint8_t *des;
3890 int len, ret;
3892 qemu_mutex_lock(&param->mutex);
3893 while (!param->quit) {
3894 if (param->des) {
3895 des = param->des;
3896 len = param->len;
3897 param->des = 0;
3898 qemu_mutex_unlock(&param->mutex);
3900 pagesize = TARGET_PAGE_SIZE;
3902 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3903 param->compbuf, len);
3904 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3905 error_report("decompress data failed");
3906 qemu_file_set_error(decomp_file, ret);
3909 qemu_mutex_lock(&decomp_done_lock);
3910 param->done = true;
3911 qemu_cond_signal(&decomp_done_cond);
3912 qemu_mutex_unlock(&decomp_done_lock);
3914 qemu_mutex_lock(&param->mutex);
3915 } else {
3916 qemu_cond_wait(&param->cond, &param->mutex);
3919 qemu_mutex_unlock(&param->mutex);
3921 return NULL;
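/* Wait until every decompression thread has finished its current page, then
 * return any error recorded on the decompression file. */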
3924 static int wait_for_decompress_done(void)
3926 int idx, thread_count;
3928 if (!migrate_use_compression()) {
3929 return 0;
3932 thread_count = migrate_decompress_threads();
3933 qemu_mutex_lock(&decomp_done_lock);
3934 for (idx = 0; idx < thread_count; idx++) {
3935 while (!decomp_param[idx].done) {
3936 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3939 qemu_mutex_unlock(&decomp_done_lock);
3940 return qemu_file_get_error(decomp_file);
3943 static void compress_threads_load_cleanup(void)
3945 int i, thread_count;
3947 if (!migrate_use_compression()) {
3948 return;
3950 thread_count = migrate_decompress_threads();
3951 for (i = 0; i < thread_count; i++) {
3953 * we use it as an indicator which shows whether the thread is
3954 * properly init'd or not
3956 if (!decomp_param[i].compbuf) {
3957 break;
3960 qemu_mutex_lock(&decomp_param[i].mutex);
3961 decomp_param[i].quit = true;
3962 qemu_cond_signal(&decomp_param[i].cond);
3963 qemu_mutex_unlock(&decomp_param[i].mutex);
3965 for (i = 0; i < thread_count; i++) {
3966 if (!decomp_param[i].compbuf) {
3967 break;
3970 qemu_thread_join(decompress_threads + i);
3971 qemu_mutex_destroy(&decomp_param[i].mutex);
3972 qemu_cond_destroy(&decomp_param[i].cond);
3973 inflateEnd(&decomp_param[i].stream);
3974 g_free(decomp_param[i].compbuf);
3975 decomp_param[i].compbuf = NULL;
3977 g_free(decompress_threads);
3978 g_free(decomp_param);
3979 decompress_threads = NULL;
3980 decomp_param = NULL;
3981 decomp_file = NULL;
3984 static int compress_threads_load_setup(QEMUFile *f)
3986 int i, thread_count;
3988 if (!migrate_use_compression()) {
3989 return 0;
3992 thread_count = migrate_decompress_threads();
3993 decompress_threads = g_new0(QemuThread, thread_count);
3994 decomp_param = g_new0(DecompressParam, thread_count);
3995 qemu_mutex_init(&decomp_done_lock);
3996 qemu_cond_init(&decomp_done_cond);
3997 decomp_file = f;
3998 for (i = 0; i < thread_count; i++) {
3999 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
4000 goto exit;
4003 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
4004 qemu_mutex_init(&decomp_param[i].mutex);
4005 qemu_cond_init(&decomp_param[i].cond);
4006 decomp_param[i].done = true;
4007 decomp_param[i].quit = false;
4008 qemu_thread_create(decompress_threads + i, "decompress",
4009 do_data_decompress, decomp_param + i,
4010 QEMU_THREAD_JOINABLE);
4012 return 0;
4013 exit:
4014 compress_threads_load_cleanup();
4015 return -1;
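/* Hand the compressed page to an idle decompression thread, waiting for one
 * to become available if all are busy. */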
4018 static void decompress_data_with_multi_threads(QEMUFile *f,
4019 void *host, int len)
4021 int idx, thread_count;
4023 thread_count = migrate_decompress_threads();
4024 QEMU_LOCK_GUARD(&decomp_done_lock);
4025 while (true) {
4026 for (idx = 0; idx < thread_count; idx++) {
4027 if (decomp_param[idx].done) {
4028 decomp_param[idx].done = false;
4029 qemu_mutex_lock(&decomp_param[idx].mutex);
4030 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
4031 decomp_param[idx].des = host;
4032 decomp_param[idx].len = len;
4033 qemu_cond_signal(&decomp_param[idx].cond);
4034 qemu_mutex_unlock(&decomp_param[idx].mutex);
4035 break;
4038 if (idx < thread_count) {
4039 break;
4040 } else {
4041 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
4046 static void colo_init_ram_state(void)
4048 ram_state_init(&ram_state);
4052 * colo cache: this is for the secondary VM; we cache the whole
4053 * memory of the secondary VM. The global lock must be held when
4054 * calling this helper.
4056 int colo_init_ram_cache(void)
4058 RAMBlock *block;
4060 WITH_RCU_READ_LOCK_GUARD() {
4061 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4062 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
4063 NULL, false, false);
4064 if (!block->colo_cache) {
4065 error_report("%s: Can't alloc memory for COLO cache of block %s, "
4066 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
4067 block->used_length);
4068 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4069 if (block->colo_cache) {
4070 qemu_anon_ram_free(block->colo_cache, block->used_length);
4071 block->colo_cache = NULL;
4074 return -errno;
4076 if (!machine_dump_guest_core(current_machine)) {
4077 qemu_madvise(block->colo_cache, block->used_length,
4078 QEMU_MADV_DONTDUMP);
4084 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
4085 * decide which pages in the cache should be flushed into the SVM's RAM.
4086 * Here we use the same name 'ram_bitmap' as for migration.
4088 if (ram_bytes_total()) {
4089 RAMBlock *block;
4091 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4092 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
4093 block->bmap = bitmap_new(pages);
4097 colo_init_ram_state();
4098 return 0;
4101 /* TODO: duplicated with ram_init_bitmaps */
4102 void colo_incoming_start_dirty_log(void)
4104 RAMBlock *block = NULL;
4105 /* For memory_global_dirty_log_start below. */
4106 qemu_mutex_lock_iothread();
4107 qemu_mutex_lock_ramlist();
4109 memory_global_dirty_log_sync();
4110 WITH_RCU_READ_LOCK_GUARD() {
4111 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4112 ramblock_sync_dirty_bitmap(ram_state, block);
4113 /* Discard this dirty bitmap record */
4114 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
4116 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
4118 ram_state->migration_dirty_pages = 0;
4119 qemu_mutex_unlock_ramlist();
4120 qemu_mutex_unlock_iothread();
4123 /* The global lock must be held when calling this helper */
4124 void colo_release_ram_cache(void)
4126 RAMBlock *block;
4128 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
4129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4130 g_free(block->bmap);
4131 block->bmap = NULL;
4134 WITH_RCU_READ_LOCK_GUARD() {
4135 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4136 if (block->colo_cache) {
4137 qemu_anon_ram_free(block->colo_cache, block->used_length);
4138 block->colo_cache = NULL;
4142 ram_state_cleanup(&ram_state);
4146 * ram_load_setup: Set up RAM for the incoming side of migration
4148 * Returns zero to indicate success and negative for error
4150 * @f: QEMUFile where to receive the data
4151 * @opaque: RAMState pointer
4153 static int ram_load_setup(QEMUFile *f, void *opaque)
4155 if (compress_threads_load_setup(f)) {
4156 return -1;
4159 xbzrle_load_setup();
4160 ramblock_recv_map_init();
4162 return 0;
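/*
 * ram_load_cleanup: tear down what ram_load_setup() created.  Write RAM
 * block contents back where needed, clean up xbzrle and decompression
 * state, and free the received-page bitmaps.
 */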
4165 static int ram_load_cleanup(void *opaque)
4167 RAMBlock *rb;
4169 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4170 qemu_ram_block_writeback(rb);
4173 xbzrle_load_cleanup();
4174 compress_threads_load_cleanup();
4176 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4177 g_free(rb->receivedmap);
4178 rb->receivedmap = NULL;
4181 return 0;
4185 * ram_postcopy_incoming_init: allocate postcopy data structures
4187 * Returns 0 for success and negative on error
4189 * @mis: current migration incoming state
4191 * Allocate data structures etc needed by incoming migration with
4192 * postcopy-ram. postcopy-ram's similarly named
4193 * postcopy_ram_incoming_init does the work.
4195 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4197 return postcopy_ram_incoming_init(mis);
4201 * ram_load_postcopy: load a page in postcopy case
4203 * Returns 0 for success or -errno in case of error
4205 * Called in postcopy mode by ram_load().
4206 * rcu_read_lock is taken prior to this being called.
4208 * @f: QEMUFile to receive the data from
4209 * @channel: the channel to use for loading
4211 int ram_load_postcopy(QEMUFile *f, int channel)
4213 int flags = 0, ret = 0;
4214 bool place_needed = false;
4215 bool matches_target_page_size = false;
4216 MigrationIncomingState *mis = migration_incoming_get_current();
4217 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4219 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4220 ram_addr_t addr;
4221 void *page_buffer = NULL;
4222 void *place_source = NULL;
4223 RAMBlock *block = NULL;
4224 uint8_t ch;
4225 int len;
4227 addr = qemu_get_be64(f);
4230 * If there is a qemu file error, we should stop here; "addr"
4231 * may be invalid
4233 ret = qemu_file_get_error(f);
4234 if (ret) {
4235 break;
4238 flags = addr & ~TARGET_PAGE_MASK;
4239 addr &= TARGET_PAGE_MASK;
4241 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4242 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4243 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4244 block = ram_block_from_stream(mis, f, flags, channel);
4245 if (!block) {
4246 ret = -EINVAL;
4247 break;
4251 * Relying on used_length is racy and can result in false positives.
4252 * We might place pages beyond used_length in case RAM was shrunk
4253 * while in postcopy, which is fine - trying to place via
4254 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4256 if (!block->host || addr >= block->postcopy_length) {
4257 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4258 ret = -EINVAL;
4259 break;
4261 tmp_page->target_pages++;
4262 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4264 * Postcopy requires that we place whole host pages atomically;
4265 * these may be huge pages for RAMBlocks that are backed by
4266 * hugetlbfs.
4267 * To make it atomic, the data is read into a temporary page
4268 * that's moved into place later.
4269 * The migration protocol uses, possibly smaller, target pages;
4270 * however, the source ensures it always sends all the components
4271 * of a host page in one chunk.
4273 page_buffer = tmp_page->tmp_huge_page +
4274 host_page_offset_from_ram_block_offset(block, addr);
4275 /* If all target pages are zero then we can optimise the placement */
4276 if (tmp_page->target_pages == 1) {
4277 tmp_page->host_addr =
4278 host_page_from_ram_block_offset(block, addr);
4279 } else if (tmp_page->host_addr !=
4280 host_page_from_ram_block_offset(block, addr)) {
4281 /* Not the first target page within the host page */
4282 error_report("Non-same host page detected on channel %d: "
4283 "Target host page %p, received host page %p "
4284 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4285 channel, tmp_page->host_addr,
4286 host_page_from_ram_block_offset(block, addr),
4287 block->idstr, addr, tmp_page->target_pages);
4288 ret = -EINVAL;
4289 break;
4293 * If it's the last part of a host page then we place the host
4294 * page
4296 if (tmp_page->target_pages ==
4297 (block->page_size / TARGET_PAGE_SIZE)) {
4298 place_needed = true;
4300 place_source = tmp_page->tmp_huge_page;
4303 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4304 case RAM_SAVE_FLAG_ZERO:
4305 ch = qemu_get_byte(f);
4307 * We can skip setting page_buffer when this is a zero page
4308 * and (block->page_size == TARGET_PAGE_SIZE).
4310 if (ch || !matches_target_page_size) {
4311 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4313 if (ch) {
4314 tmp_page->all_zero = false;
4316 break;
4318 case RAM_SAVE_FLAG_PAGE:
4319 tmp_page->all_zero = false;
4320 if (!matches_target_page_size) {
4321 /* For huge pages, we always use temporary buffer */
4322 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4323 } else {
4325 * For small pages that match the target page size, we
4326 * avoid the qemu_file copy. Instead we directly use
4327 * the QEMUFile buffer to place the page. Note: we
4328 * must not do any QEMUFile operation before using that
4329 * buffer, to make sure the buffer is still valid when
4330 * placing the page.
4332 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4333 TARGET_PAGE_SIZE);
4335 break;
4336 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4337 tmp_page->all_zero = false;
4338 len = qemu_get_be32(f);
4339 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4340 error_report("Invalid compressed data length: %d", len);
4341 ret = -EINVAL;
4342 break;
4344 decompress_data_with_multi_threads(f, page_buffer, len);
4345 break;
4347 case RAM_SAVE_FLAG_EOS:
4348 /* normal exit */
4349 multifd_recv_sync_main();
4350 break;
4351 default:
4352 error_report("Unknown combination of migration flags: 0x%x"
4353 " (postcopy mode)", flags);
4354 ret = -EINVAL;
4355 break;
4358 /* Got the whole host page, wait for decompress before placing. */
4359 if (place_needed) {
4360 ret |= wait_for_decompress_done();
4363 /* Check for any possible file errors */
4364 if (!ret && qemu_file_get_error(f)) {
4365 ret = qemu_file_get_error(f);
4368 if (!ret && place_needed) {
4369 if (tmp_page->all_zero) {
4370 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4371 } else {
4372 ret = postcopy_place_page(mis, tmp_page->host_addr,
4373 place_source, block);
4375 place_needed = false;
4376 postcopy_temp_page_reset(tmp_page);
4380 return ret;
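/* True from when the source advises postcopy until postcopy has finished */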
4383 static bool postcopy_is_advised(void)
4385 PostcopyState ps = postcopy_state_get();
4386 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
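/* True from the postcopy listening state until postcopy has finished */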
4389 static bool postcopy_is_running(void)
4391 PostcopyState ps = postcopy_state_get();
4392 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4396 * Flush the contents of the RAM cache into the SVM's memory.
4397 * Only flush pages that have been dirtied by the PVM, the SVM, or both.
4399 void colo_flush_ram_cache(void)
4401 RAMBlock *block = NULL;
4402 void *dst_host;
4403 void *src_host;
4404 unsigned long offset = 0;
4406 memory_global_dirty_log_sync();
4407 WITH_RCU_READ_LOCK_GUARD() {
4408 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4409 ramblock_sync_dirty_bitmap(ram_state, block);
4413 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4414 WITH_RCU_READ_LOCK_GUARD() {
4415 block = QLIST_FIRST_RCU(&ram_list.blocks);
4417 while (block) {
4418 unsigned long num = 0;
4420 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4421 if (!offset_in_ramblock(block,
4422 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4423 offset = 0;
4424 num = 0;
4425 block = QLIST_NEXT_RCU(block, next);
4426 } else {
4427 unsigned long i = 0;
4429 for (i = 0; i < num; i++) {
4430 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4432 dst_host = block->host
4433 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4434 src_host = block->colo_cache
4435 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4436 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4437 offset += num;
4441 trace_colo_flush_ram_cache_end();
4445 * ram_load_precopy: load pages in precopy case
4447 * Returns 0 for success or -errno in case of error
4449 * Called in precopy mode by ram_load().
4450 * rcu_read_lock is taken prior to this being called.
4452 * @f: QEMUFile to receive the data from
4454 static int ram_load_precopy(QEMUFile *f)
4456 MigrationIncomingState *mis = migration_incoming_get_current();
4457 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4458 /* ADVISE is earlier; it shows the source has the postcopy capability enabled */
4459 bool postcopy_advised = postcopy_is_advised();
4460 if (!migrate_use_compression()) {
4461 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4464 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4465 ram_addr_t addr, total_ram_bytes;
4466 void *host = NULL, *host_bak = NULL;
4467 uint8_t ch;
4470 * Yield periodically to let the main loop run, but an iteration of
4471 * the main loop is expensive, so only do it every so many iterations
4473 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4474 aio_co_schedule(qemu_get_current_aio_context(),
4475 qemu_coroutine_self());
4476 qemu_coroutine_yield();
4478 i++;
4480 addr = qemu_get_be64(f);
4481 flags = addr & ~TARGET_PAGE_MASK;
4482 addr &= TARGET_PAGE_MASK;
4484 if (flags & invalid_flags) {
4485 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4486 error_report("Received an unexpected compressed page");
4489 ret = -EINVAL;
4490 break;
4493 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4494 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4495 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4496 RAM_CHANNEL_PRECOPY);
4498 host = host_from_ram_block_offset(block, addr);
4500 * After entering the COLO stage, we should not load pages into the
4501 * SVM's memory directly; we put them into colo_cache first.
4502 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4503 * Previously, we copied all of this memory in the COLO preparation
4504 * stage, during which the VM had to be stopped, a time-consuming process.
4505 * Here we optimize it with a trick: back up every page during the
4506 * migration process while COLO is enabled. Although this affects the
4507 * speed of the migration, it clearly reduces the downtime of backing
4508 * up all of the SVM's memory in the COLO preparation stage.
4510 if (migration_incoming_colo_enabled()) {
4511 if (migration_incoming_in_colo_state()) {
4512 /* In COLO stage, put all pages into cache temporarily */
4513 host = colo_cache_from_block_offset(block, addr, true);
4514 } else {
4516 * In the migration stage but before the COLO stage,
4517 * put all pages into both the cache and the SVM's memory.
4519 host_bak = colo_cache_from_block_offset(block, addr, false);
4522 if (!host) {
4523 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4524 ret = -EINVAL;
4525 break;
4527 if (!migration_incoming_in_colo_state()) {
4528 ramblock_recv_bitmap_set(block, host);
4531 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4534 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4535 case RAM_SAVE_FLAG_MEM_SIZE:
4536 /* Synchronize RAM block list */
4537 total_ram_bytes = addr;
4538 while (!ret && total_ram_bytes) {
4539 RAMBlock *block;
4540 char id[256];
4541 ram_addr_t length;
4543 len = qemu_get_byte(f);
4544 qemu_get_buffer(f, (uint8_t *)id, len);
4545 id[len] = 0;
4546 length = qemu_get_be64(f);
4548 block = qemu_ram_block_by_name(id);
4549 if (block && !qemu_ram_is_migratable(block)) {
4550 error_report("block %s should not be migrated!", id);
4551 ret = -EINVAL;
4552 } else if (block) {
4553 if (length != block->used_length) {
4554 Error *local_err = NULL;
4556 ret = qemu_ram_resize(block, length,
4557 &local_err);
4558 if (local_err) {
4559 error_report_err(local_err);
4562 /* For postcopy we need to check that hugepage sizes match */
4563 if (postcopy_advised && migrate_postcopy_ram() &&
4564 block->page_size != qemu_host_page_size) {
4565 uint64_t remote_page_size = qemu_get_be64(f);
4566 if (remote_page_size != block->page_size) {
4567 error_report("Mismatched RAM page size %s "
4568 "(local) %zd != %" PRId64,
4569 id, block->page_size,
4570 remote_page_size);
4571 ret = -EINVAL;
4574 if (migrate_ignore_shared()) {
4575 hwaddr addr = qemu_get_be64(f);
4576 if (ramblock_is_ignored(block) &&
4577 block->mr->addr != addr) {
4578 error_report("Mismatched GPAs for block %s "
4579 "%" PRId64 "!= %" PRId64,
4580 id, (uint64_t)addr,
4581 (uint64_t)block->mr->addr);
4582 ret = -EINVAL;
4585 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4586 block->idstr);
4587 } else {
4588 error_report("Unknown ramblock \"%s\", cannot "
4589 "accept migration", id);
4590 ret = -EINVAL;
4593 total_ram_bytes -= length;
4595 break;
4597 case RAM_SAVE_FLAG_ZERO:
4598 ch = qemu_get_byte(f);
4599 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4600 break;
4602 case RAM_SAVE_FLAG_PAGE:
4603 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4604 break;
4606 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4607 len = qemu_get_be32(f);
4608 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4609 error_report("Invalid compressed data length: %d", len);
4610 ret = -EINVAL;
4611 break;
4613 decompress_data_with_multi_threads(f, host, len);
4614 break;
4616 case RAM_SAVE_FLAG_XBZRLE:
4617 if (load_xbzrle(f, addr, host) < 0) {
4618 error_report("Failed to decompress XBZRLE page at "
4619 RAM_ADDR_FMT, addr);
4620 ret = -EINVAL;
4621 break;
4623 break;
4624 case RAM_SAVE_FLAG_EOS:
4625 /* normal exit */
4626 multifd_recv_sync_main();
4627 break;
4628 default:
4629 if (flags & RAM_SAVE_FLAG_HOOK) {
4630 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4631 } else {
4632 error_report("Unknown combination of migration flags: 0x%x",
4633 flags);
4634 ret = -EINVAL;
4637 if (!ret) {
4638 ret = qemu_file_get_error(f);
4640 if (!ret && host_bak) {
4641 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4645 ret |= wait_for_decompress_done();
4646 return ret;
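/*
 * ram_load: load_state handler for the "ram" section.  Dispatches to
 * ram_load_postcopy() or ram_load_precopy() under an RCU read lock,
 * depending on whether postcopy is running.
 */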
4649 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4651 int ret = 0;
4652 static uint64_t seq_iter;
4654 * If the system is running in postcopy mode, page insertions into host
4655 * memory must be atomic
4657 bool postcopy_running = postcopy_is_running();
4659 seq_iter++;
4661 if (version_id != 4) {
4662 return -EINVAL;
4666 * This RCU critical section can be very long-running.
4667 * When RCU reclamations in the code start to become numerous,
4668 * it will be necessary to reduce the granularity of this
4669 * critical section.
4671 WITH_RCU_READ_LOCK_GUARD() {
4672 if (postcopy_running) {
4674 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4675 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4676 * service fast page faults.
4678 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4679 } else {
4680 ret = ram_load_precopy(f);
4683 trace_ram_load_complete(ret, seq_iter);
4685 return ret;
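/*
 * Report whether postcopy is usable for RAM; any pmem (nvdimm) backed
 * RAM block disables it.
 */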
4688 static bool ram_has_postcopy(void *opaque)
4690 RAMBlock *rb;
4691 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4692 if (ramblock_is_pmem(rb)) {
4693 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
4694 "is not supported now!", rb->idstr, rb->host);
4695 return false;
4699 return migrate_postcopy_ram();
4702 /* Sync all the dirty bitmaps with the destination VM. */
4703 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4705 RAMBlock *block;
4706 QEMUFile *file = s->to_dst_file;
4707 int ramblock_count = 0;
4709 trace_ram_dirty_bitmap_sync_start();
4711 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4712 qemu_savevm_send_recv_bitmap(file, block->idstr);
4713 trace_ram_dirty_bitmap_request(block->idstr);
4714 ramblock_count++;
4717 trace_ram_dirty_bitmap_sync_wait();
4719 /* Wait until all the ramblocks' dirty bitmaps are synced */
4720 while (ramblock_count--) {
4721 qemu_sem_wait(&s->rp_state.rp_sem);
4724 trace_ram_dirty_bitmap_sync_complete();
4726 return 0;
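/* Wake ram_dirty_bitmap_sync_all(); posted once per reloaded ramblock */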
4729 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4731 qemu_sem_post(&s->rp_state.rp_sem);
4735 * Read the received bitmap and invert it to form the initial dirty bitmap.
4736 * This is only used when a postcopy migration is paused and we want to
4737 * resume from a middle point.
4739 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4741 int ret = -EINVAL;
4742 /* from_dst_file is always valid because we're within rp_thread */
4743 QEMUFile *file = s->rp_state.from_dst_file;
4744 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4745 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4746 uint64_t size, end_mark;
4748 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4750 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4751 error_report("%s: incorrect state %s", __func__,
4752 MigrationStatus_str(s->state));
4753 return -EINVAL;
4757 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4758 * need the endianness conversion and the padding.
4760 local_size = ROUND_UP(local_size, 8);
4762 /* Add padding */
4763 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4765 size = qemu_get_be64(file);
4767 /* The size of the bitmap should match our ramblock */
4768 if (size != local_size) {
4769 error_report("%s: ramblock '%s' bitmap size mismatch "
4770 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4771 block->idstr, size, local_size);
4772 ret = -EINVAL;
4773 goto out;
4776 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4777 end_mark = qemu_get_be64(file);
4779 ret = qemu_file_get_error(file);
4780 if (ret || size != local_size) {
4781 error_report("%s: read bitmap failed for ramblock '%s': %d"
4782 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4783 __func__, block->idstr, ret, local_size, size);
4784 ret = -EIO;
4785 goto out;
4788 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4789 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4790 __func__, block->idstr, end_mark);
4791 ret = -EINVAL;
4792 goto out;
4796 * Endianness conversion. We are during postcopy (though paused).
4797 * The dirty bitmap won't change. We can directly modify it.
4799 bitmap_from_le(block->bmap, le_bitmap, nbits);
4802 * What we received is the "received bitmap". Invert it to form the
4803 * initial dirty bitmap for this ramblock.
4805 bitmap_complement(block->bmap, block->bmap, nbits);
4807 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4808 ramblock_dirty_bitmap_clear_discarded_pages(block);
4810 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4811 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4814 * We succeeded in syncing the bitmap for the current ramblock. If this
4815 * is the last one to sync, we need to notify the main send thread.
4817 ram_dirty_bitmap_reload_notify(s);
4819 ret = 0;
4820 out:
4821 g_free(le_bitmap);
4822 return ret;
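/*
 * Resume-prepare handler: re-sync the dirty bitmaps with the destination
 * and prepare RAMState on to_dst_file before a paused postcopy migration
 * is resumed.
 */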
4825 static int ram_resume_prepare(MigrationState *s, void *opaque)
4827 RAMState *rs = *(RAMState **)opaque;
4828 int ret;
4830 ret = ram_dirty_bitmap_sync_all(s, rs);
4831 if (ret) {
4832 return ret;
4835 ram_state_resume_prepare(rs, s->to_dst_file);
4837 return 0;
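/* Send an EOS marker down the postcopy preempt channel and flush it */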
4840 void postcopy_preempt_shutdown_file(MigrationState *s)
4842 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4843 qemu_fflush(s->postcopy_qemufile_src);
4846 static SaveVMHandlers savevm_ram_handlers = {
4847 .save_setup = ram_save_setup,
4848 .save_live_iterate = ram_save_iterate,
4849 .save_live_complete_postcopy = ram_save_complete,
4850 .save_live_complete_precopy = ram_save_complete,
4851 .has_postcopy = ram_has_postcopy,
4852 .save_live_pending = ram_save_pending,
4853 .load_state = ram_load,
4854 .save_cleanup = ram_save_cleanup,
4855 .load_setup = ram_load_setup,
4856 .load_cleanup = ram_load_cleanup,
4857 .resume_prepare = ram_resume_prepare,
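/*
 * RAM block resize notifier: a resize during an active precopy migration
 * cancels the migration; on the destination it keeps the postcopy
 * bookkeeping (postcopy_length, discarded ranges) consistent.
 */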
4860 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4861 size_t old_size, size_t new_size)
4863 PostcopyState ps = postcopy_state_get();
4864 ram_addr_t offset;
4865 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4866 Error *err = NULL;
4868 if (ramblock_is_ignored(rb)) {
4869 return;
4872 if (!migration_is_idle()) {
4874 * Precopy code on the source cannot deal with the size of RAM blocks
4875 * changing at random points in time; in particular, after the RAM block
4876 * sizes have been sent in the migration stream they must no longer change.
4877 * Abort and indicate a proper reason.
4879 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4880 migration_cancel(err);
4881 error_free(err);
4884 switch (ps) {
4885 case POSTCOPY_INCOMING_ADVISE:
4887 * Update what ram_postcopy_incoming_init()->init_range() does at the
4888 * time postcopy was advised. Syncing RAM blocks with the source will
4889 * result in RAM resizes.
4891 if (old_size < new_size) {
4892 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4893 error_report("RAM block '%s' discard of resized RAM failed",
4894 rb->idstr);
4897 rb->postcopy_length = new_size;
4898 break;
4899 case POSTCOPY_INCOMING_NONE:
4900 case POSTCOPY_INCOMING_RUNNING:
4901 case POSTCOPY_INCOMING_END:
4903 * Once our guest is running, postcopy no longer cares about
4904 * resizes. When growing, the new memory was not available on the
4905 * source, so no handler is needed.
4907 break;
4908 default:
4909 error_report("RAM block '%s' resized during postcopy state: %d",
4910 rb->idstr, ps);
4911 exit(-1);
4915 static RAMBlockNotifier ram_mig_ram_notifier = {
4916 .ram_block_resized = ram_mig_ram_block_resized,
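/*
 * Initialise the XBZRLE lock and register both the "ram" savevm handlers
 * and the RAM block resize notifier.
 */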
4919 void ram_mig_init(void)
4921 qemu_mutex_init(&XBZRLE.lock);
4922 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4923 ram_block_notifier_add(&ram_mig_ram_notifier);