migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram-compress.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration-stats.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-types-migration.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/cpu-throttle.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59 #include "multifd.h"
  60 #include "sysemu/runstate.h"
  61 #include "options.h"
  62
  63 #include "hw/boards.h" /* for machine_dump_guest_core() */
  64
  65 #if defined(__linux__)
  66 #include "qemu/userfaultfd.h"
  67 #endif /* defined(__linux__) */
  68
  69 /***********************************************************/
  70 /* ram save/restore */
  71
  72 /*
  73  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  74  * worked for pages that were filled with the same char.  We switched
  75  * it to only search for the zero value.  And to avoid confusion with
  76  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  77  */
  78 /*
  79  * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
  80  */
/*
 * Flag bits carried in the 64-bit page header word.  save_page_header()
 * receives them OR-ed into the low bits of the page offset.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Set by save_page_header() when the page is in the last sent block */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Set by save_xbzrle_page() on pages sent XBZRLE encoded */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/*
 * We can't use any flag that is bigger than 0x200.
 * NOTE(review): presumably because the flags share the header word with a
 * page-aligned offset, so they must fit below the smallest supported
 * target page size -- confirm.
 */
  92
/*
 * XBZRLE statistics.  save_xbzrle_page() updates the cache_miss, pages,
 * overflow and bytes members.
 */
XBZRLECacheStats xbzrle_counters;
  94
  95 /* used by the search for pages to send */
  96 struct PageSearchStatus {
  97     /* The migration channel used for a specific host page */
  98     QEMUFile    *pss_channel;
  99     /* Last block from where we have sent data */
 100     RAMBlock *last_sent_block;
 101     /* Current block being searched */
 102     RAMBlock    *block;
 103     /* Current page to search from */
 104     unsigned long page;
 105     /* Set once we wrap around */
 106     bool         complete_round;
 107     /* Whether we're sending a host page */
 108     bool          host_page_sending;
 109     /* The start/end of current host page.  Invalid if host_page_sending==false */
 110     unsigned long host_page_start;
 111     unsigned long host_page_end;
 112 };
 113 typedef struct PageSearchStatus PageSearchStatus;
 114
/*
 * File-local XBZRLE state: the page cache plus the scratch buffers used
 * while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /*
     * buffer for storing page content; save_xbzrle_page() copies the
     * page here before encoding it
     */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock() / XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 130
 131 static void XBZRLE_cache_lock(void)
 132 {
 133     if (migrate_xbzrle()) {
 134         qemu_mutex_lock(&XBZRLE.lock);
 135     }
 136 }
 137
 138 static void XBZRLE_cache_unlock(void)
 139 {
 140     if (migrate_xbzrle()) {
 141         qemu_mutex_unlock(&XBZRLE.lock);
 142     }
 143 }
 144
 145 /**
 146  * xbzrle_cache_resize: resize the xbzrle cache
 147  *
 148  * This function is called from migrate_params_apply in main
 149  * thread, possibly while a migration is in progress.  A running
 150  * migration may be using the cache and might finish during this call,
 151  * hence changes to the cache are protected by XBZRLE.lock().
 152  *
 153  * Returns 0 for success or -1 for error
 154  *
 155  * @new_size: new cache size
 156  * @errp: set *errp if the check failed, with reason
 157  */
 158 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 159 {
 160     PageCache *new_cache;
 161     int64_t ret = 0;
 162
 163     /* Check for truncation */
 164     if (new_size != (size_t)new_size) {
 165         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 166                    "exceeding address space");
 167         return -1;
 168     }
 169
 170     if (new_size == migrate_xbzrle_cache_size()) {
 171         /* nothing to do */
 172         return 0;
 173     }
 174
 175     XBZRLE_cache_lock();
 176
 177     if (XBZRLE.cache != NULL) {
 178         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 179         if (!new_cache) {
 180             ret = -1;
 181             goto out;
 182         }
 183
 184         cache_fini(XBZRLE.cache);
 185         XBZRLE.cache = new_cache;
 186     }
 187 out:
 188     XBZRLE_cache_unlock();
 189     return ret;
 190 }
 191
 192 static bool postcopy_preempt_active(void)
 193 {
 194     return migrate_postcopy_preempt() && migration_in_postcopy();
 195 }
 196
 197 bool ramblock_is_ignored(RAMBlock *block)
 198 {
 199     return !qemu_ram_is_migratable(block) ||
 200            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 201 }
 202
/*
 * NOTE(review): RAMBLOCK_FOREACH is defined outside this file.  The code
 * below iterates with RAMBLOCK_FOREACH_NOT_IGNORED, so this #undef
 * presumably guards against accidental use of the unfiltered iterator
 * in migration code -- confirm against the header that defines it.
 */
#undef RAMBLOCK_FOREACH
 204
 205 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 206 {
 207     RAMBlock *block;
 208     int ret = 0;
 209
 210     RCU_READ_LOCK_GUARD();
 211
 212     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 213         ret = func(block, opaque);
 214         if (ret) {
 215             break;
 216         }
 217     }
 218     return ret;
 219 }
 220
 221 static void ramblock_recv_map_init(void)
 222 {
 223     RAMBlock *rb;
 224
 225     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 226         assert(!rb->receivedmap);
 227         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 228     }
 229 }
 230
 231 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 232 {
 233     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 234                     rb->receivedmap);
 235 }
 236
 237 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 238 {
 239     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 240 }
 241
 242 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 243 {
 244     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 245 }
 246
 247 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 248                                     size_t nr)
 249 {
 250     bitmap_set_atomic(rb->receivedmap,
 251                       ramblock_recv_bitmap_offset(host_addr, rb),
 252                       nr);
 253 }
 254
 255 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 256
 257 /*
 258  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 259  *
 260  * Returns >0 if success with sent bytes, or <0 if error.
 261  */
 262 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 263                                   const char *block_name)
 264 {
 265     RAMBlock *block = qemu_ram_block_by_name(block_name);
 266     unsigned long *le_bitmap, nbits;
 267     uint64_t size;
 268
 269     if (!block) {
 270         error_report("%s: invalid block name: %s", __func__, block_name);
 271         return -1;
 272     }
 273
 274     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 275
 276     /*
 277      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 278      * machines we may need 4 more bytes for padding (see below
 279      * comment). So extend it a bit before hand.
 280      */
 281     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 282
 283     /*
 284      * Always use little endian when sending the bitmap. This is
 285      * required that when source and destination VMs are not using the
 286      * same endianness. (Note: big endian won't work.)
 287      */
 288     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 289
 290     /* Size of the bitmap, in bytes */
 291     size = DIV_ROUND_UP(nbits, 8);
 292
 293     /*
 294      * size is always aligned to 8 bytes for 64bit machines, but it
 295      * may not be true for 32bit machines. We need this padding to
 296      * make sure the migration can survive even between 32bit and
 297      * 64bit machines.
 298      */
 299     size = ROUND_UP(size, 8);
 300
 301     qemu_put_be64(file, size);
 302     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 303     /*
 304      * Mark as an end, in case the middle part is screwed up due to
 305      * some "mysterious" reason.
 306      */
 307     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 308     qemu_fflush(file);
 309
 310     g_free(le_bitmap);
 311
 312     if (qemu_file_get_error(file)) {
 313         return qemu_file_get_error(file);
 314     }
 315
 316     return size + sizeof(size);
 317 }
 318
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* Block the requested range belongs to */
    RAMBlock *rb;
    /* NOTE(review): presumably the byte offset of the range within rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the range in bytes -- confirm */
    hwaddr    len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 330
 331 /* State of RAM for migration */
 332 struct RAMState {
 333     /*
 334      * PageSearchStatus structures for the channels when send pages.
 335      * Protected by the bitmap_mutex.
 336      */
 337     PageSearchStatus pss[RAM_CHANNEL_MAX];
 338     /* UFFD file descriptor, used in 'write-tracking' migration */
 339     int uffdio_fd;
 340     /* total ram size in bytes */
 341     uint64_t ram_bytes_total;
 342     /* Last block that we have visited searching for dirty pages */
 343     RAMBlock *last_seen_block;
 344     /* Last dirty target page we have sent */
 345     ram_addr_t last_page;
 346     /* last ram version we have seen */
 347     uint32_t last_version;
 348     /* How many times we have dirty too many pages */
 349     int dirty_rate_high_cnt;
 350     /* these variables are used for bitmap sync */
 351     /* last time we did a full bitmap_sync */
 352     int64_t time_last_bitmap_sync;
 353     /* bytes transferred at start_time */
 354     uint64_t bytes_xfer_prev;
 355     /* number of dirty pages since start_time */
 356     uint64_t num_dirty_pages_period;
 357     /* xbzrle misses since the beginning of the period */
 358     uint64_t xbzrle_cache_miss_prev;
 359     /* Amount of xbzrle pages since the beginning of the period */
 360     uint64_t xbzrle_pages_prev;
 361     /* Amount of xbzrle encoded bytes since the beginning of the period */
 362     uint64_t xbzrle_bytes_prev;
 363     /* Are we really using XBZRLE (e.g., after the first round). */
 364     bool xbzrle_started;
 365     /* Are we on the last stage of migration */
 366     bool last_stage;
 367     /* compression statistics since the beginning of the period */
 368     /* amount of count that no free thread to compress data */
 369     uint64_t compress_thread_busy_prev;
 370     /* amount bytes after compression */
 371     uint64_t compressed_size_prev;
 372     /* amount of compressed pages */
 373     uint64_t compress_pages_prev;
 374
 375     /* total handled target pages at the beginning of period */
 376     uint64_t target_page_count_prev;
 377     /* total handled target pages since start */
 378     uint64_t target_page_count;
 379     /* number of dirty bits in the bitmap */
 380     uint64_t migration_dirty_pages;
 381     /*
 382      * Protects:
 383      * - dirty/clear bitmap
 384      * - migration_dirty_pages
 385      * - pss structures
 386      */
 387     QemuMutex bitmap_mutex;
 388     /* The RAMBlock used in the last src_page_requests */
 389     RAMBlock *last_req_rb;
 390     /* Queue of outstanding page requests from the destination */
 391     QemuMutex src_page_req_mutex;
 392     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 393 };
 394 typedef struct RAMState RAMState;
 395
/*
 * The RAM migration state.  May be NULL: ram_bytes_remaining() checks
 * for that before dereferencing it.
 */
static RAMState *ram_state;

/* Notifiers registered through precopy_add_notifier() */
static NotifierWithReturnList precopy_notifier_list;
 399
 400 /* Whether postcopy has queued requests? */
 401 static bool postcopy_has_request(RAMState *rs)
 402 {
 403     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 404 }
 405
/* Initialise the list that holds the precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 410
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 415
/* Unregister @n by passing it to notifier_with_return_remove(). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 420
 421 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 422 {
 423     PrecopyNotifyData pnd;
 424     pnd.reason = reason;
 425     pnd.errp = errp;
 426
 427     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 428 }
 429
 430 uint64_t ram_bytes_remaining(void)
 431 {
 432     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 433                        0;
 434 }
 435
 436 void ram_transferred_add(uint64_t bytes)
 437 {
 438     if (runstate_is_running()) {
 439         stat64_add(&mig_stats.precopy_bytes, bytes);
 440     } else if (migration_in_postcopy()) {
 441         stat64_add(&mig_stats.postcopy_bytes, bytes);
 442     } else {
 443         stat64_add(&mig_stats.downtime_bytes, bytes);
 444     }
 445     stat64_add(&mig_stats.transferred, bytes);
 446 }
 447
 448 struct MigrationOps {
 449     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
 450 };
 451 typedef struct MigrationOps MigrationOps;
 452
 453 MigrationOps *migration_ops;
 454
/* Forward declaration; the definition is not in this part of the file. */
static int ram_save_host_page_urgent(PageSearchStatus *pss);
 456
 457 /* NOTE: page is the PFN not real ram_addr_t. */
 458 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 459 {
 460     pss->block = rb;
 461     pss->page = page;
 462     pss->complete_round = false;
 463 }
 464
 465 /*
 466  * Check whether two PSSs are actively sending the same page.  Return true
 467  * if it is, false otherwise.
 468  */
 469 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 470 {
 471     return pss1->host_page_sending && pss2->host_page_sending &&
 472         (pss1->host_page_start == pss2->host_page_start);
 473 }
 474
 475 /**
 476  * save_page_header: write page header to wire
 477  *
 478  * If this is the 1st block, it also writes the block identification
 479  *
 480  * Returns the number of bytes written
 481  *
 482  * @pss: current PSS channel status
 483  * @block: block that contains the page we want to send
 484  * @offset: offset inside the block for the page
 485  *          in the lower bits, it contains flags
 486  */
 487 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
 488                                RAMBlock *block, ram_addr_t offset)
 489 {
 490     size_t size, len;
 491     bool same_block = (block == pss->last_sent_block);
 492
 493     if (same_block) {
 494         offset |= RAM_SAVE_FLAG_CONTINUE;
 495     }
 496     qemu_put_be64(f, offset);
 497     size = 8;
 498
 499     if (!same_block) {
 500         len = strlen(block->idstr);
 501         qemu_put_byte(f, len);
 502         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 503         size += 1 + len;
 504         pss->last_sent_block = block;
 505     }
 506     return size;
 507 }
 508
 509 /**
 510  * mig_throttle_guest_down: throttle down the guest
 511  *
 512  * Reduce amount of guest cpu execution to hopefully slow down memory
 513  * writes. If guest dirty memory rate is reduced below the rate at
 514  * which we can transfer pages to the destination then we should be
 515  * able to complete migration. Some workloads dirty memory way too
 516  * fast and will not effectively converge, even with auto-converge.
 517  */
 518 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 519                                     uint64_t bytes_dirty_threshold)
 520 {
 521     uint64_t pct_initial = migrate_cpu_throttle_initial();
 522     uint64_t pct_increment = migrate_cpu_throttle_increment();
 523     bool pct_tailslow = migrate_cpu_throttle_tailslow();
 524     int pct_max = migrate_max_cpu_throttle();
 525
 526     uint64_t throttle_now = cpu_throttle_get_percentage();
 527     uint64_t cpu_now, cpu_ideal, throttle_inc;
 528
 529     /* We have not started throttling yet. Let's start it. */
 530     if (!cpu_throttle_active()) {
 531         cpu_throttle_set(pct_initial);
 532     } else {
 533         /* Throttling already on, just increase the rate */
 534         if (!pct_tailslow) {
 535             throttle_inc = pct_increment;
 536         } else {
 537             /* Compute the ideal CPU percentage used by Guest, which may
 538              * make the dirty rate match the dirty rate threshold. */
 539             cpu_now = 100 - throttle_now;
 540             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 541                         bytes_dirty_period);
 542             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 543         }
 544         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 545     }
 546 }
 547
 548 void mig_throttle_counter_reset(void)
 549 {
 550     RAMState *rs = ram_state;
 551
 552     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 553     rs->num_dirty_pages_period = 0;
 554     rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
 555 }
 556
 557 /**
 558  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 559  *
 560  * @rs: current RAM state
 561  * @current_addr: address for the zero page
 562  *
 563  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 564  * The important thing is that a stale (not-yet-0'd) page be replaced
 565  * by the new data.
 566  * As a bonus, if the page wasn't in the cache it gets added so that
 567  * when a small write is made into the 0'd page it gets XBZRLE sent.
 568  */
 569 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 570 {
 571     /* We don't care if this fails to allocate a new cache page
 572      * as long as it updated an old one */
 573     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 574                  stat64_get(&mig_stats.dirty_sync_count));
 575 }
 576
 577 #define ENCODING_FLAG_XBZRLE 0x1
 578
 579 /**
 580  * save_xbzrle_page: compress and send current page
 581  *
 582  * Returns: 1 means that we wrote the page
 583  *          0 means that page is identical to the one already sent
 584  *          -1 means that xbzrle would be longer than normal
 585  *
 586  * @rs: current RAM state
 587  * @pss: current PSS channel
 588  * @current_data: pointer to the address of the page contents
 589  * @current_addr: addr of the page
 590  * @block: block that contains the page we want to send
 591  * @offset: offset inside the block for the page
 592  */
 593 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 594                             uint8_t **current_data, ram_addr_t current_addr,
 595                             RAMBlock *block, ram_addr_t offset)
 596 {
 597     int encoded_len = 0, bytes_xbzrle;
 598     uint8_t *prev_cached_page;
 599     QEMUFile *file = pss->pss_channel;
 600     uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
 601
 602     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
 603         xbzrle_counters.cache_miss++;
 604         if (!rs->last_stage) {
 605             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 606                              generation) == -1) {
 607                 return -1;
 608             } else {
 609                 /* update *current_data when the page has been
 610                    inserted into cache */
 611                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 612             }
 613         }
 614         return -1;
 615     }
 616
 617     /*
 618      * Reaching here means the page has hit the xbzrle cache, no matter what
 619      * encoding result it is (normal encoding, overflow or skipping the page),
 620      * count the page as encoded. This is used to calculate the encoding rate.
 621      *
 622      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 623      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 624      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 625      * skipped page included. In this way, the encoding rate can tell if the
 626      * guest page is good for xbzrle encoding.
 627      */
 628     xbzrle_counters.pages++;
 629     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 630
 631     /* save current buffer into memory */
 632     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 633
 634     /* XBZRLE encoding (if there is no overflow) */
 635     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 636                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 637                                        TARGET_PAGE_SIZE);
 638
 639     /*
 640      * Update the cache contents, so that it corresponds to the data
 641      * sent, in all cases except where we skip the page.
 642      */
 643     if (!rs->last_stage && encoded_len != 0) {
 644         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 645         /*
 646          * In the case where we couldn't compress, ensure that the caller
 647          * sends the data from the cache, since the guest might have
 648          * changed the RAM since we copied it.
 649          */
 650         *current_data = prev_cached_page;
 651     }
 652
 653     if (encoded_len == 0) {
 654         trace_save_xbzrle_page_skipping();
 655         return 0;
 656     } else if (encoded_len == -1) {
 657         trace_save_xbzrle_page_overflow();
 658         xbzrle_counters.overflow++;
 659         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 660         return -1;
 661     }
 662
 663     /* Send XBZRLE based compressed page */
 664     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
 665                                     offset | RAM_SAVE_FLAG_XBZRLE);
 666     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 667     qemu_put_be16(file, encoded_len);
 668     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 669     bytes_xbzrle += encoded_len + 1 + 2;
 670     /*
 671      * Like compressed_size (please see update_compress_thread_counts),
 672      * the xbzrle encoded bytes don't count the 8 byte header with
 673      * RAM_SAVE_FLAG_CONTINUE.
 674      */
 675     xbzrle_counters.bytes += bytes_xbzrle - 8;
 676     ram_transferred_add(bytes_xbzrle);
 677
 678     return 1;
 679 }
 680
 681 /**
 682  * pss_find_next_dirty: find the next dirty page of current ramblock
 683  *
 684  * This function updates pss->page to point to the next dirty page index
 685  * within the ramblock to migrate, or the end of ramblock when nothing
 686  * found.  Note that when pss->host_page_sending==true it means we're
 687  * during sending a host page, so we won't look for dirty page that is
 688  * outside the host page boundary.
 689  *
 690  * @pss: the current page search status
 691  */
 692 static void pss_find_next_dirty(PageSearchStatus *pss)
 693 {
 694     RAMBlock *rb = pss->block;
 695     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 696     unsigned long *bitmap = rb->bmap;
 697
 698     if (ramblock_is_ignored(rb)) {
 699         /* Points directly to the end, so we know no dirty page */
 700         pss->page = size;
 701         return;
 702     }
 703
 704     /*
 705      * If during sending a host page, only look for dirty pages within the
 706      * current host page being send.
 707      */
 708     if (pss->host_page_sending) {
 709         assert(pss->host_page_end);
 710         size = MIN(size, pss->host_page_end);
 711     }
 712
 713     pss->page = find_next_bit(bitmap, size, pss->page);
 714 }
 715
 716 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 717                                                        unsigned long page)
 718 {
 719     uint8_t shift;
 720     hwaddr size, start;
 721
 722     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 723         return;
 724     }
 725
 726     shift = rb->clear_bmap_shift;
 727     /*
 728      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 729      * can make things easier sometimes since then start address
 730      * of the small chunk will always be 64 pages aligned so the
 731      * bitmap will always be aligned to unsigned long. We should
 732      * even be able to remove this restriction but I'm simply
 733      * keeping it.
 734      */
 735     assert(shift >= 6);
 736
 737     size = 1ULL << (TARGET_PAGE_BITS + shift);
 738     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 739     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 740     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 741 }
 742
 743 static void
 744 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 745                                                  unsigned long start,
 746                                                  unsigned long npages)
 747 {
 748     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 749     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 750     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 751
 752     /*
 753      * Clear pages from start to start + npages - 1, so the end boundary is
 754      * exclusive.
 755      */
 756     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 757         migration_clear_memory_region_dirty_bitmap(rb, i);
 758     }
 759 }
 760
 761 /*
 762  * colo_bitmap_find_diry:find contiguous dirty pages from start
 763  *
 764  * Returns the page offset within memory region of the start of the contiguout
 765  * dirty page
 766  *
 767  * @rs: current RAM state
 768  * @rb: RAMBlock where to search for dirty pages
 769  * @start: page where we start the search
 770  * @num: the number of contiguous dirty pages
 771  */
 772 static inline
 773 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 774                                      unsigned long start, unsigned long *num)
 775 {
 776     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 777     unsigned long *bitmap = rb->bmap;
 778     unsigned long first, next;
 779
 780     *num = 0;
 781
 782     if (ramblock_is_ignored(rb)) {
 783         return size;
 784     }
 785
 786     first = find_next_bit(bitmap, size, start);
 787     if (first >= size) {
 788         return first;
 789     }
 790     next = find_next_zero_bit(bitmap, size, first + 1);
 791     assert(next >= first);
 792     *num = next - first;
 793     return first;
 794 }
 795
 796 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 797                                                 RAMBlock *rb,
 798                                                 unsigned long page)
 799 {
 800     bool ret;
 801
 802     /*
 803      * Clear dirty bitmap if needed.  This _must_ be called before we
 804      * send any of the page in the chunk because we need to make sure
 805      * we can capture further page content changes when we sync dirty
 806      * log the next time.  So as long as we are going to send any of
 807      * the page in the chunk we clear the remote dirty bitmap for all.
 808      * Clearing it earlier won't be a problem, but too late will.
 809      */
 810     migration_clear_memory_region_dirty_bitmap(rb, page);
 811
 812     ret = test_and_clear_bit(page, rb->bmap);
 813     if (ret) {
 814         rs->migration_dirty_pages--;
 815     }
 816
 817     return ret;
 818 }
 819
 820 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 821                                        void *opaque)
 822 {
 823     const hwaddr offset = section->offset_within_region;
 824     const hwaddr size = int128_get64(section->size);
 825     const unsigned long start = offset >> TARGET_PAGE_BITS;
 826     const unsigned long npages = size >> TARGET_PAGE_BITS;
 827     RAMBlock *rb = section->mr->ram_block;
 828     uint64_t *cleared_bits = opaque;
 829
 830     /*
 831      * We don't grab ram_state->bitmap_mutex because we expect to run
 832      * only when starting migration or during postcopy recovery where
 833      * we don't have concurrent access.
 834      */
 835     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 836         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 837     }
 838     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 839     bitmap_clear(rb->bmap, start, npages);
 840 }
 841
 842 /*
 843  * Exclude all dirty pages from migration that fall into a discarded range as
 844  * managed by a RamDiscardManager responsible for the mapped memory region of
 845  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 846  *
 847  * Discarded pages ("logically unplugged") have undefined content and must
 848  * not get migrated, because even reading these pages for migration might
 849  * result in undesired behavior.
 850  *
 851  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 852  *
 853  * Note: The result is only stable while migrating (precopy/postcopy).
 854  */
 855 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 856 {
 857     uint64_t cleared_bits = 0;
 858
 859     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 860         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 861         MemoryRegionSection section = {
 862             .mr = rb->mr,
 863             .offset_within_region = 0,
 864             .size = int128_make64(qemu_ram_get_used_length(rb)),
 865         };
 866
 867         ram_discard_manager_replay_discarded(rdm, &section,
 868                                              dirty_bitmap_clear_section,
 869                                              &cleared_bits);
 870     }
 871     return cleared_bits;
 872 }
 873
 874 /*
 875  * Check if a host-page aligned page falls into a discarded range as managed by
 876  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 877  *
 878  * Note: The result is only stable while migrating (precopy/postcopy).
 879  */
 880 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 881 {
 882     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 883         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 884         MemoryRegionSection section = {
 885             .mr = rb->mr,
 886             .offset_within_region = start,
 887             .size = int128_make64(qemu_ram_pagesize(rb)),
 888         };
 889
 890         return !ram_discard_manager_is_populated(rdm, &section);
 891     }
 892     return false;
 893 }
 894
 895 /* Called with RCU critical section */
 896 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 897 {
 898     uint64_t new_dirty_pages =
 899         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 900
 901     rs->migration_dirty_pages += new_dirty_pages;
 902     rs->num_dirty_pages_period += new_dirty_pages;
 903 }
 904
 905 /**
 906  * ram_pagesize_summary: calculate all the pagesizes of a VM
 907  *
 908  * Returns a summary bitmap of the page sizes of all RAMBlocks
 909  *
 910  * For VMs with just normal pages this is equivalent to the host page
 911  * size. If it's got some huge pages then it's the OR of all the
 912  * different page sizes.
 913  */
 914 uint64_t ram_pagesize_summary(void)
 915 {
 916     RAMBlock *block;
 917     uint64_t summary = 0;
 918
 919     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 920         summary |= block->page_size;
 921     }
 922
 923     return summary;
 924 }
 925
 926 uint64_t ram_get_total_transferred_pages(void)
 927 {
 928     return stat64_get(&mig_stats.normal_pages) +
 929         stat64_get(&mig_stats.zero_pages) +
 930         compression_counters.pages + xbzrle_counters.pages;
 931 }
 932
 933 static void migration_update_rates(RAMState *rs, int64_t end_time)
 934 {
 935     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 936     double compressed_size;
 937
 938     /* calculate period counters */
 939     stat64_set(&mig_stats.dirty_pages_rate,
 940                rs->num_dirty_pages_period * 1000 /
 941                (end_time - rs->time_last_bitmap_sync));
 942
 943     if (!page_count) {
 944         return;
 945     }
 946
 947     if (migrate_xbzrle()) {
 948         double encoded_size, unencoded_size;
 949
 950         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 951             rs->xbzrle_cache_miss_prev) / page_count;
 952         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 953         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 954                          TARGET_PAGE_SIZE;
 955         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 956         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 957             xbzrle_counters.encoding_rate = 0;
 958         } else {
 959             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 960         }
 961         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 962         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 963     }
 964
 965     if (migrate_compress()) {
 966         compression_counters.busy_rate = (double)(compression_counters.busy -
 967             rs->compress_thread_busy_prev) / page_count;
 968         rs->compress_thread_busy_prev = compression_counters.busy;
 969
 970         compressed_size = compression_counters.compressed_size -
 971                           rs->compressed_size_prev;
 972         if (compressed_size) {
 973             double uncompressed_size = (compression_counters.pages -
 974                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 975
 976             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 977             compression_counters.compression_rate =
 978                                         uncompressed_size / compressed_size;
 979
 980             rs->compress_pages_prev = compression_counters.pages;
 981             rs->compressed_size_prev = compression_counters.compressed_size;
 982         }
 983     }
 984 }
 985
 986 static void migration_trigger_throttle(RAMState *rs)
 987 {
 988     uint64_t threshold = migrate_throttle_trigger_threshold();
 989     uint64_t bytes_xfer_period =
 990         stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
 991     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 992     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 993
 994     /* During block migration the auto-converge logic incorrectly detects
 995      * that ram migration makes no progress. Avoid this by disabling the
 996      * throttling logic during the bulk phase of block migration. */
 997     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 998         /* The following detection logic can be refined later. For now:
 999            Check to see if the ratio between dirtied bytes and the approx.
1000            amount of bytes that just got transferred since the last time
1001            we were in this routine reaches the threshold. If that happens
1002            twice, start or increase throttling. */
1003
1004         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1005             (++rs->dirty_rate_high_cnt >= 2)) {
1006             trace_migration_throttle();
1007             rs->dirty_rate_high_cnt = 0;
1008             mig_throttle_guest_down(bytes_dirty_period,
1009                                     bytes_dirty_threshold);
1010         }
1011     }
1012 }
1013
/*
 * Run one dirty bitmap synchronisation pass.
 *
 * Bumps the dirty sync counter, syncs the global dirty log, then syncs the
 * dirty bitmap of every non-ignored RAMBlock while holding rs->bitmap_mutex
 * inside an RCU read-side critical section. Once more than a second has
 * elapsed since the start of the current period, the throttle check and
 * the rate statistics are run and the period counters are reset.
 *
 * @rs: current RAM state
 * @last_stage: passed through to memory_global_dirty_log_sync()
 */
static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    /* First sync ever: this is where the first period begins */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        /* Snapshot of what is left to send right after this sync */
        stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
    }
    /* Report the pass number (the dirty sync count) as a QAPI event */
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}
1060
1061 static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
1062 {
1063     Error *local_err = NULL;
1064
1065     /*
1066      * The current notifier usage is just an optimization to migration, so we
1067      * don't stop the normal migration process in the error case.
1068      */
1069     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1070         error_report_err(local_err);
1071         local_err = NULL;
1072     }
1073
1074     migration_bitmap_sync(rs, last_stage);
1075
1076     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1077         error_report_err(local_err);
1078     }
1079 }
1080
1081 void ram_release_page(const char *rbname, uint64_t offset)
1082 {
1083     if (!migrate_release_ram() || !migration_in_postcopy()) {
1084         return;
1085     }
1086
1087     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1088 }
1089
1090 /**
1091  * save_zero_page_to_file: send the zero page to the file
1092  *
1093  * Returns the size of data written to the file, 0 means the page is not
1094  * a zero page
1095  *
1096  * @pss: current PSS channel
1097  * @block: block that contains the page we want to send
1098  * @offset: offset inside the block for the page
1099  */
1100 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1101                                   RAMBlock *block, ram_addr_t offset)
1102 {
1103     uint8_t *p = block->host + offset;
1104     int len = 0;
1105
1106     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1107         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1108         qemu_put_byte(file, 0);
1109         len += 1;
1110         ram_release_page(block->idstr, offset);
1111     }
1112     return len;
1113 }
1114
1115 /**
1116  * save_zero_page: send the zero page to the stream
1117  *
1118  * Returns the number of pages written.
1119  *
1120  * @pss: current PSS channel
1121  * @block: block that contains the page we want to send
1122  * @offset: offset inside the block for the page
1123  */
1124 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1125                           ram_addr_t offset)
1126 {
1127     int len = save_zero_page_to_file(pss, f, block, offset);
1128
1129     if (len) {
1130         stat64_add(&mig_stats.zero_pages, 1);
1131         ram_transferred_add(len);
1132         return 1;
1133     }
1134     return -1;
1135 }
1136
1137 /*
1138  * @pages: the number of pages written by the control path,
1139  *        < 0 - error
1140  *        > 0 - number of pages written
1141  *
1142  * Return true if the pages has been saved, otherwise false is returned.
1143  */
1144 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1145                               ram_addr_t offset, int *pages)
1146 {
1147     uint64_t bytes_xmit = 0;
1148     int ret;
1149
1150     *pages = -1;
1151     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1152                                 TARGET_PAGE_SIZE, &bytes_xmit);
1153     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1154         return false;
1155     }
1156
1157     if (bytes_xmit) {
1158         ram_transferred_add(bytes_xmit);
1159         *pages = 1;
1160     }
1161
1162     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1163         return true;
1164     }
1165
1166     if (bytes_xmit > 0) {
1167         stat64_add(&mig_stats.normal_pages, 1);
1168     } else if (bytes_xmit == 0) {
1169         stat64_add(&mig_stats.zero_pages, 1);
1170     }
1171
1172     return true;
1173 }
1174
1175 /*
1176  * directly send the page to the stream
1177  *
1178  * Returns the number of pages written.
1179  *
1180  * @pss: current PSS channel
1181  * @block: block that contains the page we want to send
1182  * @offset: offset inside the block for the page
1183  * @buf: the page to be sent
1184  * @async: send to page asyncly
1185  */
1186 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1187                             ram_addr_t offset, uint8_t *buf, bool async)
1188 {
1189     QEMUFile *file = pss->pss_channel;
1190
1191     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1192                                          offset | RAM_SAVE_FLAG_PAGE));
1193     if (async) {
1194         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1195                               migrate_release_ram() &&
1196                               migration_in_postcopy());
1197     } else {
1198         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1199     }
1200     ram_transferred_add(TARGET_PAGE_SIZE);
1201     stat64_add(&mig_stats.normal_pages, 1);
1202     return 1;
1203 }
1204
/**
 * ram_save_page: send the given page to the stream
 *
 * The page to send is page index pss->page of pss->block. While XBZRLE has
 * been started and migration is not in postcopy, the page is first offered
 * to save_xbzrle_page(); if the result is still -1 afterwards (or XBZRLE was
 * not tried) the page is sent with save_normal_page(). Both steps run with
 * the XBZRLE cache lock held.
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: page search status naming the block and page we want to send
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_started && !migration_in_postcopy()) {
        /* p is passed by address, so save_xbzrle_page() may change it */
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}
1250
1251 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1252                                  ram_addr_t offset)
1253 {
1254     if (multifd_queue_page(file, block, offset) < 0) {
1255         return -1;
1256     }
1257     stat64_add(&mig_stats.normal_pages, 1);
1258
1259     return 1;
1260 }
1261
1262 static void
1263 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1264 {
1265     ram_transferred_add(bytes_xmit);
1266
1267     if (param->result == RES_ZEROPAGE) {
1268         stat64_add(&mig_stats.zero_pages, 1);
1269         return;
1270     }
1271
1272     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1273     compression_counters.compressed_size += bytes_xmit - 8;
1274     compression_counters.pages++;
1275 }
1276
1277 static bool save_page_use_compression(RAMState *rs);
1278
1279 static int send_queued_data(CompressParam *param)
1280 {
1281     PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1282     MigrationState *ms = migrate_get_current();
1283     QEMUFile *file = ms->to_dst_file;
1284     int len = 0;
1285
1286     RAMBlock *block = param->block;
1287     ram_addr_t offset = param->offset;
1288
1289     if (param->result == RES_NONE) {
1290         return 0;
1291     }
1292
1293     assert(block == pss->last_sent_block);
1294
1295     if (param->result == RES_ZEROPAGE) {
1296         assert(qemu_file_buffer_empty(param->file));
1297         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1298         qemu_put_byte(file, 0);
1299         len += 1;
1300         ram_release_page(block->idstr, offset);
1301     } else if (param->result == RES_COMPRESS) {
1302         assert(!qemu_file_buffer_empty(param->file));
1303         len += save_page_header(pss, file, block,
1304                                 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1305         len += qemu_put_qemu_file(file, param->file);
1306     } else {
1307         abort();
1308     }
1309
1310     update_compress_thread_counts(param, len);
1311
1312     return len;
1313 }
1314
1315 static void ram_flush_compressed_data(RAMState *rs)
1316 {
1317     if (!save_page_use_compression(rs)) {
1318         return;
1319     }
1320
1321     flush_compressed_data(send_queued_data);
1322 }
1323
1324 #define PAGE_ALL_CLEAN 0
1325 #define PAGE_TRY_AGAIN 1
1326 #define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Advances pss->page to the next dirty bit of pss->block. If that lands
 * beyond the block, the search moves on to the next block; when the end of
 * the block list is reached it wraps to the first block, marks the round as
 * complete and, if XBZRLE is enabled, starts using XBZRLE.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * End of the block list: when flushing is not done after each
             * section, sync multifd and emit the flush marker once per
             * round here.
             *
             * NOTE(review): nothing in this function checks that multifd
             * is actually enabled before taking this path -- confirm that
             * multifd_send_sync_main() and the receiving side cope with
             * the marker in a non-multifd migration.
             */
            if (!migrate_multifd_flush_after_each_section()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main(f);
                if (ret < 0) {
                    return ret;
                }
                qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                qemu_fflush(f);
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exists in compression threads's ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also If xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            ram_flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}
1396
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Each call hands out one target page of the request at the head of
 * rs->src_page_requests. A request longer than one target page stays queued
 * with its offset and length advanced; otherwise the request is removed and
 * freed, and the memory region reference of its block is dropped.
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    /* Check before the lock is taken; nothing queued means nothing to do */
    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        /* More pages remain in this request: consume one, keep it queued */
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        /* Last page of the request: retire the entry */
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}
1440
1441 #if defined(__linux__)
1442 /**
1443  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1444  *   is found, return RAM block pointer and page offset
1445  *
1446  * Returns pointer to the RAMBlock containing faulting page,
1447  *   NULL if no write faults are pending
1448  *
1449  * @rs: current RAM state
1450  * @offset: page offset from the beginning of the block
1451  */
1452 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1453 {
1454     struct uffd_msg uffd_msg;
1455     void *page_address;
1456     RAMBlock *block;
1457     int res;
1458
1459     if (!migrate_background_snapshot()) {
1460         return NULL;
1461     }
1462
1463     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1464     if (res <= 0) {
1465         return NULL;
1466     }
1467
1468     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1469     block = qemu_ram_block_from_host(page_address, false, offset);
1470     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1471     return block;
1472 }
1473
1474 /**
1475  * ram_save_release_protection: release UFFD write protection after
1476  *   a range of pages has been saved
1477  *
1478  * @rs: current RAM state
1479  * @pss: page-search-status structure
1480  * @start_page: index of the first page in the range relative to pss->block
1481  *
1482  * Returns 0 on success, negative value in case of an error
1483 */
1484 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1485         unsigned long start_page)
1486 {
1487     int res = 0;
1488
1489     /* Check if page is from UFFD-managed region. */
1490     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1491         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1492         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1493
1494         /* Flush async buffers before un-protect. */
1495         qemu_fflush(pss->pss_channel);
1496         /* Un-protect memory range. */
1497         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1498                 false, false);
1499     }
1500
1501     return res;
1502 }
1503
1504 /* ram_write_tracking_available: check if kernel supports required UFFD features
1505  *
1506  * Returns true if supports, false otherwise
1507  */
1508 bool ram_write_tracking_available(void)
1509 {
1510     uint64_t uffd_features;
1511     int res;
1512
1513     res = uffd_query_features(&uffd_features);
1514     return (res == 0 &&
1515             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1516 }
1517
1518 /* ram_write_tracking_compatible: check if guest configuration is
1519  *   compatible with 'write-tracking'
1520  *
1521  * Returns true if compatible, false otherwise
1522  */
1523 bool ram_write_tracking_compatible(void)
1524 {
1525     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1526     int uffd_fd;
1527     RAMBlock *block;
1528     bool ret = false;
1529
1530     /* Open UFFD file descriptor */
1531     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1532     if (uffd_fd < 0) {
1533         return false;
1534     }
1535
1536     RCU_READ_LOCK_GUARD();
1537
1538     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1539         uint64_t uffd_ioctls;
1540
1541         /* Nothing to do with read-only and MMIO-writable regions */
1542         if (block->mr->readonly || block->mr->rom_device) {
1543             continue;
1544         }
1545         /* Try to register block memory via UFFD-IO to track writes */
1546         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1547                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1548             goto out;
1549         }
1550         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1551             goto out;
1552         }
1553     }
1554     ret = true;
1555
1556 out:
1557     uffd_close_fd(uffd_fd);
1558     return ret;
1559 }
1560
1561 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1562                                        ram_addr_t size)
1563 {
1564     const ram_addr_t end = offset + size;
1565
1566     /*
1567      * We read one byte of each page; this will preallocate page tables if
1568      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1569      * where no page was populated yet. This might require adaption when
1570      * supporting other mappings, like shmem.
1571      */
1572     for (; offset < end; offset += block->page_size) {
1573         char tmp = *((char *)block->host + offset);
1574
1575         /* Don't optimize the read out */
1576         asm volatile("" : "+r" (tmp));
1577     }
1578 }
1579
1580 static inline int populate_read_section(MemoryRegionSection *section,
1581                                         void *opaque)
1582 {
1583     const hwaddr size = int128_get64(section->size);
1584     hwaddr offset = section->offset_within_region;
1585     RAMBlock *block = section->mr->ram_block;
1586
1587     populate_read_range(block, offset, size);
1588     return 0;
1589 }
1590
1591 /*
1592  * ram_block_populate_read: preallocate page tables and populate pages in the
1593  *   RAM block by reading a byte of each page.
1594  *
1595  * Since it's solely used for userfault_fd WP feature, here we just
1596  *   hardcode page size to qemu_real_host_page_size.
1597  *
1598  * @block: RAM block to populate
1599  */
1600 static void ram_block_populate_read(RAMBlock *rb)
1601 {
1602     /*
1603      * Skip populating all pages that fall into a discarded range as managed by
1604      * a RamDiscardManager responsible for the mapped memory region of the
1605      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1606      * must not get populated automatically. We don't have to track
1607      * modifications via userfaultfd WP reliably, because these pages will
1608      * not be part of the migration stream either way -- see
1609      * ramblock_dirty_bitmap_exclude_discarded_pages().
1610      *
1611      * Note: The result is only stable while migrating (precopy/postcopy).
1612      */
1613     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1614         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1615         MemoryRegionSection section = {
1616             .mr = rb->mr,
1617             .offset_within_region = 0,
1618             .size = rb->mr->size,
1619         };
1620
1621         ram_discard_manager_replay_populated(rdm, &section,
1622                                              populate_read_section, NULL);
1623     } else {
1624         populate_read_range(rb, 0, rb->used_length);
1625     }
1626 }
1627
1628 /*
1629  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1630  */
1631 void ram_write_tracking_prepare(void)
1632 {
1633     RAMBlock *block;
1634
1635     RCU_READ_LOCK_GUARD();
1636
1637     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1638         /* Nothing to do with read-only and MMIO-writable regions */
1639         if (block->mr->readonly || block->mr->rom_device) {
1640             continue;
1641         }
1642
1643         /*
1644          * Populate pages of the RAM block before enabling userfault_fd
1645          * write protection.
1646          *
1647          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1648          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1649          * pages with pte_none() entries in page table.
1650          */
1651         ram_block_populate_read(block);
1652     }
1653 }
1654
1655 static inline int uffd_protect_section(MemoryRegionSection *section,
1656                                        void *opaque)
1657 {
1658     const hwaddr size = int128_get64(section->size);
1659     const hwaddr offset = section->offset_within_region;
1660     RAMBlock *rb = section->mr->ram_block;
1661     int uffd_fd = (uintptr_t)opaque;
1662
1663     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1664                                   false);
1665 }
1666
1667 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1668 {
1669     assert(rb->flags & RAM_UF_WRITEPROTECT);
1670
1671     /* See ram_block_populate_read() */
1672     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1673         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1674         MemoryRegionSection section = {
1675             .mr = rb->mr,
1676             .offset_within_region = 0,
1677             .size = rb->mr->size,
1678         };
1679
1680         return ram_discard_manager_replay_populated(rdm, &section,
1681                                                     uffd_protect_section,
1682                                                     (void *)(uintptr_t)uffd_fd);
1683     }
1684     return uffd_change_protection(uffd_fd, rb->host,
1685                                   rb->used_length, true, false);
1686 }
1687
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Opens the UFFD descriptor, stores it in ram_state->uffdio_fd, then
 * registers and write-protects every non-ignored RAMBlock except read-only
 * and ROM device regions. Each block that got registered is flagged
 * RAM_UF_WRITEPROTECT and holds a reference on its memory region.
 *
 * On failure, every block flagged so far is unregistered again, its flag and
 * reference are dropped, and the descriptor is closed.
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /*
         * Flag and reference the block before protecting it, so that the
         * failure path below also cleans it up if protecting fails.
         */
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    /* Undo the registration of every block that was flagged above */
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}
1750
1751 /**
1752  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1753  */
1754 void ram_write_tracking_stop(void)
1755 {
1756     RAMState *rs = ram_state;
1757     RAMBlock *block;
1758
1759     RCU_READ_LOCK_GUARD();
1760
1761     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1762         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1763             continue;
1764         }
1765         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1766
1767         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1768                 block->host, block->max_length);
1769
1770         /* Cleanup flags and remove reference */
1771         block->flags &= ~RAM_UF_WRITEPROTECT;
1772         memory_region_unref(block->mr);
1773     }
1774
1775     /* Finally close UFFD file descriptor */
1776     uffd_close_fd(rs->uffdio_fd);
1777     rs->uffdio_fd = -1;
1778 }
1779
1780 #else
1781 /* No target OS support, stubs just fail or ignore */
1782
1783 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1784 {
1785     (void) rs;
1786     (void) offset;
1787
1788     return NULL;
1789 }
1790
1791 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1792         unsigned long start_page)
1793 {
1794     (void) rs;
1795     (void) pss;
1796     (void) start_page;
1797
1798     return 0;
1799 }
1800
/* Stub: write tracking is reported as unavailable in this build variant */
bool ram_write_tracking_available(void)
{
    return false;
}
1805
/*
 * Stub: asserts if called.  The trailing return only supplies a value
 * for the case where the assertion is compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1811
/*
 * Stub: asserts if called.  The trailing -1 only supplies a value for
 * the case where the assertion is compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1817
/* Stub: asserts if called; there is nothing to stop in this variant */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1822 #endif /* defined(__linux__) */
1823
1824 /**
1825  * get_queued_page: unqueue a page from the postcopy requests
1826  *
1827  * Skips pages that are already sent (!dirty)
1828  *
1829  * Returns true if a queued page is found
1830  *
1831  * @rs: current RAM state
1832  * @pss: data about the state of the current dirty page scan
1833  */
1834 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1835 {
1836     RAMBlock  *block;
1837     ram_addr_t offset;
1838     bool dirty;
1839
1840     do {
1841         block = unqueue_page(rs, &offset);
1842         /*
1843          * We're sending this page, and since it's postcopy nothing else
1844          * will dirty it, and we must make sure it doesn't get sent again
1845          * even if this queue request was received after the background
1846          * search already sent it.
1847          */
1848         if (block) {
1849             unsigned long page;
1850
1851             page = offset >> TARGET_PAGE_BITS;
1852             dirty = test_bit(page, block->bmap);
1853             if (!dirty) {
1854                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1855                                                 page);
1856             } else {
1857                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1858             }
1859         }
1860
1861     } while (block && !dirty);
1862
1863     if (!block) {
1864         /*
1865          * Poll write faults too if background snapshot is enabled; that's
1866          * when we have vcpus got blocked by the write protected pages.
1867          */
1868         block = poll_fault_page(rs, &offset);
1869     }
1870
1871     if (block) {
1872         /*
1873          * We want the background search to continue from the queued page
1874          * since the guest is likely to want other pages near to the page
1875          * it just requested.
1876          */
1877         pss->block = block;
1878         pss->page = offset >> TARGET_PAGE_BITS;
1879
1880         /*
1881          * This unqueued page would break the "one round" check, even is
1882          * really rare.
1883          */
1884         pss->complete_round = false;
1885     }
1886
1887     return !!block;
1888 }
1889
1890 /**
1891  * migration_page_queue_free: drop any remaining pages in the ram
1892  * request queue
1893  *
1894  * It should be empty at the end anyway, but in error cases there may
1895  * be some left.  in case that there is any page left, we drop it.
1896  *
1897  */
1898 static void migration_page_queue_free(RAMState *rs)
1899 {
1900     struct RAMSrcPageRequest *mspr, *next_mspr;
1901     /* This queue generally should be empty - but in the case of a failed
1902      * migration might have some droppings in.
1903      */
1904     RCU_READ_LOCK_GUARD();
1905     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1906         memory_region_unref(mspr->rb->mr);
1907         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1908         g_free(mspr);
1909     }
1910 }
1911
1912 /**
1913  * ram_save_queue_pages: queue the page for transmission
1914  *
1915  * A request from postcopy destination for example.
1916  *
1917  * Returns zero on success or negative on error
1918  *
1919  * @rbname: Name of the RAMBLock of the request. NULL means the
1920  *          same that last one.
1921  * @start: starting address from the start of the RAMBlock
1922  * @len: length (in bytes) to send
1923  */
1924 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1925 {
1926     RAMBlock *ramblock;
1927     RAMState *rs = ram_state;
1928
1929     stat64_add(&mig_stats.postcopy_requests, 1);
1930     RCU_READ_LOCK_GUARD();
1931
1932     if (!rbname) {
1933         /* Reuse last RAMBlock */
1934         ramblock = rs->last_req_rb;
1935
1936         if (!ramblock) {
1937             /*
1938              * Shouldn't happen, we can't reuse the last RAMBlock if
1939              * it's the 1st request.
1940              */
1941             error_report("ram_save_queue_pages no previous block");
1942             return -1;
1943         }
1944     } else {
1945         ramblock = qemu_ram_block_by_name(rbname);
1946
1947         if (!ramblock) {
1948             /* We shouldn't be asked for a non-existent RAMBlock */
1949             error_report("ram_save_queue_pages no block '%s'", rbname);
1950             return -1;
1951         }
1952         rs->last_req_rb = ramblock;
1953     }
1954     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1955     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1956         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1957                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1958                      __func__, start, len, ramblock->used_length);
1959         return -1;
1960     }
1961
1962     /*
1963      * When with postcopy preempt, we send back the page directly in the
1964      * rp-return thread.
1965      */
1966     if (postcopy_preempt_active()) {
1967         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1968         size_t page_size = qemu_ram_pagesize(ramblock);
1969         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1970         int ret = 0;
1971
1972         qemu_mutex_lock(&rs->bitmap_mutex);
1973
1974         pss_init(pss, ramblock, page_start);
1975         /*
1976          * Always use the preempt channel, and make sure it's there.  It's
1977          * safe to access without lock, because when rp-thread is running
1978          * we should be the only one who operates on the qemufile
1979          */
1980         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
1981         assert(pss->pss_channel);
1982
1983         /*
1984          * It must be either one or multiple of host page size.  Just
1985          * assert; if something wrong we're mostly split brain anyway.
1986          */
1987         assert(len % page_size == 0);
1988         while (len) {
1989             if (ram_save_host_page_urgent(pss)) {
1990                 error_report("%s: ram_save_host_page_urgent() failed: "
1991                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
1992                              __func__, ramblock->idstr, start);
1993                 ret = -1;
1994                 break;
1995             }
1996             /*
1997              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
1998              * will automatically be moved and point to the next host page
1999              * we're going to send, so no need to update here.
2000              *
2001              * Normally QEMU never sends >1 host page in requests, so
2002              * logically we don't even need that as the loop should only
2003              * run once, but just to be consistent.
2004              */
2005             len -= page_size;
2006         };
2007         qemu_mutex_unlock(&rs->bitmap_mutex);
2008
2009         return ret;
2010     }
2011
2012     struct RAMSrcPageRequest *new_entry =
2013         g_new0(struct RAMSrcPageRequest, 1);
2014     new_entry->rb = ramblock;
2015     new_entry->offset = start;
2016     new_entry->len = len;
2017
2018     memory_region_ref(ramblock->mr);
2019     qemu_mutex_lock(&rs->src_page_req_mutex);
2020     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2021     migration_make_urgent_request();
2022     qemu_mutex_unlock(&rs->src_page_req_mutex);
2023
2024     return 0;
2025 }
2026
2027 static bool save_page_use_compression(RAMState *rs)
2028 {
2029     if (!migrate_compress()) {
2030         return false;
2031     }
2032
2033     /*
2034      * If xbzrle is enabled (e.g., after first round of migration), stop
2035      * using the data compression. In theory, xbzrle can do better than
2036      * compression.
2037      */
2038     if (rs->xbzrle_started) {
2039         return false;
2040     }
2041
2042     return true;
2043 }
2044
2045 /*
2046  * try to compress the page before posting it out, return true if the page
2047  * has been properly handled by compression, otherwise needs other
2048  * paths to handle it
2049  */
2050 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2051                                RAMBlock *block, ram_addr_t offset)
2052 {
2053     if (!save_page_use_compression(rs)) {
2054         return false;
2055     }
2056
2057     /*
2058      * When starting the process of a new block, the first page of
2059      * the block should be sent out before other pages in the same
2060      * block, and all the pages in last block should have been sent
2061      * out, keeping this order is important, because the 'cont' flag
2062      * is used to avoid resending the block name.
2063      *
2064      * We post the fist page as normal page as compression will take
2065      * much CPU resource.
2066      */
2067     if (block != pss->last_sent_block) {
2068         ram_flush_compressed_data(rs);
2069         return false;
2070     }
2071
2072     if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2073         return true;
2074     }
2075
2076     compression_counters.busy++;
2077     return false;
2078 }
2079
2080 /**
2081  * ram_save_target_page_legacy: save one target page
2082  *
2083  * Returns the number of pages written
2084  *
2085  * @rs: current RAM state
2086  * @pss: data about the page we want to send
2087  */
2088 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2089 {
2090     RAMBlock *block = pss->block;
2091     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2092     int res;
2093
2094     if (control_save_page(pss, block, offset, &res)) {
2095         return res;
2096     }
2097
2098     if (save_compress_page(rs, pss, block, offset)) {
2099         return 1;
2100     }
2101
2102     res = save_zero_page(pss, pss->pss_channel, block, offset);
2103     if (res > 0) {
2104         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2105          * page would be stale
2106          */
2107         if (rs->xbzrle_started) {
2108             XBZRLE_cache_lock();
2109             xbzrle_cache_zero_page(rs, block->offset + offset);
2110             XBZRLE_cache_unlock();
2111         }
2112         return res;
2113     }
2114
2115     /*
2116      * Do not use multifd in postcopy as one whole host page should be
2117      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2118      * if host page size == guest page size the dest guest during run may
2119      * still see partially copied pages which is data corruption.
2120      */
2121     if (migrate_multifd() && !migration_in_postcopy()) {
2122         return ram_save_multifd_page(pss->pss_channel, block, offset);
2123     }
2124
2125     return ram_save_page(rs, pss);
2126 }
2127
2128 /* Should be called before sending a host page */
2129 static void pss_host_page_prepare(PageSearchStatus *pss)
2130 {
2131     /* How many guest pages are there in one host page? */
2132     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2133
2134     pss->host_page_sending = true;
2135     if (guest_pfns <= 1) {
2136         /*
2137          * This covers both when guest psize == host psize, or when guest
2138          * has larger psize than the host (guest_pfns==0).
2139          *
2140          * For the latter, we always send one whole guest page per
2141          * iteration of the host page (example: an Alpha VM on x86 host
2142          * will have guest psize 8K while host psize 4K).
2143          */
2144         pss->host_page_start = pss->page;
2145         pss->host_page_end = pss->page + 1;
2146     } else {
2147         /*
2148          * The host page spans over multiple guest pages, we send them
2149          * within the same host page iteration.
2150          */
2151         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2152         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2153     }
2154 }
2155
2156 /*
2157  * Whether the page pointed by PSS is within the host page being sent.
2158  * Must be called after a previous pss_host_page_prepare().
2159  */
2160 static bool pss_within_range(PageSearchStatus *pss)
2161 {
2162     ram_addr_t ram_addr;
2163
2164     assert(pss->host_page_sending);
2165
2166     /* Over host-page boundary? */
2167     if (pss->page >= pss->host_page_end) {
2168         return false;
2169     }
2170
2171     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2172
2173     return offset_in_ramblock(pss->block, ram_addr);
2174 }
2175
2176 static void pss_host_page_finish(PageSearchStatus *pss)
2177 {
2178     pss->host_page_sending = false;
2179     /* This is not needed, but just to reset it */
2180     pss->host_page_start = pss->host_page_end = 0;
2181 }
2182
2183 /*
2184  * Send an urgent host page specified by `pss'.  Need to be called with
2185  * bitmap_mutex held.
2186  *
2187  * Returns 0 if save host page succeeded, false otherwise.
2188  */
2189 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2190 {
2191     bool page_dirty, sent = false;
2192     RAMState *rs = ram_state;
2193     int ret = 0;
2194
2195     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2196     pss_host_page_prepare(pss);
2197
2198     /*
2199      * If precopy is sending the same page, let it be done in precopy, or
2200      * we could send the same page in two channels and none of them will
2201      * receive the whole page.
2202      */
2203     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2204         trace_postcopy_preempt_hit(pss->block->idstr,
2205                                    pss->page << TARGET_PAGE_BITS);
2206         return 0;
2207     }
2208
2209     do {
2210         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2211
2212         if (page_dirty) {
2213             /* Be strict to return code; it must be 1, or what else? */
2214             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2215                 error_report_once("%s: ram_save_target_page failed", __func__);
2216                 ret = -1;
2217                 goto out;
2218             }
2219             sent = true;
2220         }
2221         pss_find_next_dirty(pss);
2222     } while (pss_within_range(pss));
2223 out:
2224     pss_host_page_finish(pss);
2225     /* For urgent requests, flush immediately if sent */
2226     if (sent) {
2227         qemu_fflush(pss->pss_channel);
2228     }
2229     return ret;
2230 }
2231
2232 /**
2233  * ram_save_host_page: save a whole host page
2234  *
2235  * Starting at *offset send pages up to the end of the current host
2236  * page. It's valid for the initial offset to point into the middle of
2237  * a host page in which case the remainder of the hostpage is sent.
2238  * Only dirty target pages are sent. Note that the host page size may
2239  * be a huge page for this block.
2240  *
2241  * The saving stops at the boundary of the used_length of the block
2242  * if the RAMBlock isn't a multiple of the host page size.
2243  *
2244  * The caller must be with ram_state.bitmap_mutex held to call this
2245  * function.  Note that this function can temporarily release the lock, but
2246  * when the function is returned it'll make sure the lock is still held.
2247  *
2248  * Returns the number of pages written or negative on error
2249  *
2250  * @rs: current RAM state
2251  * @pss: data about the page we want to send
2252  */
2253 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2254 {
2255     bool page_dirty, preempt_active = postcopy_preempt_active();
2256     int tmppages, pages = 0;
2257     size_t pagesize_bits =
2258         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2259     unsigned long start_page = pss->page;
2260     int res;
2261
2262     if (ramblock_is_ignored(pss->block)) {
2263         error_report("block %s should not be migrated !", pss->block->idstr);
2264         return 0;
2265     }
2266
2267     /* Update host page boundary information */
2268     pss_host_page_prepare(pss);
2269
2270     do {
2271         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2272
2273         /* Check the pages is dirty and if it is send it */
2274         if (page_dirty) {
2275             /*
2276              * Properly yield the lock only in postcopy preempt mode
2277              * because both migration thread and rp-return thread can
2278              * operate on the bitmaps.
2279              */
2280             if (preempt_active) {
2281                 qemu_mutex_unlock(&rs->bitmap_mutex);
2282             }
2283             tmppages = migration_ops->ram_save_target_page(rs, pss);
2284             if (tmppages >= 0) {
2285                 pages += tmppages;
2286                 /*
2287                  * Allow rate limiting to happen in the middle of huge pages if
2288                  * something is sent in the current iteration.
2289                  */
2290                 if (pagesize_bits > 1 && tmppages > 0) {
2291                     migration_rate_limit();
2292                 }
2293             }
2294             if (preempt_active) {
2295                 qemu_mutex_lock(&rs->bitmap_mutex);
2296             }
2297         } else {
2298             tmppages = 0;
2299         }
2300
2301         if (tmppages < 0) {
2302             pss_host_page_finish(pss);
2303             return tmppages;
2304         }
2305
2306         pss_find_next_dirty(pss);
2307     } while (pss_within_range(pss));
2308
2309     pss_host_page_finish(pss);
2310
2311     res = ram_save_release_protection(rs, pss, start_page);
2312     return (res < 0 ? res : pages);
2313 }
2314
2315 /**
2316  * ram_find_and_save_block: finds a dirty page and sends it to f
2317  *
2318  * Called within an RCU critical section.
2319  *
2320  * Returns the number of pages written where zero means no dirty pages,
2321  * or negative on error
2322  *
2323  * @rs: current RAM state
2324  *
2325  * On systems where host-page-size > target-page-size it will send all the
2326  * pages in a host page that are dirty.
2327  */
2328 static int ram_find_and_save_block(RAMState *rs)
2329 {
2330     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2331     int pages = 0;
2332
2333     /* No dirty page as there is zero RAM */
2334     if (!rs->ram_bytes_total) {
2335         return pages;
2336     }
2337
2338     /*
2339      * Always keep last_seen_block/last_page valid during this procedure,
2340      * because find_dirty_block() relies on these values (e.g., we compare
2341      * last_seen_block with pss.block to see whether we searched all the
2342      * ramblocks) to detect the completion of migration.  Having NULL value
2343      * of last_seen_block can conditionally cause below loop to run forever.
2344      */
2345     if (!rs->last_seen_block) {
2346         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2347         rs->last_page = 0;
2348     }
2349
2350     pss_init(pss, rs->last_seen_block, rs->last_page);
2351
2352     while (true){
2353         if (!get_queued_page(rs, pss)) {
2354             /* priority queue empty, so just search for something dirty */
2355             int res = find_dirty_block(rs, pss);
2356             if (res != PAGE_DIRTY_FOUND) {
2357                 if (res == PAGE_ALL_CLEAN) {
2358                     break;
2359                 } else if (res == PAGE_TRY_AGAIN) {
2360                     continue;
2361                 } else if (res < 0) {
2362                     pages = res;
2363                     break;
2364                 }
2365             }
2366         }
2367         pages = ram_save_host_page(rs, pss);
2368         if (pages) {
2369             break;
2370         }
2371     }
2372
2373     rs->last_seen_block = pss->block;
2374     rs->last_page = pss->page;
2375
2376     return pages;
2377 }
2378
2379 static uint64_t ram_bytes_total_with_ignored(void)
2380 {
2381     RAMBlock *block;
2382     uint64_t total = 0;
2383
2384     RCU_READ_LOCK_GUARD();
2385
2386     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2387         total += block->used_length;
2388     }
2389     return total;
2390 }
2391
2392 uint64_t ram_bytes_total(void)
2393 {
2394     RAMBlock *block;
2395     uint64_t total = 0;
2396
2397     RCU_READ_LOCK_GUARD();
2398
2399     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2400         total += block->used_length;
2401     }
2402     return total;
2403 }
2404
2405 static void xbzrle_load_setup(void)
2406 {
2407     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2408 }
2409
2410 static void xbzrle_load_cleanup(void)
2411 {
2412     g_free(XBZRLE.decoded_buf);
2413     XBZRLE.decoded_buf = NULL;
2414 }
2415
2416 static void ram_state_cleanup(RAMState **rsp)
2417 {
2418     if (*rsp) {
2419         migration_page_queue_free(*rsp);
2420         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2421         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2422         g_free(*rsp);
2423         *rsp = NULL;
2424     }
2425 }
2426
2427 static void xbzrle_cleanup(void)
2428 {
2429     XBZRLE_cache_lock();
2430     if (XBZRLE.cache) {
2431         cache_fini(XBZRLE.cache);
2432         g_free(XBZRLE.encoded_buf);
2433         g_free(XBZRLE.current_buf);
2434         g_free(XBZRLE.zero_target_page);
2435         XBZRLE.cache = NULL;
2436         XBZRLE.encoded_buf = NULL;
2437         XBZRLE.current_buf = NULL;
2438         XBZRLE.zero_target_page = NULL;
2439     }
2440     XBZRLE_cache_unlock();
2441 }
2442
2443 static void ram_save_cleanup(void *opaque)
2444 {
2445     RAMState **rsp = opaque;
2446     RAMBlock *block;
2447
2448     /* We don't use dirty log with background snapshots */
2449     if (!migrate_background_snapshot()) {
2450         /* caller have hold iothread lock or is in a bh, so there is
2451          * no writing race against the migration bitmap
2452          */
2453         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2454             /*
2455              * do not stop dirty log without starting it, since
2456              * memory_global_dirty_log_stop will assert that
2457              * memory_global_dirty_log_start/stop used in pairs
2458              */
2459             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2460         }
2461     }
2462
2463     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2464         g_free(block->clear_bmap);
2465         block->clear_bmap = NULL;
2466         g_free(block->bmap);
2467         block->bmap = NULL;
2468     }
2469
2470     xbzrle_cleanup();
2471     compress_threads_save_cleanup();
2472     ram_state_cleanup(rsp);
2473     g_free(migration_ops);
2474     migration_ops = NULL;
2475 }
2476
2477 static void ram_state_reset(RAMState *rs)
2478 {
2479     int i;
2480
2481     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2482         rs->pss[i].last_sent_block = NULL;
2483     }
2484
2485     rs->last_seen_block = NULL;
2486     rs->last_page = 0;
2487     rs->last_version = ram_list.version;
2488     rs->xbzrle_started = false;
2489 }
2490
2491 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2492
2493 /* **** functions for postcopy ***** */
2494
2495 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2496 {
2497     struct RAMBlock *block;
2498
2499     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2500         unsigned long *bitmap = block->bmap;
2501         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2502         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2503
2504         while (run_start < range) {
2505             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2506             ram_discard_range(block->idstr,
2507                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2508                               ((ram_addr_t)(run_end - run_start))
2509                                 << TARGET_PAGE_BITS);
2510             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2511         }
2512     }
2513 }
2514
2515 /**
2516  * postcopy_send_discard_bm_ram: discard a RAMBlock
2517  *
2518  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2519  *
2520  * @ms: current migration state
2521  * @block: RAMBlock to discard
2522  */
2523 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2524 {
2525     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2526     unsigned long current;
2527     unsigned long *bitmap = block->bmap;
2528
2529     for (current = 0; current < end; ) {
2530         unsigned long one = find_next_bit(bitmap, end, current);
2531         unsigned long zero, discard_length;
2532
2533         if (one >= end) {
2534             break;
2535         }
2536
2537         zero = find_next_zero_bit(bitmap, end, one + 1);
2538
2539         if (zero >= end) {
2540             discard_length = end - one;
2541         } else {
2542             discard_length = zero - one;
2543         }
2544         postcopy_discard_send_range(ms, one, discard_length);
2545         current = one + discard_length;
2546     }
2547 }
2548
2549 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2550
2551 /**
2552  * postcopy_each_ram_send_discard: discard all RAMBlocks
2553  *
2554  * Utility for the outgoing postcopy code.
2555  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2556  *   passing it bitmap indexes and name.
2557  * (qemu_ram_foreach_block ends up passing unscaled lengths
2558  *  which would mean postcopy code would have to deal with target page)
2559  *
2560  * @ms: current migration state
2561  */
2562 static void postcopy_each_ram_send_discard(MigrationState *ms)
2563 {
2564     struct RAMBlock *block;
2565
2566     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2567         postcopy_discard_send_init(ms, block->idstr);
2568
2569         /*
2570          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2571          * host-page size chunks, mark any partially dirty host-page size
2572          * chunks as all dirty.  In this case the host-page is the host-page
2573          * for the particular RAMBlock, i.e. it might be a huge page.
2574          */
2575         postcopy_chunk_hostpages_pass(ms, block);
2576
2577         /*
2578          * Postcopy sends chunks of bitmap over the wire, but it
2579          * just needs indexes at this point, avoids it having
2580          * target page specific code.
2581          */
2582         postcopy_send_discard_bm_ram(ms, block);
2583         postcopy_discard_send_finish(ms);
2584     }
2585 }
2586
2587 /**
2588  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2589  *
2590  * Helper for postcopy_chunk_hostpages; it's called twice to
2591  * canonicalize the two bitmaps, that are similar, but one is
2592  * inverted.
2593  *
2594  * Postcopy requires that all target pages in a hostpage are dirty or
2595  * clean, not a mix.  This function canonicalizes the bitmaps.
2596  *
2597  * @ms: current migration state
2598  * @block: block that contains the page we want to canonicalize
2599  */
2600 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2601 {
2602     RAMState *rs = ram_state;
2603     unsigned long *bitmap = block->bmap;
2604     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2605     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2606     unsigned long run_start;
2607
2608     if (block->page_size == TARGET_PAGE_SIZE) {
2609         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2610         return;
2611     }
2612
2613     /* Find a dirty page */
2614     run_start = find_next_bit(bitmap, pages, 0);
2615
2616     while (run_start < pages) {
2617
2618         /*
2619          * If the start of this run of pages is in the middle of a host
2620          * page, then we need to fixup this host page.
2621          */
2622         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2623             /* Find the end of this run */
2624             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2625             /*
2626              * If the end isn't at the start of a host page, then the
2627              * run doesn't finish at the end of a host page
2628              * and we need to discard.
2629              */
2630         }
2631
2632         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2633             unsigned long page;
2634             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2635                                                              host_ratio);
2636             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2637
2638             /* Clean up the bitmap */
2639             for (page = fixup_start_addr;
2640                  page < fixup_start_addr + host_ratio; page++) {
2641                 /*
2642                  * Remark them as dirty, updating the count for any pages
2643                  * that weren't previously dirty.
2644                  */
2645                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2646             }
2647         }
2648
2649         /* Find the next dirty page for the next iteration */
2650         run_start = find_next_bit(bitmap, pages, run_start);
2651     }
2652 }
2653
2654 /**
2655  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2656  *
2657  * Transmit the set of pages to be discarded after precopy to the target
2658  * these are pages that:
2659  *     a) Have been previously transmitted but are now dirty again
2660  *     b) Pages that have never been transmitted, this ensures that
2661  *        any pages on the destination that have been mapped by background
2662  *        tasks get discarded (transparent huge pages is the specific concern)
2663  * Hopefully this is pretty sparse
2664  *
2665  * @ms: current migration state
2666  */
2667 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2668 {
2669     RAMState *rs = ram_state;
2670
2671     RCU_READ_LOCK_GUARD();
2672
2673     /* This should be our last sync, the src is now paused */
2674     migration_bitmap_sync(rs, false);
2675
2676     /* Easiest way to make sure we don't resume in the middle of a host-page */
2677     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2678     rs->last_seen_block = NULL;
2679     rs->last_page = 0;
2680
2681     postcopy_each_ram_send_discard(ms);
2682
2683     trace_ram_postcopy_send_discard_bitmap();
2684 }
2685
2686 /**
2687  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2688  *
2689  * Returns zero on success
2690  *
2691  * @rbname: name of the RAMBlock of the request. NULL means the
2692  *          same that last one.
2693  * @start: RAMBlock starting page
2694  * @length: RAMBlock size
2695  */
2696 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2697 {
2698     trace_ram_discard_range(rbname, start, length);
2699
2700     RCU_READ_LOCK_GUARD();
2701     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2702
2703     if (!rb) {
2704         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2705         return -1;
2706     }
2707
2708     /*
2709      * On source VM, we don't need to update the received bitmap since
2710      * we don't even have one.
2711      */
2712     if (rb->receivedmap) {
2713         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2714                      length >> qemu_target_page_bits());
2715     }
2716
2717     return ram_block_discard_range(rb, start, length);
2718 }
2719
2720 /*
2721  * For every allocation, we will try not to crash the VM if the
2722  * allocation failed.
2723  */
2724 static int xbzrle_init(void)
2725 {
2726     Error *local_err = NULL;
2727
2728     if (!migrate_xbzrle()) {
2729         return 0;
2730     }
2731
2732     XBZRLE_cache_lock();
2733
2734     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2735     if (!XBZRLE.zero_target_page) {
2736         error_report("%s: Error allocating zero page", __func__);
2737         goto err_out;
2738     }
2739
2740     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2741                               TARGET_PAGE_SIZE, &local_err);
2742     if (!XBZRLE.cache) {
2743         error_report_err(local_err);
2744         goto free_zero_page;
2745     }
2746
2747     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2748     if (!XBZRLE.encoded_buf) {
2749         error_report("%s: Error allocating encoded_buf", __func__);
2750         goto free_cache;
2751     }
2752
2753     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2754     if (!XBZRLE.current_buf) {
2755         error_report("%s: Error allocating current_buf", __func__);
2756         goto free_encoded_buf;
2757     }
2758
2759     /* We are all good */
2760     XBZRLE_cache_unlock();
2761     return 0;
2762
2763 free_encoded_buf:
2764     g_free(XBZRLE.encoded_buf);
2765     XBZRLE.encoded_buf = NULL;
2766 free_cache:
2767     cache_fini(XBZRLE.cache);
2768     XBZRLE.cache = NULL;
2769 free_zero_page:
2770     g_free(XBZRLE.zero_target_page);
2771     XBZRLE.zero_target_page = NULL;
2772 err_out:
2773     XBZRLE_cache_unlock();
2774     return -ENOMEM;
2775 }
2776
2777 static int ram_state_init(RAMState **rsp)
2778 {
2779     *rsp = g_try_new0(RAMState, 1);
2780
2781     if (!*rsp) {
2782         error_report("%s: Init ramstate fail", __func__);
2783         return -1;
2784     }
2785
2786     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2787     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2788     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2789     (*rsp)->ram_bytes_total = ram_bytes_total();
2790
2791     /*
2792      * Count the total number of pages used by ram blocks not including any
2793      * gaps due to alignment or unplugs.
2794      * This must match with the initial values of dirty bitmap.
2795      */
2796     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2797     ram_state_reset(*rsp);
2798
2799     return 0;
2800 }
2801
2802 static void ram_list_init_bitmaps(void)
2803 {
2804     MigrationState *ms = migrate_get_current();
2805     RAMBlock *block;
2806     unsigned long pages;
2807     uint8_t shift;
2808
2809     /* Skip setting bitmap if there is no RAM */
2810     if (ram_bytes_total()) {
2811         shift = ms->clear_bitmap_shift;
2812         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2813             error_report("clear_bitmap_shift (%u) too big, using "
2814                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2815             shift = CLEAR_BITMAP_SHIFT_MAX;
2816         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2817             error_report("clear_bitmap_shift (%u) too small, using "
2818                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2819             shift = CLEAR_BITMAP_SHIFT_MIN;
2820         }
2821
2822         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2823             pages = block->max_length >> TARGET_PAGE_BITS;
2824             /*
2825              * The initial dirty bitmap for migration must be set with all
2826              * ones to make sure we'll migrate every guest RAM page to
2827              * destination.
2828              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2829              * new migration after a failed migration, ram_list.
2830              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2831              * guest memory.
2832              */
2833             block->bmap = bitmap_new(pages);
2834             bitmap_set(block->bmap, 0, pages);
2835             block->clear_bmap_shift = shift;
2836             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2837         }
2838     }
2839 }
2840
2841 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2842 {
2843     unsigned long pages;
2844     RAMBlock *rb;
2845
2846     RCU_READ_LOCK_GUARD();
2847
2848     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2849             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2850             rs->migration_dirty_pages -= pages;
2851     }
2852 }
2853
2854 static void ram_init_bitmaps(RAMState *rs)
2855 {
2856     /* For memory_global_dirty_log_start below.  */
2857     qemu_mutex_lock_iothread();
2858     qemu_mutex_lock_ramlist();
2859
2860     WITH_RCU_READ_LOCK_GUARD() {
2861         ram_list_init_bitmaps();
2862         /* We don't use dirty log with background snapshots */
2863         if (!migrate_background_snapshot()) {
2864             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2865             migration_bitmap_sync_precopy(rs, false);
2866         }
2867     }
2868     qemu_mutex_unlock_ramlist();
2869     qemu_mutex_unlock_iothread();
2870
2871     /*
2872      * After an eventual first bitmap sync, fixup the initial bitmap
2873      * containing all 1s to exclude any discarded pages from migration.
2874      */
2875     migration_bitmap_clear_discarded_pages(rs);
2876 }
2877
2878 static int ram_init_all(RAMState **rsp)
2879 {
2880     if (ram_state_init(rsp)) {
2881         return -1;
2882     }
2883
2884     if (xbzrle_init()) {
2885         ram_state_cleanup(rsp);
2886         return -1;
2887     }
2888
2889     ram_init_bitmaps(*rsp);
2890
2891     return 0;
2892 }
2893
2894 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2895 {
2896     RAMBlock *block;
2897     uint64_t pages = 0;
2898
2899     /*
2900      * Postcopy is not using xbzrle/compression, so no need for that.
2901      * Also, since source are already halted, we don't need to care
2902      * about dirty page logging as well.
2903      */
2904
2905     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2906         pages += bitmap_count_one(block->bmap,
2907                                   block->used_length >> TARGET_PAGE_BITS);
2908     }
2909
2910     /* This may not be aligned with current bitmaps. Recalculate. */
2911     rs->migration_dirty_pages = pages;
2912
2913     ram_state_reset(rs);
2914
2915     /* Update RAMState cache of output QEMUFile */
2916     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2917
2918     trace_ram_state_resume_prepare(pages);
2919 }
2920
2921 /*
2922  * This function clears bits of the free pages reported by the caller from the
2923  * migration dirty bitmap. @addr is the host address corresponding to the
2924  * start of the continuous guest free pages, and @len is the total bytes of
2925  * those pages.
2926  */
2927 void qemu_guest_free_page_hint(void *addr, size_t len)
2928 {
2929     RAMBlock *block;
2930     ram_addr_t offset;
2931     size_t used_len, start, npages;
2932     MigrationState *s = migrate_get_current();
2933
2934     /* This function is currently expected to be used during live migration */
2935     if (!migration_is_setup_or_active(s->state)) {
2936         return;
2937     }
2938
2939     for (; len > 0; len -= used_len, addr += used_len) {
2940         block = qemu_ram_block_from_host(addr, false, &offset);
2941         if (unlikely(!block || offset >= block->used_length)) {
2942             /*
2943              * The implementation might not support RAMBlock resize during
2944              * live migration, but it could happen in theory with future
2945              * updates. So we add a check here to capture that case.
2946              */
2947             error_report_once("%s unexpected error", __func__);
2948             return;
2949         }
2950
2951         if (len <= block->used_length - offset) {
2952             used_len = len;
2953         } else {
2954             used_len = block->used_length - offset;
2955         }
2956
2957         start = offset >> TARGET_PAGE_BITS;
2958         npages = used_len >> TARGET_PAGE_BITS;
2959
2960         qemu_mutex_lock(&ram_state->bitmap_mutex);
2961         /*
2962          * The skipped free pages are equavalent to be sent from clear_bmap's
2963          * perspective, so clear the bits from the memory region bitmap which
2964          * are initially set. Otherwise those skipped pages will be sent in
2965          * the next round after syncing from the memory region bitmap.
2966          */
2967         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2968         ram_state->migration_dirty_pages -=
2969                       bitmap_count_one_with_offset(block->bmap, start, npages);
2970         bitmap_clear(block->bmap, start, npages);
2971         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2972     }
2973 }
2974
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
2981
2982 /**
2983  * ram_save_setup: Setup RAM for migration
2984  *
2985  * Returns zero to indicate success and negative for error
2986  *
2987  * @f: QEMUFile where to send the data
2988  * @opaque: RAMState pointer
2989  */
2990 static int ram_save_setup(QEMUFile *f, void *opaque)
2991 {
2992     RAMState **rsp = opaque;
2993     RAMBlock *block;
2994     int ret;
2995
2996     if (compress_threads_save_setup()) {
2997         return -1;
2998     }
2999
3000     /* migration has already setup the bitmap, reuse it. */
3001     if (!migration_in_colo_state()) {
3002         if (ram_init_all(rsp) != 0) {
3003             compress_threads_save_cleanup();
3004             return -1;
3005         }
3006     }
3007     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3008
3009     WITH_RCU_READ_LOCK_GUARD() {
3010         qemu_put_be64(f, ram_bytes_total_with_ignored()
3011                          | RAM_SAVE_FLAG_MEM_SIZE);
3012
3013         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3014             qemu_put_byte(f, strlen(block->idstr));
3015             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3016             qemu_put_be64(f, block->used_length);
3017             if (migrate_postcopy_ram() && block->page_size !=
3018                                           qemu_host_page_size) {
3019                 qemu_put_be64(f, block->page_size);
3020             }
3021             if (migrate_ignore_shared()) {
3022                 qemu_put_be64(f, block->mr->addr);
3023             }
3024         }
3025     }
3026
3027     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3028     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3029
3030     migration_ops = g_malloc0(sizeof(MigrationOps));
3031     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3032     ret = multifd_send_sync_main(f);
3033     if (ret < 0) {
3034         return ret;
3035     }
3036
3037     if (!migrate_multifd_flush_after_each_section()) {
3038         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3039     }
3040
3041     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3042     qemu_fflush(f);
3043
3044     return 0;
3045 }
3046
3047 /**
3048  * ram_save_iterate: iterative stage for migration
3049  *
3050  * Returns zero to indicate success and negative for error
3051  *
3052  * @f: QEMUFile where to send the data
3053  * @opaque: RAMState pointer
3054  */
3055 static int ram_save_iterate(QEMUFile *f, void *opaque)
3056 {
3057     RAMState **temp = opaque;
3058     RAMState *rs = *temp;
3059     int ret = 0;
3060     int i;
3061     int64_t t0;
3062     int done = 0;
3063
3064     if (blk_mig_bulk_active()) {
3065         /* Avoid transferring ram during bulk phase of block migration as
3066          * the bulk phase will usually take a long time and transferring
3067          * ram updates during that time is pointless. */
3068         goto out;
3069     }
3070
3071     /*
3072      * We'll take this lock a little bit long, but it's okay for two reasons.
3073      * Firstly, the only possible other thread to take it is who calls
3074      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3075      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3076      * guarantees that we'll at least released it in a regular basis.
3077      */
3078     qemu_mutex_lock(&rs->bitmap_mutex);
3079     WITH_RCU_READ_LOCK_GUARD() {
3080         if (ram_list.version != rs->last_version) {
3081             ram_state_reset(rs);
3082         }
3083
3084         /* Read version before ram_list.blocks */
3085         smp_rmb();
3086
3087         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3088
3089         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3090         i = 0;
3091         while ((ret = migration_rate_exceeded(f)) == 0 ||
3092                postcopy_has_request(rs)) {
3093             int pages;
3094
3095             if (qemu_file_get_error(f)) {
3096                 break;
3097             }
3098
3099             pages = ram_find_and_save_block(rs);
3100             /* no more pages to sent */
3101             if (pages == 0) {
3102                 done = 1;
3103                 break;
3104             }
3105
3106             if (pages < 0) {
3107                 qemu_file_set_error(f, pages);
3108                 break;
3109             }
3110
3111             rs->target_page_count += pages;
3112
3113             /*
3114              * During postcopy, it is necessary to make sure one whole host
3115              * page is sent in one chunk.
3116              */
3117             if (migrate_postcopy_ram()) {
3118                 ram_flush_compressed_data(rs);
3119             }
3120
3121             /*
3122              * we want to check in the 1st loop, just in case it was the 1st
3123              * time and we had to sync the dirty bitmap.
3124              * qemu_clock_get_ns() is a bit expensive, so we only check each
3125              * some iterations
3126              */
3127             if ((i & 63) == 0) {
3128                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3129                               1000000;
3130                 if (t1 > MAX_WAIT) {
3131                     trace_ram_save_iterate_big_wait(t1, i);
3132                     break;
3133                 }
3134             }
3135             i++;
3136         }
3137     }
3138     qemu_mutex_unlock(&rs->bitmap_mutex);
3139
3140     /*
3141      * Must occur before EOS (or any QEMUFile operation)
3142      * because of RDMA protocol.
3143      */
3144     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3145
3146 out:
3147     if (ret >= 0
3148         && migration_is_setup_or_active(migrate_get_current()->state)) {
3149         if (migrate_multifd_flush_after_each_section()) {
3150             ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3151             if (ret < 0) {
3152                 return ret;
3153             }
3154         }
3155
3156         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3157         qemu_fflush(f);
3158         ram_transferred_add(8);
3159
3160         ret = qemu_file_get_error(f);
3161     }
3162     if (ret < 0) {
3163         return ret;
3164     }
3165
3166     return done;
3167 }
3168
3169 /**
3170  * ram_save_complete: function called to send the remaining amount of ram
3171  *
3172  * Returns zero to indicate success or negative on error
3173  *
3174  * Called with iothread lock
3175  *
3176  * @f: QEMUFile where to send the data
3177  * @opaque: RAMState pointer
3178  */
3179 static int ram_save_complete(QEMUFile *f, void *opaque)
3180 {
3181     RAMState **temp = opaque;
3182     RAMState *rs = *temp;
3183     int ret = 0;
3184
3185     rs->last_stage = !migration_in_colo_state();
3186
3187     WITH_RCU_READ_LOCK_GUARD() {
3188         if (!migration_in_postcopy()) {
3189             migration_bitmap_sync_precopy(rs, true);
3190         }
3191
3192         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3193
3194         /* try transferring iterative blocks of memory */
3195
3196         /* flush all remaining blocks regardless of rate limiting */
3197         qemu_mutex_lock(&rs->bitmap_mutex);
3198         while (true) {
3199             int pages;
3200
3201             pages = ram_find_and_save_block(rs);
3202             /* no more blocks to sent */
3203             if (pages == 0) {
3204                 break;
3205             }
3206             if (pages < 0) {
3207                 ret = pages;
3208                 break;
3209             }
3210         }
3211         qemu_mutex_unlock(&rs->bitmap_mutex);
3212
3213         ram_flush_compressed_data(rs);
3214         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3215     }
3216
3217     if (ret < 0) {
3218         return ret;
3219     }
3220
3221     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3222     if (ret < 0) {
3223         return ret;
3224     }
3225
3226     if (!migrate_multifd_flush_after_each_section()) {
3227         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3228     }
3229     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3230     qemu_fflush(f);
3231
3232     return 0;
3233 }
3234
3235 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3236                                        uint64_t *can_postcopy)
3237 {
3238     RAMState **temp = opaque;
3239     RAMState *rs = *temp;
3240
3241     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3242
3243     if (migrate_postcopy_ram()) {
3244         /* We can do postcopy, and all the data is postcopiable */
3245         *can_postcopy += remaining_size;
3246     } else {
3247         *must_precopy += remaining_size;
3248     }
3249 }
3250
3251 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3252                                     uint64_t *can_postcopy)
3253 {
3254     MigrationState *s = migrate_get_current();
3255     RAMState **temp = opaque;
3256     RAMState *rs = *temp;
3257
3258     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3259
3260     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3261         qemu_mutex_lock_iothread();
3262         WITH_RCU_READ_LOCK_GUARD() {
3263             migration_bitmap_sync_precopy(rs, false);
3264         }
3265         qemu_mutex_unlock_iothread();
3266         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3267     }
3268
3269     if (migrate_postcopy_ram()) {
3270         /* We can do postcopy, and all the data is postcopiable */
3271         *can_postcopy += remaining_size;
3272     } else {
3273         *must_precopy += remaining_size;
3274     }
3275 }
3276
3277 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3278 {
3279     unsigned int xh_len;
3280     int xh_flags;
3281     uint8_t *loaded_data;
3282
3283     /* extract RLE header */
3284     xh_flags = qemu_get_byte(f);
3285     xh_len = qemu_get_be16(f);
3286
3287     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3288         error_report("Failed to load XBZRLE page - wrong compression!");
3289         return -1;
3290     }
3291
3292     if (xh_len > TARGET_PAGE_SIZE) {
3293         error_report("Failed to load XBZRLE page - len overflow!");
3294         return -1;
3295     }
3296     loaded_data = XBZRLE.decoded_buf;
3297     /* load data and decode */
3298     /* it can change loaded_data to point to an internal buffer */
3299     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3300
3301     /* decode RLE */
3302     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3303                              TARGET_PAGE_SIZE) == -1) {
3304         error_report("Failed to load XBZRLE page - decode error!");
3305         return -1;
3306     }
3307
3308     return 0;
3309 }
3310
3311 /**
3312  * ram_block_from_stream: read a RAMBlock id from the migration stream
3313  *
3314  * Must be called from within a rcu critical section.
3315  *
3316  * Returns a pointer from within the RCU-protected ram_list.
3317  *
3318  * @mis: the migration incoming state pointer
3319  * @f: QEMUFile where to read the data from
3320  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3321  * @channel: the channel we're using
3322  */
3323 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3324                                               QEMUFile *f, int flags,
3325                                               int channel)
3326 {
3327     RAMBlock *block = mis->last_recv_block[channel];
3328     char id[256];
3329     uint8_t len;
3330
3331     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3332         if (!block) {
3333             error_report("Ack, bad migration stream!");
3334             return NULL;
3335         }
3336         return block;
3337     }
3338
3339     len = qemu_get_byte(f);
3340     qemu_get_buffer(f, (uint8_t *)id, len);
3341     id[len] = 0;
3342
3343     block = qemu_ram_block_by_name(id);
3344     if (!block) {
3345         error_report("Can't find block %s", id);
3346         return NULL;
3347     }
3348
3349     if (ramblock_is_ignored(block)) {
3350         error_report("block %s should not be migrated !", id);
3351         return NULL;
3352     }
3353
3354     mis->last_recv_block[channel] = block;
3355
3356     return block;
3357 }
3358
3359 static inline void *host_from_ram_block_offset(RAMBlock *block,
3360                                                ram_addr_t offset)
3361 {
3362     if (!offset_in_ramblock(block, offset)) {
3363         return NULL;
3364     }
3365
3366     return block->host + offset;
3367 }
3368
3369 static void *host_page_from_ram_block_offset(RAMBlock *block,
3370                                              ram_addr_t offset)
3371 {
3372     /* Note: Explicitly no check against offset_in_ramblock(). */
3373     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3374                                    block->page_size);
3375 }
3376
3377 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3378                                                          ram_addr_t offset)
3379 {
3380     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3381 }
3382
3383 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3384 {
3385     qemu_mutex_lock(&ram_state->bitmap_mutex);
3386     for (int i = 0; i < pages; i++) {
3387         ram_addr_t offset = normal[i];
3388         ram_state->migration_dirty_pages += !test_and_set_bit(
3389                                                 offset >> TARGET_PAGE_BITS,
3390                                                 block->bmap);
3391     }
3392     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3393 }
3394
3395 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3396                              ram_addr_t offset, bool record_bitmap)
3397 {
3398     if (!offset_in_ramblock(block, offset)) {
3399         return NULL;
3400     }
3401     if (!block->colo_cache) {
3402         error_report("%s: colo_cache is NULL in block :%s",
3403                      __func__, block->idstr);
3404         return NULL;
3405     }
3406
3407     /*
3408     * During colo checkpoint, we need bitmap of these migrated pages.
3409     * It help us to decide which pages in ram cache should be flushed
3410     * into VM's RAM later.
3411     */
3412     if (record_bitmap) {
3413         colo_record_bitmap(block, &offset, 1);
3414     }
3415     return block->colo_cache + offset;
3416 }
3417
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  The write is skipped when @ch is
 * zero and the destination already is all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero: leave the memory alone */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3434
3435 static void colo_init_ram_state(void)
3436 {
3437     ram_state_init(&ram_state);
3438 }
3439
3440 /*
3441  * colo cache: this is for secondary VM, we cache the whole
3442  * memory of the secondary VM, it is need to hold the global lock
3443  * to call this helper.
3444  */
3445 int colo_init_ram_cache(void)
3446 {
3447     RAMBlock *block;
3448
3449     WITH_RCU_READ_LOCK_GUARD() {
3450         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3451             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3452                                                     NULL, false, false);
3453             if (!block->colo_cache) {
3454                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3455                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3456                              block->used_length);
3457                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3458                     if (block->colo_cache) {
3459                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3460                         block->colo_cache = NULL;
3461                     }
3462                 }
3463                 return -errno;
3464             }
3465             if (!machine_dump_guest_core(current_machine)) {
3466                 qemu_madvise(block->colo_cache, block->used_length,
3467                              QEMU_MADV_DONTDUMP);
3468             }
3469         }
3470     }
3471
3472     /*
3473     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3474     * with to decide which page in cache should be flushed into SVM's RAM. Here
3475     * we use the same name 'ram_bitmap' as for migration.
3476     */
3477     if (ram_bytes_total()) {
3478         RAMBlock *block;
3479
3480         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3481             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3482             block->bmap = bitmap_new(pages);
3483         }
3484     }
3485
3486     colo_init_ram_state();
3487     return 0;
3488 }
3489
3490 /* TODO: duplicated with ram_init_bitmaps */
3491 void colo_incoming_start_dirty_log(void)
3492 {
3493     RAMBlock *block = NULL;
3494     /* For memory_global_dirty_log_start below. */
3495     qemu_mutex_lock_iothread();
3496     qemu_mutex_lock_ramlist();
3497
3498     memory_global_dirty_log_sync(false);
3499     WITH_RCU_READ_LOCK_GUARD() {
3500         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3501             ramblock_sync_dirty_bitmap(ram_state, block);
3502             /* Discard this dirty bitmap record */
3503             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3504         }
3505         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3506     }
3507     ram_state->migration_dirty_pages = 0;
3508     qemu_mutex_unlock_ramlist();
3509     qemu_mutex_unlock_iothread();
3510 }
3511
3512 /* It is need to hold the global lock to call this helper */
3513 void colo_release_ram_cache(void)
3514 {
3515     RAMBlock *block;
3516
3517     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3518     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3519         g_free(block->bmap);
3520         block->bmap = NULL;
3521     }
3522
3523     WITH_RCU_READ_LOCK_GUARD() {
3524         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3525             if (block->colo_cache) {
3526                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3527                 block->colo_cache = NULL;
3528             }
3529         }
3530     }
3531     ram_state_cleanup(&ram_state);
3532 }
3533
3534 /**
3535  * ram_load_setup: Setup RAM for migration incoming side
3536  *
3537  * Returns zero to indicate success and negative for error
3538  *
3539  * @f: QEMUFile where to receive the data
3540  * @opaque: RAMState pointer
3541  */
3542 static int ram_load_setup(QEMUFile *f, void *opaque)
3543 {
3544     xbzrle_load_setup();
3545     ramblock_recv_map_init();
3546
3547     return 0;
3548 }
3549
3550 static int ram_load_cleanup(void *opaque)
3551 {
3552     RAMBlock *rb;
3553
3554     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3555         qemu_ram_block_writeback(rb);
3556     }
3557
3558     xbzrle_load_cleanup();
3559
3560     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3561         g_free(rb->receivedmap);
3562         rb->receivedmap = NULL;
3563     }
3564
3565     return 0;
3566 }
3567
3568 /**
3569  * ram_postcopy_incoming_init: allocate postcopy data structures
3570  *
3571  * Returns 0 for success and negative if there was one error
3572  *
3573  * @mis: current migration incoming state
3574  *
3575  * Allocate data structures etc needed by incoming migration with
3576  * postcopy-ram. postcopy-ram's similarly names
3577  * postcopy_ram_incoming_init does the work.
3578  */
3579 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3580 {
3581     return postcopy_ram_incoming_init(mis);
3582 }
3583
3584 /**
3585  * ram_load_postcopy: load a page in postcopy case
3586  *
3587  * Returns 0 for success or -errno in case of error
3588  *
3589  * Called in postcopy mode by ram_load().
3590  * rcu_read_lock is taken prior to this being called.
3591  *
 * @f: QEMUFile where to receive the data from
3593  * @channel: the channel to use for loading
3594  */
3595 int ram_load_postcopy(QEMUFile *f, int channel)
3596 {
3597     int flags = 0, ret = 0;
3598     bool place_needed = false;
3599     bool matches_target_page_size = false;
3600     MigrationIncomingState *mis = migration_incoming_get_current();
3601     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3602
3603     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3604         ram_addr_t addr;
3605         void *page_buffer = NULL;
3606         void *place_source = NULL;
3607         RAMBlock *block = NULL;
3608         uint8_t ch;
3609         int len;
3610
3611         addr = qemu_get_be64(f);
3612
3613         /*
3614          * If qemu file error, we should stop here, and then "addr"
3615          * may be invalid
3616          */
3617         ret = qemu_file_get_error(f);
3618         if (ret) {
3619             break;
3620         }
3621
3622         flags = addr & ~TARGET_PAGE_MASK;
3623         addr &= TARGET_PAGE_MASK;
3624
3625         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3626         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3627                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3628             block = ram_block_from_stream(mis, f, flags, channel);
3629             if (!block) {
3630                 ret = -EINVAL;
3631                 break;
3632             }
3633
3634             /*
3635              * Relying on used_length is racy and can result in false positives.
3636              * We might place pages beyond used_length in case RAM was shrunk
3637              * while in postcopy, which is fine - trying to place via
3638              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3639              */
3640             if (!block->host || addr >= block->postcopy_length) {
3641                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3642                 ret = -EINVAL;
3643                 break;
3644             }
3645             tmp_page->target_pages++;
3646             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3647             /*
3648              * Postcopy requires that we place whole host pages atomically;
3649              * these may be huge pages for RAMBlocks that are backed by
3650              * hugetlbfs.
3651              * To make it atomic, the data is read into a temporary page
3652              * that's moved into place later.
3653              * The migration protocol uses,  possibly smaller, target-pages
3654              * however the source ensures it always sends all the components
3655              * of a host page in one chunk.
3656              */
3657             page_buffer = tmp_page->tmp_huge_page +
3658                           host_page_offset_from_ram_block_offset(block, addr);
3659             /* If all TP are zero then we can optimise the place */
3660             if (tmp_page->target_pages == 1) {
3661                 tmp_page->host_addr =
3662                     host_page_from_ram_block_offset(block, addr);
3663             } else if (tmp_page->host_addr !=
3664                        host_page_from_ram_block_offset(block, addr)) {
3665                 /* not the 1st TP within the HP */
3666                 error_report("Non-same host page detected on channel %d: "
3667                              "Target host page %p, received host page %p "
3668                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3669                              channel, tmp_page->host_addr,
3670                              host_page_from_ram_block_offset(block, addr),
3671                              block->idstr, addr, tmp_page->target_pages);
3672                 ret = -EINVAL;
3673                 break;
3674             }
3675
3676             /*
3677              * If it's the last part of a host page then we place the host
3678              * page
3679              */
3680             if (tmp_page->target_pages ==
3681                 (block->page_size / TARGET_PAGE_SIZE)) {
3682                 place_needed = true;
3683             }
3684             place_source = tmp_page->tmp_huge_page;
3685         }
3686
3687         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3688         case RAM_SAVE_FLAG_ZERO:
3689             ch = qemu_get_byte(f);
3690             /*
3691              * Can skip to set page_buffer when
3692              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3693              */
3694             if (ch || !matches_target_page_size) {
3695                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3696             }
3697             if (ch) {
3698                 tmp_page->all_zero = false;
3699             }
3700             break;
3701
3702         case RAM_SAVE_FLAG_PAGE:
3703             tmp_page->all_zero = false;
3704             if (!matches_target_page_size) {
3705                 /* For huge pages, we always use temporary buffer */
3706                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3707             } else {
3708                 /*
3709                  * For small pages that matches target page size, we
3710                  * avoid the qemu_file copy.  Instead we directly use
3711                  * the buffer of QEMUFile to place the page.  Note: we
3712                  * cannot do any QEMUFile operation before using that
3713                  * buffer to make sure the buffer is valid when
3714                  * placing the page.
3715                  */
3716                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3717                                          TARGET_PAGE_SIZE);
3718             }
3719             break;
3720         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3721             tmp_page->all_zero = false;
3722             len = qemu_get_be32(f);
3723             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3724                 error_report("Invalid compressed data length: %d", len);
3725                 ret = -EINVAL;
3726                 break;
3727             }
3728             decompress_data_with_multi_threads(f, page_buffer, len);
3729             break;
3730         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3731             multifd_recv_sync_main();
3732             break;
3733         case RAM_SAVE_FLAG_EOS:
3734             /* normal exit */
3735             if (migrate_multifd_flush_after_each_section()) {
3736                 multifd_recv_sync_main();
3737             }
3738             break;
3739         default:
3740             error_report("Unknown combination of migration flags: 0x%x"
3741                          " (postcopy mode)", flags);
3742             ret = -EINVAL;
3743             break;
3744         }
3745
3746         /* Got the whole host page, wait for decompress before placing. */
3747         if (place_needed) {
3748             ret |= wait_for_decompress_done();
3749         }
3750
3751         /* Detect for any possible file errors */
3752         if (!ret && qemu_file_get_error(f)) {
3753             ret = qemu_file_get_error(f);
3754         }
3755
3756         if (!ret && place_needed) {
3757             if (tmp_page->all_zero) {
3758                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3759             } else {
3760                 ret = postcopy_place_page(mis, tmp_page->host_addr,
3761                                           place_source, block);
3762             }
3763             place_needed = false;
3764             postcopy_temp_page_reset(tmp_page);
3765         }
3766     }
3767
3768     return ret;
3769 }
3770
3771 static bool postcopy_is_running(void)
3772 {
3773     PostcopyState ps = postcopy_state_get();
3774     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3775 }
3776
3777 /*
3778  * Flush content of RAM cache into SVM's memory.
3779  * Only flush the pages that be dirtied by PVM or SVM or both.
3780  */
3781 void colo_flush_ram_cache(void)
3782 {
3783     RAMBlock *block = NULL;
3784     void *dst_host;
3785     void *src_host;
3786     unsigned long offset = 0;
3787
3788     memory_global_dirty_log_sync(false);
3789     qemu_mutex_lock(&ram_state->bitmap_mutex);
3790     WITH_RCU_READ_LOCK_GUARD() {
3791         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3792             ramblock_sync_dirty_bitmap(ram_state, block);
3793         }
3794     }
3795
3796     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3797     WITH_RCU_READ_LOCK_GUARD() {
3798         block = QLIST_FIRST_RCU(&ram_list.blocks);
3799
3800         while (block) {
3801             unsigned long num = 0;
3802
3803             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3804             if (!offset_in_ramblock(block,
3805                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3806                 offset = 0;
3807                 num = 0;
3808                 block = QLIST_NEXT_RCU(block, next);
3809             } else {
3810                 unsigned long i = 0;
3811
3812                 for (i = 0; i < num; i++) {
3813                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3814                 }
3815                 dst_host = block->host
3816                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3817                 src_host = block->colo_cache
3818                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3819                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3820                 offset += num;
3821             }
3822         }
3823     }
3824     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3825     trace_colo_flush_ram_cache_end();
3826 }
3827
3828 /**
3829  * ram_load_precopy: load pages in precopy case
3830  *
3831  * Returns 0 for success or -errno in case of error
3832  *
3833  * Called in precopy mode by ram_load().
3834  * rcu_read_lock is taken prior to this being called.
3835  *
3836  * @f: QEMUFile where to send the data
3837  */
3838 static int ram_load_precopy(QEMUFile *f)
3839 {
3840     MigrationIncomingState *mis = migration_incoming_get_current();
3841     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3842     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3843     bool postcopy_advised = migration_incoming_postcopy_advised();
3844     if (!migrate_compress()) {
3845         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3846     }
3847
3848     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3849         ram_addr_t addr, total_ram_bytes;
3850         void *host = NULL, *host_bak = NULL;
3851         uint8_t ch;
3852
3853         /*
3854          * Yield periodically to let main loop run, but an iteration of
3855          * the main loop is expensive, so do it each some iterations
3856          */
3857         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3858             aio_co_schedule(qemu_get_current_aio_context(),
3859                             qemu_coroutine_self());
3860             qemu_coroutine_yield();
3861         }
3862         i++;
3863
3864         addr = qemu_get_be64(f);
3865         flags = addr & ~TARGET_PAGE_MASK;
3866         addr &= TARGET_PAGE_MASK;
3867
3868         if (flags & invalid_flags) {
3869             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3870                 error_report("Received an unexpected compressed page");
3871             }
3872
3873             ret = -EINVAL;
3874             break;
3875         }
3876
3877         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3878                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3879             RAMBlock *block = ram_block_from_stream(mis, f, flags,
3880                                                     RAM_CHANNEL_PRECOPY);
3881
3882             host = host_from_ram_block_offset(block, addr);
3883             /*
3884              * After going into COLO stage, we should not load the page
3885              * into SVM's memory directly, we put them into colo_cache firstly.
3886              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3887              * Previously, we copied all these memory in preparing stage of COLO
3888              * while we need to stop VM, which is a time-consuming process.
3889              * Here we optimize it by a trick, back-up every page while in
3890              * migration process while COLO is enabled, though it affects the
3891              * speed of the migration, but it obviously reduce the downtime of
3892              * back-up all SVM'S memory in COLO preparing stage.
3893              */
3894             if (migration_incoming_colo_enabled()) {
3895                 if (migration_incoming_in_colo_state()) {
3896                     /* In COLO stage, put all pages into cache temporarily */
3897                     host = colo_cache_from_block_offset(block, addr, true);
3898                 } else {
3899                    /*
3900                     * In migration stage but before COLO stage,
3901                     * Put all pages into both cache and SVM's memory.
3902                     */
3903                     host_bak = colo_cache_from_block_offset(block, addr, false);
3904                 }
3905             }
3906             if (!host) {
3907                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3908                 ret = -EINVAL;
3909                 break;
3910             }
3911             if (!migration_incoming_in_colo_state()) {
3912                 ramblock_recv_bitmap_set(block, host);
3913             }
3914
3915             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3916         }
3917
3918         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3919         case RAM_SAVE_FLAG_MEM_SIZE:
3920             /* Synchronize RAM block list */
3921             total_ram_bytes = addr;
3922             while (!ret && total_ram_bytes) {
3923                 RAMBlock *block;
3924                 char id[256];
3925                 ram_addr_t length;
3926
3927                 len = qemu_get_byte(f);
3928                 qemu_get_buffer(f, (uint8_t *)id, len);
3929                 id[len] = 0;
3930                 length = qemu_get_be64(f);
3931
3932                 block = qemu_ram_block_by_name(id);
3933                 if (block && !qemu_ram_is_migratable(block)) {
3934                     error_report("block %s should not be migrated !", id);
3935                     ret = -EINVAL;
3936                 } else if (block) {
3937                     if (length != block->used_length) {
3938                         Error *local_err = NULL;
3939
3940                         ret = qemu_ram_resize(block, length,
3941                                               &local_err);
3942                         if (local_err) {
3943                             error_report_err(local_err);
3944                         }
3945                     }
3946                     /* For postcopy we need to check hugepage sizes match */
3947                     if (postcopy_advised && migrate_postcopy_ram() &&
3948                         block->page_size != qemu_host_page_size) {
3949                         uint64_t remote_page_size = qemu_get_be64(f);
3950                         if (remote_page_size != block->page_size) {
3951                             error_report("Mismatched RAM page size %s "
3952                                          "(local) %zd != %" PRId64,
3953                                          id, block->page_size,
3954                                          remote_page_size);
3955                             ret = -EINVAL;
3956                         }
3957                     }
3958                     if (migrate_ignore_shared()) {
3959                         hwaddr addr = qemu_get_be64(f);
3960                         if (ramblock_is_ignored(block) &&
3961                             block->mr->addr != addr) {
3962                             error_report("Mismatched GPAs for block %s "
3963                                          "%" PRId64 "!= %" PRId64,
3964                                          id, (uint64_t)addr,
3965                                          (uint64_t)block->mr->addr);
3966                             ret = -EINVAL;
3967                         }
3968                     }
3969                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3970                                           block->idstr);
3971                 } else {
3972                     error_report("Unknown ramblock \"%s\", cannot "
3973                                  "accept migration", id);
3974                     ret = -EINVAL;
3975                 }
3976
3977                 total_ram_bytes -= length;
3978             }
3979             break;
3980
3981         case RAM_SAVE_FLAG_ZERO:
3982             ch = qemu_get_byte(f);
3983             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3984             break;
3985
3986         case RAM_SAVE_FLAG_PAGE:
3987             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3988             break;
3989
3990         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3991             len = qemu_get_be32(f);
3992             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3993                 error_report("Invalid compressed data length: %d", len);
3994                 ret = -EINVAL;
3995                 break;
3996             }
3997             decompress_data_with_multi_threads(f, host, len);
3998             break;
3999
4000         case RAM_SAVE_FLAG_XBZRLE:
4001             if (load_xbzrle(f, addr, host) < 0) {
4002                 error_report("Failed to decompress XBZRLE page at "
4003                              RAM_ADDR_FMT, addr);
4004                 ret = -EINVAL;
4005                 break;
4006             }
4007             break;
4008         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4009             multifd_recv_sync_main();
4010             break;
4011         case RAM_SAVE_FLAG_EOS:
4012             /* normal exit */
4013             if (migrate_multifd_flush_after_each_section()) {
4014                 multifd_recv_sync_main();
4015             }
4016             break;
4017         case RAM_SAVE_FLAG_HOOK:
4018             ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4019             break;
4020         default:
4021             error_report("Unknown combination of migration flags: 0x%x", flags);
4022             ret = -EINVAL;
4023         }
4024         if (!ret) {
4025             ret = qemu_file_get_error(f);
4026         }
4027         if (!ret && host_bak) {
4028             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4029         }
4030     }
4031
4032     ret |= wait_for_decompress_done();
4033     return ret;
4034 }
4035
4036 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4037 {
4038     int ret = 0;
4039     static uint64_t seq_iter;
4040     /*
4041      * If system is running in postcopy mode, page inserts to host memory must
4042      * be atomic
4043      */
4044     bool postcopy_running = postcopy_is_running();
4045
4046     seq_iter++;
4047
4048     if (version_id != 4) {
4049         return -EINVAL;
4050     }
4051
4052     /*
4053      * This RCU critical section can be very long running.
4054      * When RCU reclaims in the code start to become numerous,
4055      * it will be necessary to reduce the granularity of this
4056      * critical section.
4057      */
4058     WITH_RCU_READ_LOCK_GUARD() {
4059         if (postcopy_running) {
4060             /*
4061              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4062              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4063              * service fast page faults.
4064              */
4065             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4066         } else {
4067             ret = ram_load_precopy(f);
4068         }
4069     }
4070     trace_ram_load_complete(ret, seq_iter);
4071
4072     return ret;
4073 }
4074
4075 static bool ram_has_postcopy(void *opaque)
4076 {
4077     RAMBlock *rb;
4078     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4079         if (ramblock_is_pmem(rb)) {
4080             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4081                          "is not supported now!", rb->idstr, rb->host);
4082             return false;
4083         }
4084     }
4085
4086     return migrate_postcopy_ram();
4087 }
4088
4089 /* Sync all the dirty bitmap with destination VM.  */
4090 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4091 {
4092     RAMBlock *block;
4093     QEMUFile *file = s->to_dst_file;
4094     int ramblock_count = 0;
4095
4096     trace_ram_dirty_bitmap_sync_start();
4097
4098     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4099         qemu_savevm_send_recv_bitmap(file, block->idstr);
4100         trace_ram_dirty_bitmap_request(block->idstr);
4101         ramblock_count++;
4102     }
4103
4104     trace_ram_dirty_bitmap_sync_wait();
4105
4106     /* Wait until all the ramblocks' dirty bitmap synced */
4107     while (ramblock_count--) {
4108         qemu_sem_wait(&s->rp_state.rp_sem);
4109     }
4110
4111     trace_ram_dirty_bitmap_sync_complete();
4112
4113     return 0;
4114 }
4115
4116 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4117 {
4118     qemu_sem_post(&s->rp_state.rp_sem);
4119 }
4120
4121 /*
4122  * Read the received bitmap, revert it as the initial dirty bitmap.
4123  * This is only used when the postcopy migration is paused but wants
4124  * to resume from a middle point.
4125  */
4126 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4127 {
4128     int ret = -EINVAL;
4129     /* from_dst_file is always valid because we're within rp_thread */
4130     QEMUFile *file = s->rp_state.from_dst_file;
4131     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4132     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4133     uint64_t size, end_mark;
4134
4135     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4136
4137     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4138         error_report("%s: incorrect state %s", __func__,
4139                      MigrationStatus_str(s->state));
4140         return -EINVAL;
4141     }
4142
4143     /*
4144      * Note: see comments in ramblock_recv_bitmap_send() on why we
4145      * need the endianness conversion, and the paddings.
4146      */
4147     local_size = ROUND_UP(local_size, 8);
4148
4149     /* Add paddings */
4150     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4151
4152     size = qemu_get_be64(file);
4153
4154     /* The size of the bitmap should match with our ramblock */
4155     if (size != local_size) {
4156         error_report("%s: ramblock '%s' bitmap size mismatch "
4157                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4158                      block->idstr, size, local_size);
4159         ret = -EINVAL;
4160         goto out;
4161     }
4162
4163     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4164     end_mark = qemu_get_be64(file);
4165
4166     ret = qemu_file_get_error(file);
4167     if (ret || size != local_size) {
4168         error_report("%s: read bitmap failed for ramblock '%s': %d"
4169                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4170                      __func__, block->idstr, ret, local_size, size);
4171         ret = -EIO;
4172         goto out;
4173     }
4174
4175     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4176         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4177                      __func__, block->idstr, end_mark);
4178         ret = -EINVAL;
4179         goto out;
4180     }
4181
4182     /*
4183      * Endianness conversion. We are during postcopy (though paused).
4184      * The dirty bitmap won't change. We can directly modify it.
4185      */
4186     bitmap_from_le(block->bmap, le_bitmap, nbits);
4187
4188     /*
4189      * What we received is "received bitmap". Revert it as the initial
4190      * dirty bitmap for this ramblock.
4191      */
4192     bitmap_complement(block->bmap, block->bmap, nbits);
4193
4194     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4195     ramblock_dirty_bitmap_clear_discarded_pages(block);
4196
4197     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4198     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4199
4200     /*
4201      * We succeeded to sync bitmap for current ramblock. If this is
4202      * the last one to sync, we need to notify the main send thread.
4203      */
4204     ram_dirty_bitmap_reload_notify(s);
4205
4206     ret = 0;
4207 out:
4208     g_free(le_bitmap);
4209     return ret;
4210 }
4211
4212 static int ram_resume_prepare(MigrationState *s, void *opaque)
4213 {
4214     RAMState *rs = *(RAMState **)opaque;
4215     int ret;
4216
4217     ret = ram_dirty_bitmap_sync_all(s, rs);
4218     if (ret) {
4219         return ret;
4220     }
4221
4222     ram_state_resume_prepare(rs, s->to_dst_file);
4223
4224     return 0;
4225 }
4226
4227 void postcopy_preempt_shutdown_file(MigrationState *s)
4228 {
4229     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4230     qemu_fflush(s->postcopy_qemufile_src);
4231 }
4232
4233 static SaveVMHandlers savevm_ram_handlers = {
4234     .save_setup = ram_save_setup,
4235     .save_live_iterate = ram_save_iterate,
4236     .save_live_complete_postcopy = ram_save_complete,
4237     .save_live_complete_precopy = ram_save_complete,
4238     .has_postcopy = ram_has_postcopy,
4239     .state_pending_exact = ram_state_pending_exact,
4240     .state_pending_estimate = ram_state_pending_estimate,
4241     .load_state = ram_load,
4242     .save_cleanup = ram_save_cleanup,
4243     .load_setup = ram_load_setup,
4244     .load_cleanup = ram_load_cleanup,
4245     .resume_prepare = ram_resume_prepare,
4246 };
4247
4248 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4249                                       size_t old_size, size_t new_size)
4250 {
4251     PostcopyState ps = postcopy_state_get();
4252     ram_addr_t offset;
4253     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4254     Error *err = NULL;
4255
4256     if (ramblock_is_ignored(rb)) {
4257         return;
4258     }
4259
4260     if (!migration_is_idle()) {
4261         /*
4262          * Precopy code on the source cannot deal with the size of RAM blocks
4263          * changing at random points in time - especially after sending the
4264          * RAM block sizes in the migration stream, they must no longer change.
4265          * Abort and indicate a proper reason.
4266          */
4267         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4268         migration_cancel(err);
4269         error_free(err);
4270     }
4271
4272     switch (ps) {
4273     case POSTCOPY_INCOMING_ADVISE:
4274         /*
4275          * Update what ram_postcopy_incoming_init()->init_range() does at the
4276          * time postcopy was advised. Syncing RAM blocks with the source will
4277          * result in RAM resizes.
4278          */
4279         if (old_size < new_size) {
4280             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4281                 error_report("RAM block '%s' discard of resized RAM failed",
4282                              rb->idstr);
4283             }
4284         }
4285         rb->postcopy_length = new_size;
4286         break;
4287     case POSTCOPY_INCOMING_NONE:
4288     case POSTCOPY_INCOMING_RUNNING:
4289     case POSTCOPY_INCOMING_END:
4290         /*
4291          * Once our guest is running, postcopy does no longer care about
4292          * resizes. When growing, the new memory was not available on the
4293          * source, no handler needed.
4294          */
4295         break;
4296     default:
4297         error_report("RAM block '%s' resized during postcopy state: %d",
4298                      rb->idstr, ps);
4299         exit(-1);
4300     }
4301 }
4302
4303 static RAMBlockNotifier ram_mig_ram_notifier = {
4304     .ram_block_resized = ram_mig_ram_block_resized,
4305 };
4306
4307 void ram_mig_init(void)
4308 {
4309     qemu_mutex_init(&XBZRLE.lock);
4310     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4311     ram_block_notifier_add(&ram_mig_ram_notifier);
4312 }