migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram-compress.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration-stats.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-types-migration.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/cpu-throttle.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59 #include "multifd.h"
  60 #include "sysemu/runstate.h"
  61 #include "options.h"
  62
  63 #include "hw/boards.h" /* for machine_dump_guest_core() */
  64
  65 #if defined(__linux__)
  66 #include "qemu/userfaultfd.h"
  67 #endif /* defined(__linux__) */
  68
  69 /***********************************************************/
  70 /* ram save/restore */
  71
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS and was used
 * for pages filled with the same char.  It now only covers pages filled
 * with zeros, and was renamed to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */
  92
  93 XBZRLECacheStats xbzrle_counters;
  94
  95 /* used by the search for pages to send */
  96 struct PageSearchStatus {
  97     /* The migration channel used for a specific host page */
  98     QEMUFile    *pss_channel;
  99     /* Last block from where we have sent data */
 100     RAMBlock *last_sent_block;
 101     /* Current block being searched */
 102     RAMBlock    *block;
 103     /* Current page to search from */
 104     unsigned long page;
 105     /* Set once we wrap around */
 106     bool         complete_round;
 107     /* Whether we're sending a host page */
 108     bool          host_page_sending;
 109     /* The start/end of current host page.  Invalid if host_page_sending==false */
 110     unsigned long host_page_start;
 111     unsigned long host_page_end;
 112 };
 113 typedef struct PageSearchStatus PageSearchStatus;
 114
/*
 * XBZRLE state: the page cache plus the scratch buffers and the
 * all-zero page used by the XBZRLE code in this file.
 */
static struct {
    /* output buffer that XBZRLE encoding writes into */
    uint8_t *encoded_buf;
    /* private copy of the page being encoded */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 130
 131 static void XBZRLE_cache_lock(void)
 132 {
 133     if (migrate_xbzrle()) {
 134         qemu_mutex_lock(&XBZRLE.lock);
 135     }
 136 }
 137
 138 static void XBZRLE_cache_unlock(void)
 139 {
 140     if (migrate_xbzrle()) {
 141         qemu_mutex_unlock(&XBZRLE.lock);
 142     }
 143 }
 144
 145 /**
 146  * xbzrle_cache_resize: resize the xbzrle cache
 147  *
 148  * This function is called from migrate_params_apply in main
 149  * thread, possibly while a migration is in progress.  A running
 150  * migration may be using the cache and might finish during this call,
 151  * hence changes to the cache are protected by XBZRLE.lock().
 152  *
 153  * Returns 0 for success or -1 for error
 154  *
 155  * @new_size: new cache size
 156  * @errp: set *errp if the check failed, with reason
 157  */
 158 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 159 {
 160     PageCache *new_cache;
 161     int64_t ret = 0;
 162
 163     /* Check for truncation */
 164     if (new_size != (size_t)new_size) {
 165         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 166                    "exceeding address space");
 167         return -1;
 168     }
 169
 170     if (new_size == migrate_xbzrle_cache_size()) {
 171         /* nothing to do */
 172         return 0;
 173     }
 174
 175     XBZRLE_cache_lock();
 176
 177     if (XBZRLE.cache != NULL) {
 178         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 179         if (!new_cache) {
 180             ret = -1;
 181             goto out;
 182         }
 183
 184         cache_fini(XBZRLE.cache);
 185         XBZRLE.cache = new_cache;
 186     }
 187 out:
 188     XBZRLE_cache_unlock();
 189     return ret;
 190 }
 191
 192 static bool postcopy_preempt_active(void)
 193 {
 194     return migrate_postcopy_preempt() && migration_in_postcopy();
 195 }
 196
 197 bool ramblock_is_ignored(RAMBlock *block)
 198 {
 199     return !qemu_ram_is_migratable(block) ||
 200            (migrate_ignore_shared() && qemu_ram_is_shared(block)
 201                                     && qemu_ram_is_named_file(block));
 202 }
 203
 204 #undef RAMBLOCK_FOREACH
 205
 206 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 207 {
 208     RAMBlock *block;
 209     int ret = 0;
 210
 211     RCU_READ_LOCK_GUARD();
 212
 213     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 214         ret = func(block, opaque);
 215         if (ret) {
 216             break;
 217         }
 218     }
 219     return ret;
 220 }
 221
 222 static void ramblock_recv_map_init(void)
 223 {
 224     RAMBlock *rb;
 225
 226     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 227         assert(!rb->receivedmap);
 228         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 229     }
 230 }
 231
/*
 * Test the bit of @rb's received bitmap that corresponds to @host_addr
 * (bit index computed by ramblock_recv_bitmap_offset()).
 *
 * Returns the result of test_bit() on rb->receivedmap.
 */
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}
 237
 238 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 239 {
 240     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 241 }
 242
/*
 * Atomically set the bit of @rb's received bitmap that corresponds to
 * @host_addr (bit index computed by ramblock_recv_bitmap_offset()).
 */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}
 247
/*
 * Atomically set @nr consecutive bits of @rb's received bitmap, starting
 * at the bit that corresponds to @host_addr.
 */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
 255
 256 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 257
 258 /*
 259  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 260  *
 261  * Returns >0 if success with sent bytes, or <0 if error.
 262  */
 263 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 264                                   const char *block_name)
 265 {
 266     RAMBlock *block = qemu_ram_block_by_name(block_name);
 267     unsigned long *le_bitmap, nbits;
 268     uint64_t size;
 269
 270     if (!block) {
 271         error_report("%s: invalid block name: %s", __func__, block_name);
 272         return -1;
 273     }
 274
 275     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 276
 277     /*
 278      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 279      * machines we may need 4 more bytes for padding (see below
 280      * comment). So extend it a bit before hand.
 281      */
 282     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 283
 284     /*
 285      * Always use little endian when sending the bitmap. This is
 286      * required that when source and destination VMs are not using the
 287      * same endianness. (Note: big endian won't work.)
 288      */
 289     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 290
 291     /* Size of the bitmap, in bytes */
 292     size = DIV_ROUND_UP(nbits, 8);
 293
 294     /*
 295      * size is always aligned to 8 bytes for 64bit machines, but it
 296      * may not be true for 32bit machines. We need this padding to
 297      * make sure the migration can survive even between 32bit and
 298      * 64bit machines.
 299      */
 300     size = ROUND_UP(size, 8);
 301
 302     qemu_put_be64(file, size);
 303     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 304     /*
 305      * Mark as an end, in case the middle part is screwed up due to
 306      * some "mysterious" reason.
 307      */
 308     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 309     qemu_fflush(file);
 310
 311     g_free(le_bitmap);
 312
 313     if (qemu_file_get_error(file)) {
 314         return qemu_file_get_error(file);
 315     }
 316
 317     return size + sizeof(size);
 318 }
 319
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Requests are linked through next_req into
 * RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* Block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably offset/length in bytes within rb — confirm */
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 331
 332 /* State of RAM for migration */
 333 struct RAMState {
 334     /*
 335      * PageSearchStatus structures for the channels when send pages.
 336      * Protected by the bitmap_mutex.
 337      */
 338     PageSearchStatus pss[RAM_CHANNEL_MAX];
 339     /* UFFD file descriptor, used in 'write-tracking' migration */
 340     int uffdio_fd;
 341     /* total ram size in bytes */
 342     uint64_t ram_bytes_total;
 343     /* Last block that we have visited searching for dirty pages */
 344     RAMBlock *last_seen_block;
 345     /* Last dirty target page we have sent */
 346     ram_addr_t last_page;
 347     /* last ram version we have seen */
 348     uint32_t last_version;
 349     /* How many times we have dirty too many pages */
 350     int dirty_rate_high_cnt;
 351     /* these variables are used for bitmap sync */
 352     /* last time we did a full bitmap_sync */
 353     int64_t time_last_bitmap_sync;
 354     /* bytes transferred at start_time */
 355     uint64_t bytes_xfer_prev;
 356     /* number of dirty pages since start_time */
 357     uint64_t num_dirty_pages_period;
 358     /* xbzrle misses since the beginning of the period */
 359     uint64_t xbzrle_cache_miss_prev;
 360     /* Amount of xbzrle pages since the beginning of the period */
 361     uint64_t xbzrle_pages_prev;
 362     /* Amount of xbzrle encoded bytes since the beginning of the period */
 363     uint64_t xbzrle_bytes_prev;
 364     /* Are we really using XBZRLE (e.g., after the first round). */
 365     bool xbzrle_started;
 366     /* Are we on the last stage of migration */
 367     bool last_stage;
 368     /* compression statistics since the beginning of the period */
 369     /* amount of count that no free thread to compress data */
 370     uint64_t compress_thread_busy_prev;
 371     /* amount bytes after compression */
 372     uint64_t compressed_size_prev;
 373     /* amount of compressed pages */
 374     uint64_t compress_pages_prev;
 375
 376     /* total handled target pages at the beginning of period */
 377     uint64_t target_page_count_prev;
 378     /* total handled target pages since start */
 379     uint64_t target_page_count;
 380     /* number of dirty bits in the bitmap */
 381     uint64_t migration_dirty_pages;
 382     /*
 383      * Protects:
 384      * - dirty/clear bitmap
 385      * - migration_dirty_pages
 386      * - pss structures
 387      */
 388     QemuMutex bitmap_mutex;
 389     /* The RAMBlock used in the last src_page_requests */
 390     RAMBlock *last_req_rb;
 391     /* Queue of outstanding page requests from the destination */
 392     QemuMutex src_page_req_mutex;
 393     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 394 };
 395 typedef struct RAMState RAMState;
 396
 397 static RAMState *ram_state;
 398
 399 static NotifierWithReturnList precopy_notifier_list;
 400
 401 /* Whether postcopy has queued requests? */
 402 static bool postcopy_has_request(RAMState *rs)
 403 {
 404     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 405 }
 406
/* Initialise the list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 411
/* Register @n on the list of precopy notifiers. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 416
/* Unregister the notifier @n. */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 421
 422 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 423 {
 424     PrecopyNotifyData pnd;
 425     pnd.reason = reason;
 426     pnd.errp = errp;
 427
 428     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 429 }
 430
 431 uint64_t ram_bytes_remaining(void)
 432 {
 433     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 434                        0;
 435 }
 436
 437 void ram_transferred_add(uint64_t bytes)
 438 {
 439     if (runstate_is_running()) {
 440         stat64_add(&mig_stats.precopy_bytes, bytes);
 441     } else if (migration_in_postcopy()) {
 442         stat64_add(&mig_stats.postcopy_bytes, bytes);
 443     } else {
 444         stat64_add(&mig_stats.downtime_bytes, bytes);
 445     }
 446     stat64_add(&mig_stats.transferred, bytes);
 447 }
 448
 449 struct MigrationOps {
 450     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
 451 };
 452 typedef struct MigrationOps MigrationOps;
 453
 454 MigrationOps *migration_ops;
 455
 456 static int ram_save_host_page_urgent(PageSearchStatus *pss);
 457
 458 /* NOTE: page is the PFN not real ram_addr_t. */
 459 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 460 {
 461     pss->block = rb;
 462     pss->page = page;
 463     pss->complete_round = false;
 464 }
 465
 466 /*
 467  * Check whether two PSSs are actively sending the same page.  Return true
 468  * if it is, false otherwise.
 469  */
 470 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 471 {
 472     return pss1->host_page_sending && pss2->host_page_sending &&
 473         (pss1->host_page_start == pss2->host_page_start);
 474 }
 475
 476 /**
 477  * save_page_header: write page header to wire
 478  *
 479  * If this is the 1st block, it also writes the block identification
 480  *
 481  * Returns the number of bytes written
 482  *
 483  * @pss: current PSS channel status
 484  * @block: block that contains the page we want to send
 485  * @offset: offset inside the block for the page
 486  *          in the lower bits, it contains flags
 487  */
 488 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
 489                                RAMBlock *block, ram_addr_t offset)
 490 {
 491     size_t size, len;
 492     bool same_block = (block == pss->last_sent_block);
 493
 494     if (same_block) {
 495         offset |= RAM_SAVE_FLAG_CONTINUE;
 496     }
 497     qemu_put_be64(f, offset);
 498     size = 8;
 499
 500     if (!same_block) {
 501         len = strlen(block->idstr);
 502         qemu_put_byte(f, len);
 503         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 504         size += 1 + len;
 505         pss->last_sent_block = block;
 506     }
 507     return size;
 508 }
 509
 510 /**
 511  * mig_throttle_guest_down: throttle down the guest
 512  *
 513  * Reduce amount of guest cpu execution to hopefully slow down memory
 514  * writes. If guest dirty memory rate is reduced below the rate at
 515  * which we can transfer pages to the destination then we should be
 516  * able to complete migration. Some workloads dirty memory way too
 517  * fast and will not effectively converge, even with auto-converge.
 518  */
 519 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 520                                     uint64_t bytes_dirty_threshold)
 521 {
 522     uint64_t pct_initial = migrate_cpu_throttle_initial();
 523     uint64_t pct_increment = migrate_cpu_throttle_increment();
 524     bool pct_tailslow = migrate_cpu_throttle_tailslow();
 525     int pct_max = migrate_max_cpu_throttle();
 526
 527     uint64_t throttle_now = cpu_throttle_get_percentage();
 528     uint64_t cpu_now, cpu_ideal, throttle_inc;
 529
 530     /* We have not started throttling yet. Let's start it. */
 531     if (!cpu_throttle_active()) {
 532         cpu_throttle_set(pct_initial);
 533     } else {
 534         /* Throttling already on, just increase the rate */
 535         if (!pct_tailslow) {
 536             throttle_inc = pct_increment;
 537         } else {
 538             /* Compute the ideal CPU percentage used by Guest, which may
 539              * make the dirty rate match the dirty rate threshold. */
 540             cpu_now = 100 - throttle_now;
 541             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 542                         bytes_dirty_period);
 543             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 544         }
 545         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 546     }
 547 }
 548
 549 void mig_throttle_counter_reset(void)
 550 {
 551     RAMState *rs = ram_state;
 552
 553     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 554     rs->num_dirty_pages_period = 0;
 555     rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
 556 }
 557
 558 /**
 559  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 560  *
 561  * @rs: current RAM state
 562  * @current_addr: address for the zero page
 563  *
 564  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 565  * The important thing is that a stale (not-yet-0'd) page be replaced
 566  * by the new data.
 567  * As a bonus, if the page wasn't in the cache it gets added so that
 568  * when a small write is made into the 0'd page it gets XBZRLE sent.
 569  */
 570 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 571 {
 572     /* We don't care if this fails to allocate a new cache page
 573      * as long as it updated an old one */
 574     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 575                  stat64_get(&mig_stats.dirty_sync_count));
 576 }
 577
 578 #define ENCODING_FLAG_XBZRLE 0x1
 579
 580 /**
 581  * save_xbzrle_page: compress and send current page
 582  *
 583  * Returns: 1 means that we wrote the page
 584  *          0 means that page is identical to the one already sent
 585  *          -1 means that xbzrle would be longer than normal
 586  *
 587  * @rs: current RAM state
 588  * @pss: current PSS channel
 589  * @current_data: pointer to the address of the page contents
 590  * @current_addr: addr of the page
 591  * @block: block that contains the page we want to send
 592  * @offset: offset inside the block for the page
 593  */
 594 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 595                             uint8_t **current_data, ram_addr_t current_addr,
 596                             RAMBlock *block, ram_addr_t offset)
 597 {
 598     int encoded_len = 0, bytes_xbzrle;
 599     uint8_t *prev_cached_page;
 600     QEMUFile *file = pss->pss_channel;
 601     uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
 602
 603     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
 604         xbzrle_counters.cache_miss++;
 605         if (!rs->last_stage) {
 606             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 607                              generation) == -1) {
 608                 return -1;
 609             } else {
 610                 /* update *current_data when the page has been
 611                    inserted into cache */
 612                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 613             }
 614         }
 615         return -1;
 616     }
 617
 618     /*
 619      * Reaching here means the page has hit the xbzrle cache, no matter what
 620      * encoding result it is (normal encoding, overflow or skipping the page),
 621      * count the page as encoded. This is used to calculate the encoding rate.
 622      *
 623      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 624      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 625      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 626      * skipped page included. In this way, the encoding rate can tell if the
 627      * guest page is good for xbzrle encoding.
 628      */
 629     xbzrle_counters.pages++;
 630     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 631
 632     /* save current buffer into memory */
 633     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 634
 635     /* XBZRLE encoding (if there is no overflow) */
 636     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 637                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 638                                        TARGET_PAGE_SIZE);
 639
 640     /*
 641      * Update the cache contents, so that it corresponds to the data
 642      * sent, in all cases except where we skip the page.
 643      */
 644     if (!rs->last_stage && encoded_len != 0) {
 645         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 646         /*
 647          * In the case where we couldn't compress, ensure that the caller
 648          * sends the data from the cache, since the guest might have
 649          * changed the RAM since we copied it.
 650          */
 651         *current_data = prev_cached_page;
 652     }
 653
 654     if (encoded_len == 0) {
 655         trace_save_xbzrle_page_skipping();
 656         return 0;
 657     } else if (encoded_len == -1) {
 658         trace_save_xbzrle_page_overflow();
 659         xbzrle_counters.overflow++;
 660         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 661         return -1;
 662     }
 663
 664     /* Send XBZRLE based compressed page */
 665     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
 666                                     offset | RAM_SAVE_FLAG_XBZRLE);
 667     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 668     qemu_put_be16(file, encoded_len);
 669     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 670     bytes_xbzrle += encoded_len + 1 + 2;
 671     /*
 672      * Like compressed_size (please see update_compress_thread_counts),
 673      * the xbzrle encoded bytes don't count the 8 byte header with
 674      * RAM_SAVE_FLAG_CONTINUE.
 675      */
 676     xbzrle_counters.bytes += bytes_xbzrle - 8;
 677     ram_transferred_add(bytes_xbzrle);
 678
 679     return 1;
 680 }
 681
 682 /**
 683  * pss_find_next_dirty: find the next dirty page of current ramblock
 684  *
 685  * This function updates pss->page to point to the next dirty page index
 686  * within the ramblock to migrate, or the end of ramblock when nothing
 687  * found.  Note that when pss->host_page_sending==true it means we're
 688  * during sending a host page, so we won't look for dirty page that is
 689  * outside the host page boundary.
 690  *
 691  * @pss: the current page search status
 692  */
 693 static void pss_find_next_dirty(PageSearchStatus *pss)
 694 {
 695     RAMBlock *rb = pss->block;
 696     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 697     unsigned long *bitmap = rb->bmap;
 698
 699     if (ramblock_is_ignored(rb)) {
 700         /* Points directly to the end, so we know no dirty page */
 701         pss->page = size;
 702         return;
 703     }
 704
 705     /*
 706      * If during sending a host page, only look for dirty pages within the
 707      * current host page being send.
 708      */
 709     if (pss->host_page_sending) {
 710         assert(pss->host_page_end);
 711         size = MIN(size, pss->host_page_end);
 712     }
 713
 714     pss->page = find_next_bit(bitmap, size, pss->page);
 715 }
 716
 717 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 718                                                        unsigned long page)
 719 {
 720     uint8_t shift;
 721     hwaddr size, start;
 722
 723     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 724         return;
 725     }
 726
 727     shift = rb->clear_bmap_shift;
 728     /*
 729      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 730      * can make things easier sometimes since then start address
 731      * of the small chunk will always be 64 pages aligned so the
 732      * bitmap will always be aligned to unsigned long. We should
 733      * even be able to remove this restriction but I'm simply
 734      * keeping it.
 735      */
 736     assert(shift >= 6);
 737
 738     size = 1ULL << (TARGET_PAGE_BITS + shift);
 739     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 740     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 741     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 742 }
 743
 744 static void
 745 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 746                                                  unsigned long start,
 747                                                  unsigned long npages)
 748 {
 749     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 750     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 751     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 752
 753     /*
 754      * Clear pages from start to start + npages - 1, so the end boundary is
 755      * exclusive.
 756      */
 757     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 758         migration_clear_memory_region_dirty_bitmap(rb, i);
 759     }
 760 }
 761
 762 /*
 763  * colo_bitmap_find_diry:find contiguous dirty pages from start
 764  *
 765  * Returns the page offset within memory region of the start of the contiguout
 766  * dirty page
 767  *
 768  * @rs: current RAM state
 769  * @rb: RAMBlock where to search for dirty pages
 770  * @start: page where we start the search
 771  * @num: the number of contiguous dirty pages
 772  */
 773 static inline
 774 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 775                                      unsigned long start, unsigned long *num)
 776 {
 777     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 778     unsigned long *bitmap = rb->bmap;
 779     unsigned long first, next;
 780
 781     *num = 0;
 782
 783     if (ramblock_is_ignored(rb)) {
 784         return size;
 785     }
 786
 787     first = find_next_bit(bitmap, size, start);
 788     if (first >= size) {
 789         return first;
 790     }
 791     next = find_next_zero_bit(bitmap, size, first + 1);
 792     assert(next >= first);
 793     *num = next - first;
 794     return first;
 795 }
 796
 797 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 798                                                 RAMBlock *rb,
 799                                                 unsigned long page)
 800 {
 801     bool ret;
 802
 803     /*
 804      * Clear dirty bitmap if needed.  This _must_ be called before we
 805      * send any of the page in the chunk because we need to make sure
 806      * we can capture further page content changes when we sync dirty
 807      * log the next time.  So as long as we are going to send any of
 808      * the page in the chunk we clear the remote dirty bitmap for all.
 809      * Clearing it earlier won't be a problem, but too late will.
 810      */
 811     migration_clear_memory_region_dirty_bitmap(rb, page);
 812
 813     ret = test_and_clear_bit(page, rb->bmap);
 814     if (ret) {
 815         rs->migration_dirty_pages--;
 816     }
 817
 818     return ret;
 819 }
 820
 821 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 822                                        void *opaque)
 823 {
 824     const hwaddr offset = section->offset_within_region;
 825     const hwaddr size = int128_get64(section->size);
 826     const unsigned long start = offset >> TARGET_PAGE_BITS;
 827     const unsigned long npages = size >> TARGET_PAGE_BITS;
 828     RAMBlock *rb = section->mr->ram_block;
 829     uint64_t *cleared_bits = opaque;
 830
 831     /*
 832      * We don't grab ram_state->bitmap_mutex because we expect to run
 833      * only when starting migration or during postcopy recovery where
 834      * we don't have concurrent access.
 835      */
 836     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 837         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 838     }
 839     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 840     bitmap_clear(rb->bmap, start, npages);
 841 }
 842
 843 /*
 844  * Exclude all dirty pages from migration that fall into a discarded range as
 845  * managed by a RamDiscardManager responsible for the mapped memory region of
 846  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 847  *
 848  * Discarded pages ("logically unplugged") have undefined content and must
 849  * not get migrated, because even reading these pages for migration might
 850  * result in undesired behavior.
 851  *
 852  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 853  *
 854  * Note: The result is only stable while migrating (precopy/postcopy).
 855  */
 856 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 857 {
 858     uint64_t cleared_bits = 0;
 859
 860     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 861         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 862         MemoryRegionSection section = {
 863             .mr = rb->mr,
 864             .offset_within_region = 0,
 865             .size = int128_make64(qemu_ram_get_used_length(rb)),
 866         };
 867
 868         ram_discard_manager_replay_discarded(rdm, &section,
 869                                              dirty_bitmap_clear_section,
 870                                              &cleared_bits);
 871     }
 872     return cleared_bits;
 873 }
 874
 875 /*
 876  * Check if a host-page aligned page falls into a discarded range as managed by
 877  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 878  *
 879  * Note: The result is only stable while migrating (precopy/postcopy).
 880  */
 881 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 882 {
 883     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 884         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 885         MemoryRegionSection section = {
 886             .mr = rb->mr,
 887             .offset_within_region = start,
 888             .size = int128_make64(qemu_ram_pagesize(rb)),
 889         };
 890
 891         return !ram_discard_manager_is_populated(rdm, &section);
 892     }
 893     return false;
 894 }
 895
 896 /* Called with RCU critical section */
 897 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 898 {
 899     uint64_t new_dirty_pages =
 900         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 901
 902     rs->migration_dirty_pages += new_dirty_pages;
 903     rs->num_dirty_pages_period += new_dirty_pages;
 904 }
 905
 906 /**
 907  * ram_pagesize_summary: calculate all the pagesizes of a VM
 908  *
 909  * Returns a summary bitmap of the page sizes of all RAMBlocks
 910  *
 911  * For VMs with just normal pages this is equivalent to the host page
 912  * size. If it's got some huge pages then it's the OR of all the
 913  * different page sizes.
 914  */
 915 uint64_t ram_pagesize_summary(void)
 916 {
 917     RAMBlock *block;
 918     uint64_t summary = 0;
 919
 920     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 921         summary |= block->page_size;
 922     }
 923
 924     return summary;
 925 }
 926
 927 uint64_t ram_get_total_transferred_pages(void)
 928 {
 929     return stat64_get(&mig_stats.normal_pages) +
 930         stat64_get(&mig_stats.zero_pages) +
 931         compression_counters.pages + xbzrle_counters.pages;
 932 }
 933
 934 static void migration_update_rates(RAMState *rs, int64_t end_time)
 935 {
 936     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 937     double compressed_size;
 938
 939     /* calculate period counters */
 940     stat64_set(&mig_stats.dirty_pages_rate,
 941                rs->num_dirty_pages_period * 1000 /
 942                (end_time - rs->time_last_bitmap_sync));
 943
 944     if (!page_count) {
 945         return;
 946     }
 947
 948     if (migrate_xbzrle()) {
 949         double encoded_size, unencoded_size;
 950
 951         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 952             rs->xbzrle_cache_miss_prev) / page_count;
 953         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 954         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 955                          TARGET_PAGE_SIZE;
 956         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 957         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 958             xbzrle_counters.encoding_rate = 0;
 959         } else {
 960             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 961         }
 962         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 963         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 964     }
 965
 966     if (migrate_compress()) {
 967         compression_counters.busy_rate = (double)(compression_counters.busy -
 968             rs->compress_thread_busy_prev) / page_count;
 969         rs->compress_thread_busy_prev = compression_counters.busy;
 970
 971         compressed_size = compression_counters.compressed_size -
 972                           rs->compressed_size_prev;
 973         if (compressed_size) {
 974             double uncompressed_size = (compression_counters.pages -
 975                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 976
 977             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 978             compression_counters.compression_rate =
 979                                         uncompressed_size / compressed_size;
 980
 981             rs->compress_pages_prev = compression_counters.pages;
 982             rs->compressed_size_prev = compression_counters.compressed_size;
 983         }
 984     }
 985 }
 986
 987 static void migration_trigger_throttle(RAMState *rs)
 988 {
 989     uint64_t threshold = migrate_throttle_trigger_threshold();
 990     uint64_t bytes_xfer_period =
 991         stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
 992     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 993     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 994
 995     /* During block migration the auto-converge logic incorrectly detects
 996      * that ram migration makes no progress. Avoid this by disabling the
 997      * throttling logic during the bulk phase of block migration. */
 998     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 999         /* The following detection logic can be refined later. For now:
1000            Check to see if the ratio between dirtied bytes and the approx.
1001            amount of bytes that just got transferred since the last time
1002            we were in this routine reaches the threshold. If that happens
1003            twice, start or increase throttling. */
1004
1005         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1006             (++rs->dirty_rate_high_cnt >= 2)) {
1007             trace_migration_throttle();
1008             rs->dirty_rate_high_cnt = 0;
1009             mig_throttle_guest_down(bytes_dirty_period,
1010                                     bytes_dirty_threshold);
1011         }
1012     }
1013 }
1014
/*
 * migration_bitmap_sync: pull the latest dirty-page information into the
 * per-RAMBlock migration bitmaps and refresh the period statistics.
 *
 * Bumps mig_stats.dirty_sync_count, syncs the global dirty log, then syncs
 * every non-ignored RAMBlock while holding rs->bitmap_mutex inside an RCU
 * read-side section. If more than one second has passed since the period
 * counters were last reset, the throttle check and rate update run and the
 * period counters are reset. Finally a MIGRATION_PASS event is sent when
 * migration events are enabled.
 *
 * @rs: current RAM state
 * @last_stage: forwarded unchanged to memory_global_dirty_log_sync()
 */
static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    /* First sync ever: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
    }
    if (migrate_events()) {
        /* The sync counter doubles as the pass number reported to QMP */
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}
1061
1062 static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
1063 {
1064     Error *local_err = NULL;
1065
1066     /*
1067      * The current notifier usage is just an optimization to migration, so we
1068      * don't stop the normal migration process in the error case.
1069      */
1070     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1071         error_report_err(local_err);
1072         local_err = NULL;
1073     }
1074
1075     migration_bitmap_sync(rs, last_stage);
1076
1077     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1078         error_report_err(local_err);
1079     }
1080 }
1081
1082 void ram_release_page(const char *rbname, uint64_t offset)
1083 {
1084     if (!migrate_release_ram() || !migration_in_postcopy()) {
1085         return;
1086     }
1087
1088     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1089 }
1090
1091 /**
1092  * save_zero_page_to_file: send the zero page to the file
1093  *
1094  * Returns the size of data written to the file, 0 means the page is not
1095  * a zero page
1096  *
1097  * @pss: current PSS channel
1098  * @block: block that contains the page we want to send
1099  * @offset: offset inside the block for the page
1100  */
1101 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1102                                   RAMBlock *block, ram_addr_t offset)
1103 {
1104     uint8_t *p = block->host + offset;
1105     int len = 0;
1106
1107     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1108         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1109         qemu_put_byte(file, 0);
1110         len += 1;
1111         ram_release_page(block->idstr, offset);
1112     }
1113     return len;
1114 }
1115
1116 /**
1117  * save_zero_page: send the zero page to the stream
1118  *
1119  * Returns the number of pages written.
1120  *
1121  * @pss: current PSS channel
1122  * @block: block that contains the page we want to send
1123  * @offset: offset inside the block for the page
1124  */
1125 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1126                           ram_addr_t offset)
1127 {
1128     int len = save_zero_page_to_file(pss, f, block, offset);
1129
1130     if (len) {
1131         stat64_add(&mig_stats.zero_pages, 1);
1132         ram_transferred_add(len);
1133         return 1;
1134     }
1135     return -1;
1136 }
1137
1138 /*
1139  * @pages: the number of pages written by the control path,
1140  *        < 0 - error
1141  *        > 0 - number of pages written
1142  *
1143  * Return true if the pages has been saved, otherwise false is returned.
1144  */
1145 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1146                               ram_addr_t offset, int *pages)
1147 {
1148     uint64_t bytes_xmit = 0;
1149     int ret;
1150
1151     *pages = -1;
1152     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1153                                 TARGET_PAGE_SIZE, &bytes_xmit);
1154     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1155         return false;
1156     }
1157
1158     if (bytes_xmit) {
1159         ram_transferred_add(bytes_xmit);
1160         *pages = 1;
1161     }
1162
1163     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1164         return true;
1165     }
1166
1167     if (bytes_xmit > 0) {
1168         stat64_add(&mig_stats.normal_pages, 1);
1169     } else if (bytes_xmit == 0) {
1170         stat64_add(&mig_stats.zero_pages, 1);
1171     }
1172
1173     return true;
1174 }
1175
1176 /*
1177  * directly send the page to the stream
1178  *
1179  * Returns the number of pages written.
1180  *
1181  * @pss: current PSS channel
1182  * @block: block that contains the page we want to send
1183  * @offset: offset inside the block for the page
1184  * @buf: the page to be sent
1185  * @async: send to page asyncly
1186  */
1187 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1188                             ram_addr_t offset, uint8_t *buf, bool async)
1189 {
1190     QEMUFile *file = pss->pss_channel;
1191
1192     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1193                                          offset | RAM_SAVE_FLAG_PAGE));
1194     if (async) {
1195         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1196                               migrate_release_ram() &&
1197                               migration_in_postcopy());
1198     } else {
1199         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1200     }
1201     ram_transferred_add(TARGET_PAGE_SIZE);
1202     stat64_add(&mig_stats.normal_pages, 1);
1203     return 1;
1204 }
1205
1206 /**
1207  * ram_save_page: send the given page to the stream
1208  *
1209  * Returns the number of pages written.
1210  *          < 0 - error
1211  *          >=0 - Number of pages written - this might legally be 0
1212  *                if xbzrle noticed the page was the same.
1213  *
1214  * @rs: current RAM state
1215  * @block: block that contains the page we want to send
1216  * @offset: offset inside the block for the page
1217  */
1218 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1219 {
1220     int pages = -1;
1221     uint8_t *p;
1222     bool send_async = true;
1223     RAMBlock *block = pss->block;
1224     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1225     ram_addr_t current_addr = block->offset + offset;
1226
1227     p = block->host + offset;
1228     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1229
1230     XBZRLE_cache_lock();
1231     if (rs->xbzrle_started && !migration_in_postcopy()) {
1232         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1233                                  block, offset);
1234         if (!rs->last_stage) {
1235             /* Can't send this cached data async, since the cache page
1236              * might get updated before it gets to the wire
1237              */
1238             send_async = false;
1239         }
1240     }
1241
1242     /* XBZRLE overflow or normal page */
1243     if (pages == -1) {
1244         pages = save_normal_page(pss, block, offset, p, send_async);
1245     }
1246
1247     XBZRLE_cache_unlock();
1248
1249     return pages;
1250 }
1251
1252 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1253                                  ram_addr_t offset)
1254 {
1255     if (multifd_queue_page(file, block, offset) < 0) {
1256         return -1;
1257     }
1258     stat64_add(&mig_stats.normal_pages, 1);
1259
1260     return 1;
1261 }
1262
1263 static void
1264 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1265 {
1266     ram_transferred_add(bytes_xmit);
1267
1268     if (param->result == RES_ZEROPAGE) {
1269         stat64_add(&mig_stats.zero_pages, 1);
1270         return;
1271     }
1272
1273     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1274     compression_counters.compressed_size += bytes_xmit - 8;
1275     compression_counters.pages++;
1276 }
1277
/* Forward declaration: ram_flush_compressed_data() below calls this. */
static bool save_page_use_compression(RAMState *rs);
1279
1280 static int send_queued_data(CompressParam *param)
1281 {
1282     PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1283     MigrationState *ms = migrate_get_current();
1284     QEMUFile *file = ms->to_dst_file;
1285     int len = 0;
1286
1287     RAMBlock *block = param->block;
1288     ram_addr_t offset = param->offset;
1289
1290     if (param->result == RES_NONE) {
1291         return 0;
1292     }
1293
1294     assert(block == pss->last_sent_block);
1295
1296     if (param->result == RES_ZEROPAGE) {
1297         assert(qemu_file_buffer_empty(param->file));
1298         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1299         qemu_put_byte(file, 0);
1300         len += 1;
1301         ram_release_page(block->idstr, offset);
1302     } else if (param->result == RES_COMPRESS) {
1303         assert(!qemu_file_buffer_empty(param->file));
1304         len += save_page_header(pss, file, block,
1305                                 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1306         len += qemu_put_qemu_file(file, param->file);
1307     } else {
1308         abort();
1309     }
1310
1311     update_compress_thread_counts(param, len);
1312
1313     return len;
1314 }
1315
1316 static void ram_flush_compressed_data(RAMState *rs)
1317 {
1318     if (!save_page_use_compression(rs)) {
1319         return;
1320     }
1321
1322     flush_compressed_data(send_queued_data);
1323 }
1324
/* Non-negative return codes of find_dirty_block() */
#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan; pss->block and
 *       pss->page are advanced, and pss->complete_round is set once the
 *       search has wrapped past the last RAMBlock
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * End of the block list. Unless multifd is flushed after each
             * section, sync the multifd channels here and tell the
             * destination with a MULTIFD_FLUSH marker.
             */
            if (!migrate_multifd_flush_after_each_section()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_send_sync_main(f);
                if (ret < 0) {
                    return ret;
                }
                qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
                qemu_fflush(f);
            }
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            ram_flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}
1397
1398 /**
1399  * unqueue_page: gets a page of the queue
1400  *
1401  * Helper for 'get_queued_page' - gets a page off the queue
1402  *
1403  * Returns the block of the page (or NULL if none available)
1404  *
1405  * @rs: current RAM state
1406  * @offset: used to return the offset within the RAMBlock
1407  */
1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1409 {
1410     struct RAMSrcPageRequest *entry;
1411     RAMBlock *block = NULL;
1412
1413     if (!postcopy_has_request(rs)) {
1414         return NULL;
1415     }
1416
1417     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1418
1419     /*
1420      * This should _never_ change even after we take the lock, because no one
1421      * should be taking anything off the request list other than us.
1422      */
1423     assert(postcopy_has_request(rs));
1424
1425     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1426     block = entry->rb;
1427     *offset = entry->offset;
1428
1429     if (entry->len > TARGET_PAGE_SIZE) {
1430         entry->len -= TARGET_PAGE_SIZE;
1431         entry->offset += TARGET_PAGE_SIZE;
1432     } else {
1433         memory_region_unref(block->mr);
1434         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435         g_free(entry);
1436         migration_consume_urgent_request();
1437     }
1438
1439     return block;
1440 }
1441
1442 #if defined(__linux__)
1443 /**
1444  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1445  *   is found, return RAM block pointer and page offset
1446  *
1447  * Returns pointer to the RAMBlock containing faulting page,
1448  *   NULL if no write faults are pending
1449  *
1450  * @rs: current RAM state
1451  * @offset: page offset from the beginning of the block
1452  */
1453 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1454 {
1455     struct uffd_msg uffd_msg;
1456     void *page_address;
1457     RAMBlock *block;
1458     int res;
1459
1460     if (!migrate_background_snapshot()) {
1461         return NULL;
1462     }
1463
1464     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1465     if (res <= 0) {
1466         return NULL;
1467     }
1468
1469     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1470     block = qemu_ram_block_from_host(page_address, false, offset);
1471     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1472     return block;
1473 }
1474
1475 /**
1476  * ram_save_release_protection: release UFFD write protection after
1477  *   a range of pages has been saved
1478  *
1479  * @rs: current RAM state
1480  * @pss: page-search-status structure
1481  * @start_page: index of the first page in the range relative to pss->block
1482  *
1483  * Returns 0 on success, negative value in case of an error
1484 */
1485 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1486         unsigned long start_page)
1487 {
1488     int res = 0;
1489
1490     /* Check if page is from UFFD-managed region. */
1491     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1492         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1493         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1494
1495         /* Flush async buffers before un-protect. */
1496         qemu_fflush(pss->pss_channel);
1497         /* Un-protect memory range. */
1498         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1499                 false, false);
1500     }
1501
1502     return res;
1503 }
1504
1505 /* ram_write_tracking_available: check if kernel supports required UFFD features
1506  *
1507  * Returns true if supports, false otherwise
1508  */
1509 bool ram_write_tracking_available(void)
1510 {
1511     uint64_t uffd_features;
1512     int res;
1513
1514     res = uffd_query_features(&uffd_features);
1515     return (res == 0 &&
1516             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1517 }
1518
1519 /* ram_write_tracking_compatible: check if guest configuration is
1520  *   compatible with 'write-tracking'
1521  *
1522  * Returns true if compatible, false otherwise
1523  */
1524 bool ram_write_tracking_compatible(void)
1525 {
1526     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1527     int uffd_fd;
1528     RAMBlock *block;
1529     bool ret = false;
1530
1531     /* Open UFFD file descriptor */
1532     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1533     if (uffd_fd < 0) {
1534         return false;
1535     }
1536
1537     RCU_READ_LOCK_GUARD();
1538
1539     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1540         uint64_t uffd_ioctls;
1541
1542         /* Nothing to do with read-only and MMIO-writable regions */
1543         if (block->mr->readonly || block->mr->rom_device) {
1544             continue;
1545         }
1546         /* Try to register block memory via UFFD-IO to track writes */
1547         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1548                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1549             goto out;
1550         }
1551         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1552             goto out;
1553         }
1554     }
1555     ret = true;
1556
1557 out:
1558     uffd_close_fd(uffd_fd);
1559     return ret;
1560 }
1561
1562 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1563                                        ram_addr_t size)
1564 {
1565     const ram_addr_t end = offset + size;
1566
1567     /*
1568      * We read one byte of each page; this will preallocate page tables if
1569      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1570      * where no page was populated yet. This might require adaption when
1571      * supporting other mappings, like shmem.
1572      */
1573     for (; offset < end; offset += block->page_size) {
1574         char tmp = *((char *)block->host + offset);
1575
1576         /* Don't optimize the read out */
1577         asm volatile("" : "+r" (tmp));
1578     }
1579 }
1580
1581 static inline int populate_read_section(MemoryRegionSection *section,
1582                                         void *opaque)
1583 {
1584     const hwaddr size = int128_get64(section->size);
1585     hwaddr offset = section->offset_within_region;
1586     RAMBlock *block = section->mr->ram_block;
1587
1588     populate_read_range(block, offset, size);
1589     return 0;
1590 }
1591
1592 /*
1593  * ram_block_populate_read: preallocate page tables and populate pages in the
1594  *   RAM block by reading a byte of each page.
1595  *
1596  * Since it's solely used for userfault_fd WP feature, here we just
1597  *   hardcode page size to qemu_real_host_page_size.
1598  *
1599  * @block: RAM block to populate
1600  */
1601 static void ram_block_populate_read(RAMBlock *rb)
1602 {
1603     /*
1604      * Skip populating all pages that fall into a discarded range as managed by
1605      * a RamDiscardManager responsible for the mapped memory region of the
1606      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1607      * must not get populated automatically. We don't have to track
1608      * modifications via userfaultfd WP reliably, because these pages will
1609      * not be part of the migration stream either way -- see
1610      * ramblock_dirty_bitmap_exclude_discarded_pages().
1611      *
1612      * Note: The result is only stable while migrating (precopy/postcopy).
1613      */
1614     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1615         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1616         MemoryRegionSection section = {
1617             .mr = rb->mr,
1618             .offset_within_region = 0,
1619             .size = rb->mr->size,
1620         };
1621
1622         ram_discard_manager_replay_populated(rdm, &section,
1623                                              populate_read_section, NULL);
1624     } else {
1625         populate_read_range(rb, 0, rb->used_length);
1626     }
1627 }
1628
1629 /*
1630  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1631  */
1632 void ram_write_tracking_prepare(void)
1633 {
1634     RAMBlock *block;
1635
1636     RCU_READ_LOCK_GUARD();
1637
1638     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639         /* Nothing to do with read-only and MMIO-writable regions */
1640         if (block->mr->readonly || block->mr->rom_device) {
1641             continue;
1642         }
1643
1644         /*
1645          * Populate pages of the RAM block before enabling userfault_fd
1646          * write protection.
1647          *
1648          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1649          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1650          * pages with pte_none() entries in page table.
1651          */
1652         ram_block_populate_read(block);
1653     }
1654 }
1655
1656 static inline int uffd_protect_section(MemoryRegionSection *section,
1657                                        void *opaque)
1658 {
1659     const hwaddr size = int128_get64(section->size);
1660     const hwaddr offset = section->offset_within_region;
1661     RAMBlock *rb = section->mr->ram_block;
1662     int uffd_fd = (uintptr_t)opaque;
1663
1664     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1665                                   false);
1666 }
1667
1668 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1669 {
1670     assert(rb->flags & RAM_UF_WRITEPROTECT);
1671
1672     /* See ram_block_populate_read() */
1673     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1674         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1675         MemoryRegionSection section = {
1676             .mr = rb->mr,
1677             .offset_within_region = 0,
1678             .size = rb->mr->size,
1679         };
1680
1681         return ram_discard_manager_replay_populated(rdm, &section,
1682                                                     uffd_protect_section,
1683                                                     (void *)(uintptr_t)uffd_fd);
1684     }
1685     return uffd_change_protection(uffd_fd, rb->host,
1686                                   rb->used_length, true, false);
1687 }
1688
1689 /*
1690  * ram_write_tracking_start: start UFFD-WP memory tracking
1691  *
1692  * Returns 0 for success or negative value in case of error
1693  */
1694 int ram_write_tracking_start(void)
1695 {
1696     int uffd_fd;
1697     RAMState *rs = ram_state;
1698     RAMBlock *block;
1699
1700     /* Open UFFD file descriptor */
1701     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1702     if (uffd_fd < 0) {
1703         return uffd_fd;
1704     }
1705     rs->uffdio_fd = uffd_fd;
1706
1707     RCU_READ_LOCK_GUARD();
1708
1709     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1710         /* Nothing to do with read-only and MMIO-writable regions */
1711         if (block->mr->readonly || block->mr->rom_device) {
1712             continue;
1713         }
1714
1715         /* Register block memory with UFFD to track writes */
1716         if (uffd_register_memory(rs->uffdio_fd, block->host,
1717                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1718             goto fail;
1719         }
1720         block->flags |= RAM_UF_WRITEPROTECT;
1721         memory_region_ref(block->mr);
1722
1723         /* Apply UFFD write protection to the block memory range */
1724         if (ram_block_uffd_protect(block, uffd_fd)) {
1725             goto fail;
1726         }
1727
1728         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1729                 block->host, block->max_length);
1730     }
1731
1732     return 0;
1733
1734 fail:
1735     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1736
1737     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1738         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1739             continue;
1740         }
1741         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1742         /* Cleanup flags and remove reference */
1743         block->flags &= ~RAM_UF_WRITEPROTECT;
1744         memory_region_unref(block->mr);
1745     }
1746
1747     uffd_close_fd(uffd_fd);
1748     rs->uffdio_fd = -1;
1749     return -1;
1750 }
1751
1752 /**
1753  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1754  */
1755 void ram_write_tracking_stop(void)
1756 {
1757     RAMState *rs = ram_state;
1758     RAMBlock *block;
1759
1760     RCU_READ_LOCK_GUARD();
1761
1762     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1763         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1764             continue;
1765         }
1766         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1767
1768         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1769                 block->host, block->max_length);
1770
1771         /* Cleanup flags and remove reference */
1772         block->flags &= ~RAM_UF_WRITEPROTECT;
1773         memory_region_unref(block->mr);
1774     }
1775
1776     /* Finally close UFFD file descriptor */
1777     uffd_close_fd(rs->uffdio_fd);
1778     rs->uffdio_fd = -1;
1779 }
1780
1781 #else
1782 /* No target OS support, stubs just fail or ignore */
1783
1784 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1785 {
1786     (void) rs;
1787     (void) offset;
1788
1789     return NULL;
1790 }
1791
1792 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1793         unsigned long start_page)
1794 {
1795     (void) rs;
1796     (void) pss;
1797     (void) start_page;
1798
1799     return 0;
1800 }
1801
/* Stub for builds without target OS support: write tracking is unavailable */
bool ram_write_tracking_available(void)
{
    const bool available = false;

    return available;
}
1806
/*
 * Stub for builds without target OS support.  It asserts unconditionally;
 * the return statement only satisfies the function's signature.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);

    return false;
}
1812
/*
 * Stub for builds without target OS support.  It asserts unconditionally;
 * the return statement only satisfies the function's signature.
 */
int ram_write_tracking_start(void)
{
    assert(0);

    return -1;
}
1818
/* Stub for builds without target OS support: asserts unconditionally */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1823 #endif /* defined(__linux__) */
1824
1825 /**
1826  * get_queued_page: unqueue a page from the postcopy requests
1827  *
1828  * Skips pages that are already sent (!dirty)
1829  *
1830  * Returns true if a queued page is found
1831  *
1832  * @rs: current RAM state
1833  * @pss: data about the state of the current dirty page scan
1834  */
1835 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1836 {
1837     RAMBlock  *block;
1838     ram_addr_t offset;
1839     bool dirty;
1840
1841     do {
1842         block = unqueue_page(rs, &offset);
1843         /*
1844          * We're sending this page, and since it's postcopy nothing else
1845          * will dirty it, and we must make sure it doesn't get sent again
1846          * even if this queue request was received after the background
1847          * search already sent it.
1848          */
1849         if (block) {
1850             unsigned long page;
1851
1852             page = offset >> TARGET_PAGE_BITS;
1853             dirty = test_bit(page, block->bmap);
1854             if (!dirty) {
1855                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1856                                                 page);
1857             } else {
1858                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1859             }
1860         }
1861
1862     } while (block && !dirty);
1863
1864     if (!block) {
1865         /*
1866          * Poll write faults too if background snapshot is enabled; that's
1867          * when we have vcpus got blocked by the write protected pages.
1868          */
1869         block = poll_fault_page(rs, &offset);
1870     }
1871
1872     if (block) {
1873         /*
1874          * We want the background search to continue from the queued page
1875          * since the guest is likely to want other pages near to the page
1876          * it just requested.
1877          */
1878         pss->block = block;
1879         pss->page = offset >> TARGET_PAGE_BITS;
1880
1881         /*
1882          * This unqueued page would break the "one round" check, even is
1883          * really rare.
1884          */
1885         pss->complete_round = false;
1886     }
1887
1888     return !!block;
1889 }
1890
1891 /**
1892  * migration_page_queue_free: drop any remaining pages in the ram
1893  * request queue
1894  *
1895  * It should be empty at the end anyway, but in error cases there may
1896  * be some left.  in case that there is any page left, we drop it.
1897  *
1898  */
1899 static void migration_page_queue_free(RAMState *rs)
1900 {
1901     struct RAMSrcPageRequest *mspr, *next_mspr;
1902     /* This queue generally should be empty - but in the case of a failed
1903      * migration might have some droppings in.
1904      */
1905     RCU_READ_LOCK_GUARD();
1906     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1907         memory_region_unref(mspr->rb->mr);
1908         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1909         g_free(mspr);
1910     }
1911 }
1912
1913 /**
1914  * ram_save_queue_pages: queue the page for transmission
1915  *
1916  * A request from postcopy destination for example.
1917  *
1918  * Returns zero on success or negative on error
1919  *
1920  * @rbname: Name of the RAMBLock of the request. NULL means the
1921  *          same that last one.
1922  * @start: starting address from the start of the RAMBlock
1923  * @len: length (in bytes) to send
1924  */
1925 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1926 {
1927     RAMBlock *ramblock;
1928     RAMState *rs = ram_state;
1929
1930     stat64_add(&mig_stats.postcopy_requests, 1);
1931     RCU_READ_LOCK_GUARD();
1932
1933     if (!rbname) {
1934         /* Reuse last RAMBlock */
1935         ramblock = rs->last_req_rb;
1936
1937         if (!ramblock) {
1938             /*
1939              * Shouldn't happen, we can't reuse the last RAMBlock if
1940              * it's the 1st request.
1941              */
1942             error_report("ram_save_queue_pages no previous block");
1943             return -1;
1944         }
1945     } else {
1946         ramblock = qemu_ram_block_by_name(rbname);
1947
1948         if (!ramblock) {
1949             /* We shouldn't be asked for a non-existent RAMBlock */
1950             error_report("ram_save_queue_pages no block '%s'", rbname);
1951             return -1;
1952         }
1953         rs->last_req_rb = ramblock;
1954     }
1955     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1956     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1957         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1958                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1959                      __func__, start, len, ramblock->used_length);
1960         return -1;
1961     }
1962
1963     /*
1964      * When with postcopy preempt, we send back the page directly in the
1965      * rp-return thread.
1966      */
1967     if (postcopy_preempt_active()) {
1968         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1969         size_t page_size = qemu_ram_pagesize(ramblock);
1970         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1971         int ret = 0;
1972
1973         qemu_mutex_lock(&rs->bitmap_mutex);
1974
1975         pss_init(pss, ramblock, page_start);
1976         /*
1977          * Always use the preempt channel, and make sure it's there.  It's
1978          * safe to access without lock, because when rp-thread is running
1979          * we should be the only one who operates on the qemufile
1980          */
1981         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
1982         assert(pss->pss_channel);
1983
1984         /*
1985          * It must be either one or multiple of host page size.  Just
1986          * assert; if something wrong we're mostly split brain anyway.
1987          */
1988         assert(len % page_size == 0);
1989         while (len) {
1990             if (ram_save_host_page_urgent(pss)) {
1991                 error_report("%s: ram_save_host_page_urgent() failed: "
1992                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
1993                              __func__, ramblock->idstr, start);
1994                 ret = -1;
1995                 break;
1996             }
1997             /*
1998              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
1999              * will automatically be moved and point to the next host page
2000              * we're going to send, so no need to update here.
2001              *
2002              * Normally QEMU never sends >1 host page in requests, so
2003              * logically we don't even need that as the loop should only
2004              * run once, but just to be consistent.
2005              */
2006             len -= page_size;
2007         };
2008         qemu_mutex_unlock(&rs->bitmap_mutex);
2009
2010         return ret;
2011     }
2012
2013     struct RAMSrcPageRequest *new_entry =
2014         g_new0(struct RAMSrcPageRequest, 1);
2015     new_entry->rb = ramblock;
2016     new_entry->offset = start;
2017     new_entry->len = len;
2018
2019     memory_region_ref(ramblock->mr);
2020     qemu_mutex_lock(&rs->src_page_req_mutex);
2021     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2022     migration_make_urgent_request();
2023     qemu_mutex_unlock(&rs->src_page_req_mutex);
2024
2025     return 0;
2026 }
2027
2028 static bool save_page_use_compression(RAMState *rs)
2029 {
2030     if (!migrate_compress()) {
2031         return false;
2032     }
2033
2034     /*
2035      * If xbzrle is enabled (e.g., after first round of migration), stop
2036      * using the data compression. In theory, xbzrle can do better than
2037      * compression.
2038      */
2039     if (rs->xbzrle_started) {
2040         return false;
2041     }
2042
2043     return true;
2044 }
2045
2046 /*
2047  * try to compress the page before posting it out, return true if the page
2048  * has been properly handled by compression, otherwise needs other
2049  * paths to handle it
2050  */
2051 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2052                                RAMBlock *block, ram_addr_t offset)
2053 {
2054     if (!save_page_use_compression(rs)) {
2055         return false;
2056     }
2057
2058     /*
2059      * When starting the process of a new block, the first page of
2060      * the block should be sent out before other pages in the same
2061      * block, and all the pages in last block should have been sent
2062      * out, keeping this order is important, because the 'cont' flag
2063      * is used to avoid resending the block name.
2064      *
2065      * We post the fist page as normal page as compression will take
2066      * much CPU resource.
2067      */
2068     if (block != pss->last_sent_block) {
2069         ram_flush_compressed_data(rs);
2070         return false;
2071     }
2072
2073     if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2074         return true;
2075     }
2076
2077     compression_counters.busy++;
2078     return false;
2079 }
2080
2081 /**
2082  * ram_save_target_page_legacy: save one target page
2083  *
2084  * Returns the number of pages written
2085  *
2086  * @rs: current RAM state
2087  * @pss: data about the page we want to send
2088  */
2089 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2090 {
2091     RAMBlock *block = pss->block;
2092     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2093     int res;
2094
2095     if (control_save_page(pss, block, offset, &res)) {
2096         return res;
2097     }
2098
2099     if (save_compress_page(rs, pss, block, offset)) {
2100         return 1;
2101     }
2102
2103     res = save_zero_page(pss, pss->pss_channel, block, offset);
2104     if (res > 0) {
2105         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2106          * page would be stale
2107          */
2108         if (rs->xbzrle_started) {
2109             XBZRLE_cache_lock();
2110             xbzrle_cache_zero_page(rs, block->offset + offset);
2111             XBZRLE_cache_unlock();
2112         }
2113         return res;
2114     }
2115
2116     /*
2117      * Do not use multifd in postcopy as one whole host page should be
2118      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2119      * if host page size == guest page size the dest guest during run may
2120      * still see partially copied pages which is data corruption.
2121      */
2122     if (migrate_multifd() && !migration_in_postcopy()) {
2123         return ram_save_multifd_page(pss->pss_channel, block, offset);
2124     }
2125
2126     return ram_save_page(rs, pss);
2127 }
2128
2129 /* Should be called before sending a host page */
2130 static void pss_host_page_prepare(PageSearchStatus *pss)
2131 {
2132     /* How many guest pages are there in one host page? */
2133     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2134
2135     pss->host_page_sending = true;
2136     if (guest_pfns <= 1) {
2137         /*
2138          * This covers both when guest psize == host psize, or when guest
2139          * has larger psize than the host (guest_pfns==0).
2140          *
2141          * For the latter, we always send one whole guest page per
2142          * iteration of the host page (example: an Alpha VM on x86 host
2143          * will have guest psize 8K while host psize 4K).
2144          */
2145         pss->host_page_start = pss->page;
2146         pss->host_page_end = pss->page + 1;
2147     } else {
2148         /*
2149          * The host page spans over multiple guest pages, we send them
2150          * within the same host page iteration.
2151          */
2152         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2153         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2154     }
2155 }
2156
2157 /*
2158  * Whether the page pointed by PSS is within the host page being sent.
2159  * Must be called after a previous pss_host_page_prepare().
2160  */
2161 static bool pss_within_range(PageSearchStatus *pss)
2162 {
2163     ram_addr_t ram_addr;
2164
2165     assert(pss->host_page_sending);
2166
2167     /* Over host-page boundary? */
2168     if (pss->page >= pss->host_page_end) {
2169         return false;
2170     }
2171
2172     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2173
2174     return offset_in_ramblock(pss->block, ram_addr);
2175 }
2176
2177 static void pss_host_page_finish(PageSearchStatus *pss)
2178 {
2179     pss->host_page_sending = false;
2180     /* This is not needed, but just to reset it */
2181     pss->host_page_start = pss->host_page_end = 0;
2182 }
2183
2184 /*
2185  * Send an urgent host page specified by `pss'.  Need to be called with
2186  * bitmap_mutex held.
2187  *
2188  * Returns 0 if save host page succeeded, false otherwise.
2189  */
2190 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2191 {
2192     bool page_dirty, sent = false;
2193     RAMState *rs = ram_state;
2194     int ret = 0;
2195
2196     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2197     pss_host_page_prepare(pss);
2198
2199     /*
2200      * If precopy is sending the same page, let it be done in precopy, or
2201      * we could send the same page in two channels and none of them will
2202      * receive the whole page.
2203      */
2204     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2205         trace_postcopy_preempt_hit(pss->block->idstr,
2206                                    pss->page << TARGET_PAGE_BITS);
2207         return 0;
2208     }
2209
2210     do {
2211         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2212
2213         if (page_dirty) {
2214             /* Be strict to return code; it must be 1, or what else? */
2215             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2216                 error_report_once("%s: ram_save_target_page failed", __func__);
2217                 ret = -1;
2218                 goto out;
2219             }
2220             sent = true;
2221         }
2222         pss_find_next_dirty(pss);
2223     } while (pss_within_range(pss));
2224 out:
2225     pss_host_page_finish(pss);
2226     /* For urgent requests, flush immediately if sent */
2227     if (sent) {
2228         qemu_fflush(pss->pss_channel);
2229     }
2230     return ret;
2231 }
2232
2233 /**
2234  * ram_save_host_page: save a whole host page
2235  *
2236  * Starting at *offset send pages up to the end of the current host
2237  * page. It's valid for the initial offset to point into the middle of
2238  * a host page in which case the remainder of the hostpage is sent.
2239  * Only dirty target pages are sent. Note that the host page size may
2240  * be a huge page for this block.
2241  *
2242  * The saving stops at the boundary of the used_length of the block
2243  * if the RAMBlock isn't a multiple of the host page size.
2244  *
2245  * The caller must be with ram_state.bitmap_mutex held to call this
2246  * function.  Note that this function can temporarily release the lock, but
2247  * when the function is returned it'll make sure the lock is still held.
2248  *
2249  * Returns the number of pages written or negative on error
2250  *
2251  * @rs: current RAM state
2252  * @pss: data about the page we want to send
2253  */
2254 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2255 {
2256     bool page_dirty, preempt_active = postcopy_preempt_active();
2257     int tmppages, pages = 0;
2258     size_t pagesize_bits =
2259         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2260     unsigned long start_page = pss->page;
2261     int res;
2262
2263     if (ramblock_is_ignored(pss->block)) {
2264         error_report("block %s should not be migrated !", pss->block->idstr);
2265         return 0;
2266     }
2267
2268     /* Update host page boundary information */
2269     pss_host_page_prepare(pss);
2270
2271     do {
2272         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2273
2274         /* Check the pages is dirty and if it is send it */
2275         if (page_dirty) {
2276             /*
2277              * Properly yield the lock only in postcopy preempt mode
2278              * because both migration thread and rp-return thread can
2279              * operate on the bitmaps.
2280              */
2281             if (preempt_active) {
2282                 qemu_mutex_unlock(&rs->bitmap_mutex);
2283             }
2284             tmppages = migration_ops->ram_save_target_page(rs, pss);
2285             if (tmppages >= 0) {
2286                 pages += tmppages;
2287                 /*
2288                  * Allow rate limiting to happen in the middle of huge pages if
2289                  * something is sent in the current iteration.
2290                  */
2291                 if (pagesize_bits > 1 && tmppages > 0) {
2292                     migration_rate_limit();
2293                 }
2294             }
2295             if (preempt_active) {
2296                 qemu_mutex_lock(&rs->bitmap_mutex);
2297             }
2298         } else {
2299             tmppages = 0;
2300         }
2301
2302         if (tmppages < 0) {
2303             pss_host_page_finish(pss);
2304             return tmppages;
2305         }
2306
2307         pss_find_next_dirty(pss);
2308     } while (pss_within_range(pss));
2309
2310     pss_host_page_finish(pss);
2311
2312     res = ram_save_release_protection(rs, pss, start_page);
2313     return (res < 0 ? res : pages);
2314 }
2315
2316 /**
2317  * ram_find_and_save_block: finds a dirty page and sends it to f
2318  *
2319  * Called within an RCU critical section.
2320  *
2321  * Returns the number of pages written where zero means no dirty pages,
2322  * or negative on error
2323  *
2324  * @rs: current RAM state
2325  *
2326  * On systems where host-page-size > target-page-size it will send all the
2327  * pages in a host page that are dirty.
2328  */
2329 static int ram_find_and_save_block(RAMState *rs)
2330 {
2331     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2332     int pages = 0;
2333
2334     /* No dirty page as there is zero RAM */
2335     if (!rs->ram_bytes_total) {
2336         return pages;
2337     }
2338
2339     /*
2340      * Always keep last_seen_block/last_page valid during this procedure,
2341      * because find_dirty_block() relies on these values (e.g., we compare
2342      * last_seen_block with pss.block to see whether we searched all the
2343      * ramblocks) to detect the completion of migration.  Having NULL value
2344      * of last_seen_block can conditionally cause below loop to run forever.
2345      */
2346     if (!rs->last_seen_block) {
2347         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2348         rs->last_page = 0;
2349     }
2350
2351     pss_init(pss, rs->last_seen_block, rs->last_page);
2352
2353     while (true){
2354         if (!get_queued_page(rs, pss)) {
2355             /* priority queue empty, so just search for something dirty */
2356             int res = find_dirty_block(rs, pss);
2357             if (res != PAGE_DIRTY_FOUND) {
2358                 if (res == PAGE_ALL_CLEAN) {
2359                     break;
2360                 } else if (res == PAGE_TRY_AGAIN) {
2361                     continue;
2362                 } else if (res < 0) {
2363                     pages = res;
2364                     break;
2365                 }
2366             }
2367         }
2368         pages = ram_save_host_page(rs, pss);
2369         if (pages) {
2370             break;
2371         }
2372     }
2373
2374     rs->last_seen_block = pss->block;
2375     rs->last_page = pss->page;
2376
2377     return pages;
2378 }
2379
2380 static uint64_t ram_bytes_total_with_ignored(void)
2381 {
2382     RAMBlock *block;
2383     uint64_t total = 0;
2384
2385     RCU_READ_LOCK_GUARD();
2386
2387     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2388         total += block->used_length;
2389     }
2390     return total;
2391 }
2392
2393 uint64_t ram_bytes_total(void)
2394 {
2395     RAMBlock *block;
2396     uint64_t total = 0;
2397
2398     RCU_READ_LOCK_GUARD();
2399
2400     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2401         total += block->used_length;
2402     }
2403     return total;
2404 }
2405
2406 static void xbzrle_load_setup(void)
2407 {
2408     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2409 }
2410
2411 static void xbzrle_load_cleanup(void)
2412 {
2413     g_free(XBZRLE.decoded_buf);
2414     XBZRLE.decoded_buf = NULL;
2415 }
2416
2417 static void ram_state_cleanup(RAMState **rsp)
2418 {
2419     if (*rsp) {
2420         migration_page_queue_free(*rsp);
2421         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2422         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2423         g_free(*rsp);
2424         *rsp = NULL;
2425     }
2426 }
2427
2428 static void xbzrle_cleanup(void)
2429 {
2430     XBZRLE_cache_lock();
2431     if (XBZRLE.cache) {
2432         cache_fini(XBZRLE.cache);
2433         g_free(XBZRLE.encoded_buf);
2434         g_free(XBZRLE.current_buf);
2435         g_free(XBZRLE.zero_target_page);
2436         XBZRLE.cache = NULL;
2437         XBZRLE.encoded_buf = NULL;
2438         XBZRLE.current_buf = NULL;
2439         XBZRLE.zero_target_page = NULL;
2440     }
2441     XBZRLE_cache_unlock();
2442 }
2443
2444 static void ram_save_cleanup(void *opaque)
2445 {
2446     RAMState **rsp = opaque;
2447     RAMBlock *block;
2448
2449     /* We don't use dirty log with background snapshots */
2450     if (!migrate_background_snapshot()) {
2451         /* caller have hold iothread lock or is in a bh, so there is
2452          * no writing race against the migration bitmap
2453          */
2454         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2455             /*
2456              * do not stop dirty log without starting it, since
2457              * memory_global_dirty_log_stop will assert that
2458              * memory_global_dirty_log_start/stop used in pairs
2459              */
2460             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2461         }
2462     }
2463
2464     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2465         g_free(block->clear_bmap);
2466         block->clear_bmap = NULL;
2467         g_free(block->bmap);
2468         block->bmap = NULL;
2469     }
2470
2471     xbzrle_cleanup();
2472     compress_threads_save_cleanup();
2473     ram_state_cleanup(rsp);
2474     g_free(migration_ops);
2475     migration_ops = NULL;
2476 }
2477
2478 static void ram_state_reset(RAMState *rs)
2479 {
2480     int i;
2481
2482     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2483         rs->pss[i].last_sent_block = NULL;
2484     }
2485
2486     rs->last_seen_block = NULL;
2487     rs->last_page = 0;
2488     rs->last_version = ram_list.version;
2489     rs->xbzrle_started = false;
2490 }
2491
2492 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2493
2494 /* **** functions for postcopy ***** */
2495
2496 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2497 {
2498     struct RAMBlock *block;
2499
2500     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2501         unsigned long *bitmap = block->bmap;
2502         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2503         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2504
2505         while (run_start < range) {
2506             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2507             ram_discard_range(block->idstr,
2508                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2509                               ((ram_addr_t)(run_end - run_start))
2510                                 << TARGET_PAGE_BITS);
2511             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2512         }
2513     }
2514 }
2515
2516 /**
2517  * postcopy_send_discard_bm_ram: discard a RAMBlock
2518  *
2519  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2520  *
2521  * @ms: current migration state
2522  * @block: RAMBlock to discard
2523  */
2524 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2525 {
2526     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2527     unsigned long current;
2528     unsigned long *bitmap = block->bmap;
2529
2530     for (current = 0; current < end; ) {
2531         unsigned long one = find_next_bit(bitmap, end, current);
2532         unsigned long zero, discard_length;
2533
2534         if (one >= end) {
2535             break;
2536         }
2537
2538         zero = find_next_zero_bit(bitmap, end, one + 1);
2539
2540         if (zero >= end) {
2541             discard_length = end - one;
2542         } else {
2543             discard_length = zero - one;
2544         }
2545         postcopy_discard_send_range(ms, one, discard_length);
2546         current = one + discard_length;
2547     }
2548 }
2549
2550 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2551
2552 /**
2553  * postcopy_each_ram_send_discard: discard all RAMBlocks
2554  *
2555  * Utility for the outgoing postcopy code.
2556  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2557  *   passing it bitmap indexes and name.
2558  * (qemu_ram_foreach_block ends up passing unscaled lengths
2559  *  which would mean postcopy code would have to deal with target page)
2560  *
2561  * @ms: current migration state
2562  */
2563 static void postcopy_each_ram_send_discard(MigrationState *ms)
2564 {
2565     struct RAMBlock *block;
2566
2567     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2568         postcopy_discard_send_init(ms, block->idstr);
2569
2570         /*
2571          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2572          * host-page size chunks, mark any partially dirty host-page size
2573          * chunks as all dirty.  In this case the host-page is the host-page
2574          * for the particular RAMBlock, i.e. it might be a huge page.
2575          */
2576         postcopy_chunk_hostpages_pass(ms, block);
2577
2578         /*
2579          * Postcopy sends chunks of bitmap over the wire, but it
2580          * just needs indexes at this point, avoids it having
2581          * target page specific code.
2582          */
2583         postcopy_send_discard_bm_ram(ms, block);
2584         postcopy_discard_send_finish(ms);
2585     }
2586 }
2587
2588 /**
2589  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2590  *
2591  * Helper for postcopy_chunk_hostpages; it's called twice to
2592  * canonicalize the two bitmaps, that are similar, but one is
2593  * inverted.
2594  *
2595  * Postcopy requires that all target pages in a hostpage are dirty or
2596  * clean, not a mix.  This function canonicalizes the bitmaps.
2597  *
2598  * @ms: current migration state
2599  * @block: block that contains the page we want to canonicalize
2600  */
2601 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2602 {
2603     RAMState *rs = ram_state;
2604     unsigned long *bitmap = block->bmap;
2605     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2606     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2607     unsigned long run_start;
2608
2609     if (block->page_size == TARGET_PAGE_SIZE) {
2610         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2611         return;
2612     }
2613
2614     /* Find a dirty page */
2615     run_start = find_next_bit(bitmap, pages, 0);
2616
2617     while (run_start < pages) {
2618
2619         /*
2620          * If the start of this run of pages is in the middle of a host
2621          * page, then we need to fixup this host page.
2622          */
2623         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2624             /* Find the end of this run */
2625             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2626             /*
2627              * If the end isn't at the start of a host page, then the
2628              * run doesn't finish at the end of a host page
2629              * and we need to discard.
2630              */
2631         }
2632
2633         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2634             unsigned long page;
2635             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2636                                                              host_ratio);
2637             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2638
2639             /* Clean up the bitmap */
2640             for (page = fixup_start_addr;
2641                  page < fixup_start_addr + host_ratio; page++) {
2642                 /*
2643                  * Remark them as dirty, updating the count for any pages
2644                  * that weren't previously dirty.
2645                  */
2646                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2647             }
2648         }
2649
2650         /* Find the next dirty page for the next iteration */
2651         run_start = find_next_bit(bitmap, pages, run_start);
2652     }
2653 }
2654
2655 /**
2656  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2657  *
2658  * Transmit the set of pages to be discarded after precopy to the target
2659  * these are pages that:
2660  *     a) Have been previously transmitted but are now dirty again
2661  *     b) Pages that have never been transmitted, this ensures that
2662  *        any pages on the destination that have been mapped by background
2663  *        tasks get discarded (transparent huge pages is the specific concern)
2664  * Hopefully this is pretty sparse
2665  *
2666  * @ms: current migration state
2667  */
2668 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2669 {
2670     RAMState *rs = ram_state;
2671
2672     RCU_READ_LOCK_GUARD();
2673
2674     /* This should be our last sync, the src is now paused */
2675     migration_bitmap_sync(rs, false);
2676
2677     /* Easiest way to make sure we don't resume in the middle of a host-page */
2678     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2679     rs->last_seen_block = NULL;
2680     rs->last_page = 0;
2681
2682     postcopy_each_ram_send_discard(ms);
2683
2684     trace_ram_postcopy_send_discard_bitmap();
2685 }
2686
2687 /**
2688  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2689  *
2690  * Returns zero on success
2691  *
2692  * @rbname: name of the RAMBlock of the request. NULL means the
2693  *          same that last one.
2694  * @start: RAMBlock starting page
2695  * @length: RAMBlock size
2696  */
2697 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2698 {
2699     trace_ram_discard_range(rbname, start, length);
2700
2701     RCU_READ_LOCK_GUARD();
2702     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2703
2704     if (!rb) {
2705         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2706         return -1;
2707     }
2708
2709     /*
2710      * On source VM, we don't need to update the received bitmap since
2711      * we don't even have one.
2712      */
2713     if (rb->receivedmap) {
2714         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2715                      length >> qemu_target_page_bits());
2716     }
2717
2718     return ram_block_discard_range(rb, start, length);
2719 }
2720
2721 /*
2722  * For every allocation, we will try not to crash the VM if the
2723  * allocation failed.
2724  */
2725 static int xbzrle_init(void)
2726 {
2727     Error *local_err = NULL;
2728
2729     if (!migrate_xbzrle()) {
2730         return 0;
2731     }
2732
2733     XBZRLE_cache_lock();
2734
2735     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2736     if (!XBZRLE.zero_target_page) {
2737         error_report("%s: Error allocating zero page", __func__);
2738         goto err_out;
2739     }
2740
2741     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2742                               TARGET_PAGE_SIZE, &local_err);
2743     if (!XBZRLE.cache) {
2744         error_report_err(local_err);
2745         goto free_zero_page;
2746     }
2747
2748     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2749     if (!XBZRLE.encoded_buf) {
2750         error_report("%s: Error allocating encoded_buf", __func__);
2751         goto free_cache;
2752     }
2753
2754     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2755     if (!XBZRLE.current_buf) {
2756         error_report("%s: Error allocating current_buf", __func__);
2757         goto free_encoded_buf;
2758     }
2759
2760     /* We are all good */
2761     XBZRLE_cache_unlock();
2762     return 0;
2763
2764 free_encoded_buf:
2765     g_free(XBZRLE.encoded_buf);
2766     XBZRLE.encoded_buf = NULL;
2767 free_cache:
2768     cache_fini(XBZRLE.cache);
2769     XBZRLE.cache = NULL;
2770 free_zero_page:
2771     g_free(XBZRLE.zero_target_page);
2772     XBZRLE.zero_target_page = NULL;
2773 err_out:
2774     XBZRLE_cache_unlock();
2775     return -ENOMEM;
2776 }
2777
2778 static int ram_state_init(RAMState **rsp)
2779 {
2780     *rsp = g_try_new0(RAMState, 1);
2781
2782     if (!*rsp) {
2783         error_report("%s: Init ramstate fail", __func__);
2784         return -1;
2785     }
2786
2787     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2788     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2789     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2790     (*rsp)->ram_bytes_total = ram_bytes_total();
2791
2792     /*
2793      * Count the total number of pages used by ram blocks not including any
2794      * gaps due to alignment or unplugs.
2795      * This must match with the initial values of dirty bitmap.
2796      */
2797     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2798     ram_state_reset(*rsp);
2799
2800     return 0;
2801 }
2802
2803 static void ram_list_init_bitmaps(void)
2804 {
2805     MigrationState *ms = migrate_get_current();
2806     RAMBlock *block;
2807     unsigned long pages;
2808     uint8_t shift;
2809
2810     /* Skip setting bitmap if there is no RAM */
2811     if (ram_bytes_total()) {
2812         shift = ms->clear_bitmap_shift;
2813         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2814             error_report("clear_bitmap_shift (%u) too big, using "
2815                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2816             shift = CLEAR_BITMAP_SHIFT_MAX;
2817         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2818             error_report("clear_bitmap_shift (%u) too small, using "
2819                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2820             shift = CLEAR_BITMAP_SHIFT_MIN;
2821         }
2822
2823         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2824             pages = block->max_length >> TARGET_PAGE_BITS;
2825             /*
2826              * The initial dirty bitmap for migration must be set with all
2827              * ones to make sure we'll migrate every guest RAM page to
2828              * destination.
2829              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2830              * new migration after a failed migration, ram_list.
2831              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2832              * guest memory.
2833              */
2834             block->bmap = bitmap_new(pages);
2835             bitmap_set(block->bmap, 0, pages);
2836             block->clear_bmap_shift = shift;
2837             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2838         }
2839     }
2840 }
2841
2842 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2843 {
2844     unsigned long pages;
2845     RAMBlock *rb;
2846
2847     RCU_READ_LOCK_GUARD();
2848
2849     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2850             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2851             rs->migration_dirty_pages -= pages;
2852     }
2853 }
2854
/*
 * Set up the migration bitmaps for a new outgoing migration.
 *
 * With the iothread and ramlist locks held, the per-RAMBlock bitmaps are
 * allocated; unless a background snapshot is being taken, dirty logging is
 * then started and a first bitmap sync is run.  Once the locks are dropped
 * the discarded pages are removed from the bitmaps.
 *
 * @rs: RAM state passed on to the bitmap sync and discard fixup
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
            migration_bitmap_sync_precopy(rs, false);
        }
    }
    /* Locks are released in the reverse order of acquisition */
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    /*
     * After an eventual first bitmap sync, fixup the initial bitmap
     * containing all 1s to exclude any discarded pages from migration.
     */
    migration_bitmap_clear_discarded_pages(rs);
}
2878
2879 static int ram_init_all(RAMState **rsp)
2880 {
2881     if (ram_state_init(rsp)) {
2882         return -1;
2883     }
2884
2885     if (xbzrle_init()) {
2886         ram_state_cleanup(rsp);
2887         return -1;
2888     }
2889
2890     ram_init_bitmaps(*rsp);
2891
2892     return 0;
2893 }
2894
2895 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2896 {
2897     RAMBlock *block;
2898     uint64_t pages = 0;
2899
2900     /*
2901      * Postcopy is not using xbzrle/compression, so no need for that.
2902      * Also, since source are already halted, we don't need to care
2903      * about dirty page logging as well.
2904      */
2905
2906     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2907         pages += bitmap_count_one(block->bmap,
2908                                   block->used_length >> TARGET_PAGE_BITS);
2909     }
2910
2911     /* This may not be aligned with current bitmaps. Recalculate. */
2912     rs->migration_dirty_pages = pages;
2913
2914     ram_state_reset(rs);
2915
2916     /* Update RAMState cache of output QEMUFile */
2917     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2918
2919     trace_ram_state_resume_prepare(pages);
2920 }
2921
2922 /*
2923  * This function clears bits of the free pages reported by the caller from the
2924  * migration dirty bitmap. @addr is the host address corresponding to the
2925  * start of the continuous guest free pages, and @len is the total bytes of
2926  * those pages.
2927  */
2928 void qemu_guest_free_page_hint(void *addr, size_t len)
2929 {
2930     RAMBlock *block;
2931     ram_addr_t offset;
2932     size_t used_len, start, npages;
2933     MigrationState *s = migrate_get_current();
2934
2935     /* This function is currently expected to be used during live migration */
2936     if (!migration_is_setup_or_active(s->state)) {
2937         return;
2938     }
2939
2940     for (; len > 0; len -= used_len, addr += used_len) {
2941         block = qemu_ram_block_from_host(addr, false, &offset);
2942         if (unlikely(!block || offset >= block->used_length)) {
2943             /*
2944              * The implementation might not support RAMBlock resize during
2945              * live migration, but it could happen in theory with future
2946              * updates. So we add a check here to capture that case.
2947              */
2948             error_report_once("%s unexpected error", __func__);
2949             return;
2950         }
2951
2952         if (len <= block->used_length - offset) {
2953             used_len = len;
2954         } else {
2955             used_len = block->used_length - offset;
2956         }
2957
2958         start = offset >> TARGET_PAGE_BITS;
2959         npages = used_len >> TARGET_PAGE_BITS;
2960
2961         qemu_mutex_lock(&ram_state->bitmap_mutex);
2962         /*
2963          * The skipped free pages are equavalent to be sent from clear_bmap's
2964          * perspective, so clear the bits from the memory region bitmap which
2965          * are initially set. Otherwise those skipped pages will be sent in
2966          * the next round after syncing from the memory region bitmap.
2967          */
2968         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2969         ram_state->migration_dirty_pages -=
2970                       bitmap_count_one_with_offset(block->bmap, start, npages);
2971         bitmap_clear(block->bmap, start, npages);
2972         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2973     }
2974 }
2975
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
2982
2983 /**
2984  * ram_save_setup: Setup RAM for migration
2985  *
2986  * Returns zero to indicate success and negative for error
2987  *
2988  * @f: QEMUFile where to send the data
2989  * @opaque: RAMState pointer
2990  */
2991 static int ram_save_setup(QEMUFile *f, void *opaque)
2992 {
2993     RAMState **rsp = opaque;
2994     RAMBlock *block;
2995     int ret;
2996
2997     if (compress_threads_save_setup()) {
2998         return -1;
2999     }
3000
3001     /* migration has already setup the bitmap, reuse it. */
3002     if (!migration_in_colo_state()) {
3003         if (ram_init_all(rsp) != 0) {
3004             compress_threads_save_cleanup();
3005             return -1;
3006         }
3007     }
3008     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3009
3010     WITH_RCU_READ_LOCK_GUARD() {
3011         qemu_put_be64(f, ram_bytes_total_with_ignored()
3012                          | RAM_SAVE_FLAG_MEM_SIZE);
3013
3014         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3015             qemu_put_byte(f, strlen(block->idstr));
3016             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3017             qemu_put_be64(f, block->used_length);
3018             if (migrate_postcopy_ram() && block->page_size !=
3019                                           qemu_host_page_size) {
3020                 qemu_put_be64(f, block->page_size);
3021             }
3022             if (migrate_ignore_shared()) {
3023                 qemu_put_be64(f, block->mr->addr);
3024             }
3025         }
3026     }
3027
3028     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3029     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3030
3031     migration_ops = g_malloc0(sizeof(MigrationOps));
3032     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3033     ret = multifd_send_sync_main(f);
3034     if (ret < 0) {
3035         return ret;
3036     }
3037
3038     if (!migrate_multifd_flush_after_each_section()) {
3039         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3040     }
3041
3042     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3043     qemu_fflush(f);
3044
3045     return 0;
3046 }
3047
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until ram_find_and_save_block() has nothing left, the
 * loop condition below stops holding, an error is seen, or more than
 * MAX_WAIT milliseconds have elapsed, then terminates the section with an
 * EOS marker.
 *
 * Returns 1 if the pass ended because no more pages were left to send,
 * 0 if it ended for another reason, and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    /* Becomes 1 once ram_find_and_save_block() reports nothing left */
    int done = 0;

    if (blk_mig_bulk_active()) {
        /*
         * Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless.
         */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least released it in a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep sending while migration_rate_exceeded() returns 0 or while
         * postcopy_has_request() is true; ret holds the last result of
         * migration_rate_exceeded().
         */
        while ((ret = migration_rate_exceeded(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to sent */
            if (pages == 0) {
                done = 1;
                break;
            }

            /* A negative page count is an error code: latch it on the file */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                ram_flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time since t0, converted from ns to ms */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    /* Only close the section when no error occurred and migration is live */
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        if (migrate_multifd_flush_after_each_section()) {
            ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
            if (ret < 0) {
                return ret;
            }
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8 bytes of the EOS marker just written */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3169
3170 /**
3171  * ram_save_complete: function called to send the remaining amount of ram
3172  *
3173  * Returns zero to indicate success or negative on error
3174  *
3175  * Called with iothread lock
3176  *
3177  * @f: QEMUFile where to send the data
3178  * @opaque: RAMState pointer
3179  */
3180 static int ram_save_complete(QEMUFile *f, void *opaque)
3181 {
3182     RAMState **temp = opaque;
3183     RAMState *rs = *temp;
3184     int ret = 0;
3185
3186     rs->last_stage = !migration_in_colo_state();
3187
3188     WITH_RCU_READ_LOCK_GUARD() {
3189         if (!migration_in_postcopy()) {
3190             migration_bitmap_sync_precopy(rs, true);
3191         }
3192
3193         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3194
3195         /* try transferring iterative blocks of memory */
3196
3197         /* flush all remaining blocks regardless of rate limiting */
3198         qemu_mutex_lock(&rs->bitmap_mutex);
3199         while (true) {
3200             int pages;
3201
3202             pages = ram_find_and_save_block(rs);
3203             /* no more blocks to sent */
3204             if (pages == 0) {
3205                 break;
3206             }
3207             if (pages < 0) {
3208                 ret = pages;
3209                 break;
3210             }
3211         }
3212         qemu_mutex_unlock(&rs->bitmap_mutex);
3213
3214         ram_flush_compressed_data(rs);
3215         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3216     }
3217
3218     if (ret < 0) {
3219         return ret;
3220     }
3221
3222     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3223     if (ret < 0) {
3224         return ret;
3225     }
3226
3227     if (!migrate_multifd_flush_after_each_section()) {
3228         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3229     }
3230     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3231     qemu_fflush(f);
3232
3233     return 0;
3234 }
3235
3236 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3237                                        uint64_t *can_postcopy)
3238 {
3239     RAMState **temp = opaque;
3240     RAMState *rs = *temp;
3241
3242     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3243
3244     if (migrate_postcopy_ram()) {
3245         /* We can do postcopy, and all the data is postcopiable */
3246         *can_postcopy += remaining_size;
3247     } else {
3248         *must_precopy += remaining_size;
3249     }
3250 }
3251
3252 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3253                                     uint64_t *can_postcopy)
3254 {
3255     MigrationState *s = migrate_get_current();
3256     RAMState **temp = opaque;
3257     RAMState *rs = *temp;
3258
3259     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3260
3261     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3262         qemu_mutex_lock_iothread();
3263         WITH_RCU_READ_LOCK_GUARD() {
3264             migration_bitmap_sync_precopy(rs, false);
3265         }
3266         qemu_mutex_unlock_iothread();
3267         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3268     }
3269
3270     if (migrate_postcopy_ram()) {
3271         /* We can do postcopy, and all the data is postcopiable */
3272         *can_postcopy += remaining_size;
3273     } else {
3274         *must_precopy += remaining_size;
3275     }
3276 }
3277
3278 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3279 {
3280     unsigned int xh_len;
3281     int xh_flags;
3282     uint8_t *loaded_data;
3283
3284     /* extract RLE header */
3285     xh_flags = qemu_get_byte(f);
3286     xh_len = qemu_get_be16(f);
3287
3288     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3289         error_report("Failed to load XBZRLE page - wrong compression!");
3290         return -1;
3291     }
3292
3293     if (xh_len > TARGET_PAGE_SIZE) {
3294         error_report("Failed to load XBZRLE page - len overflow!");
3295         return -1;
3296     }
3297     loaded_data = XBZRLE.decoded_buf;
3298     /* load data and decode */
3299     /* it can change loaded_data to point to an internal buffer */
3300     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3301
3302     /* decode RLE */
3303     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3304                              TARGET_PAGE_SIZE) == -1) {
3305         error_report("Failed to load XBZRLE page - decode error!");
3306         return -1;
3307     }
3308
3309     return 0;
3310 }
3311
3312 /**
3313  * ram_block_from_stream: read a RAMBlock id from the migration stream
3314  *
3315  * Must be called from within a rcu critical section.
3316  *
3317  * Returns a pointer from within the RCU-protected ram_list.
3318  *
3319  * @mis: the migration incoming state pointer
3320  * @f: QEMUFile where to read the data from
3321  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3322  * @channel: the channel we're using
3323  */
3324 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3325                                               QEMUFile *f, int flags,
3326                                               int channel)
3327 {
3328     RAMBlock *block = mis->last_recv_block[channel];
3329     char id[256];
3330     uint8_t len;
3331
3332     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3333         if (!block) {
3334             error_report("Ack, bad migration stream!");
3335             return NULL;
3336         }
3337         return block;
3338     }
3339
3340     len = qemu_get_byte(f);
3341     qemu_get_buffer(f, (uint8_t *)id, len);
3342     id[len] = 0;
3343
3344     block = qemu_ram_block_by_name(id);
3345     if (!block) {
3346         error_report("Can't find block %s", id);
3347         return NULL;
3348     }
3349
3350     if (ramblock_is_ignored(block)) {
3351         error_report("block %s should not be migrated !", id);
3352         return NULL;
3353     }
3354
3355     mis->last_recv_block[channel] = block;
3356
3357     return block;
3358 }
3359
3360 static inline void *host_from_ram_block_offset(RAMBlock *block,
3361                                                ram_addr_t offset)
3362 {
3363     if (!offset_in_ramblock(block, offset)) {
3364         return NULL;
3365     }
3366
3367     return block->host + offset;
3368 }
3369
3370 static void *host_page_from_ram_block_offset(RAMBlock *block,
3371                                              ram_addr_t offset)
3372 {
3373     /* Note: Explicitly no check against offset_in_ramblock(). */
3374     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3375                                    block->page_size);
3376 }
3377
3378 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3379                                                          ram_addr_t offset)
3380 {
3381     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3382 }
3383
3384 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3385 {
3386     qemu_mutex_lock(&ram_state->bitmap_mutex);
3387     for (int i = 0; i < pages; i++) {
3388         ram_addr_t offset = normal[i];
3389         ram_state->migration_dirty_pages += !test_and_set_bit(
3390                                                 offset >> TARGET_PAGE_BITS,
3391                                                 block->bmap);
3392     }
3393     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3394 }
3395
3396 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3397                              ram_addr_t offset, bool record_bitmap)
3398 {
3399     if (!offset_in_ramblock(block, offset)) {
3400         return NULL;
3401     }
3402     if (!block->colo_cache) {
3403         error_report("%s: colo_cache is NULL in block :%s",
3404                      __func__, block->idstr);
3405         return NULL;
3406     }
3407
3408     /*
3409     * During colo checkpoint, we need bitmap of these migrated pages.
3410     * It help us to decide which pages in ram cache should be flushed
3411     * into VM's RAM later.
3412     */
3413     if (record_bitmap) {
3414         colo_record_bitmap(block, &offset, 1);
3415     }
3416     return block->colo_cache + offset;
3417 }
3418
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with the byte @ch.  When @ch is zero and the
 * buffer already reads as all zeroes, the memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and asked for zero: nothing to write */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3435
/*
 * Initialise the global ram_state for COLO.
 *
 * The return value of ram_state_init() is not checked here.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}
3440
3441 /*
3442  * colo cache: this is for secondary VM, we cache the whole
3443  * memory of the secondary VM, it is need to hold the global lock
3444  * to call this helper.
3445  */
3446 int colo_init_ram_cache(void)
3447 {
3448     RAMBlock *block;
3449
3450     WITH_RCU_READ_LOCK_GUARD() {
3451         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3452             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3453                                                     NULL, false, false);
3454             if (!block->colo_cache) {
3455                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3456                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3457                              block->used_length);
3458                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3459                     if (block->colo_cache) {
3460                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3461                         block->colo_cache = NULL;
3462                     }
3463                 }
3464                 return -errno;
3465             }
3466             if (!machine_dump_guest_core(current_machine)) {
3467                 qemu_madvise(block->colo_cache, block->used_length,
3468                              QEMU_MADV_DONTDUMP);
3469             }
3470         }
3471     }
3472
3473     /*
3474     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3475     * with to decide which page in cache should be flushed into SVM's RAM. Here
3476     * we use the same name 'ram_bitmap' as for migration.
3477     */
3478     if (ram_bytes_total()) {
3479         RAMBlock *block;
3480
3481         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3482             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3483             block->bmap = bitmap_new(pages);
3484         }
3485     }
3486
3487     colo_init_ram_state();
3488     return 0;
3489 }
3490
/*
 * Start dirty logging on the COLO incoming side with empty dirty bitmaps.
 *
 * With the iothread and ramlist locks held, the dirty log is synced into
 * each non-ignored RAMBlock's bitmap, that bitmap is then zeroed, dirty
 * logging is started and the dirty page counter is reset to 0.
 *
 * TODO: duplicated with ram_init_bitmaps
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync(false);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    /* Every bitmap was just zeroed, so no page counts as dirty */
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3512
3513 /* It is need to hold the global lock to call this helper */
3514 void colo_release_ram_cache(void)
3515 {
3516     RAMBlock *block;
3517
3518     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3519     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3520         g_free(block->bmap);
3521         block->bmap = NULL;
3522     }
3523
3524     WITH_RCU_READ_LOCK_GUARD() {
3525         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3526             if (block->colo_cache) {
3527                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3528                 block->colo_cache = NULL;
3529             }
3530         }
3531     }
3532     ram_state_cleanup(&ram_state);
3533 }
3534
/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Sets up XBZRLE loading and initialises the RAMBlock received maps.
 *
 * Returns zero; no error is reported by this function
 *
 * @f: QEMUFile where to receive the data (not used here)
 * @opaque: not used here
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}
3550
3551 static int ram_load_cleanup(void *opaque)
3552 {
3553     RAMBlock *rb;
3554
3555     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3556         qemu_ram_block_writeback(rb);
3557     }
3558
3559     xbzrle_load_cleanup();
3560
3561     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3562         g_free(rb->receivedmap);
3563         rb->receivedmap = NULL;
3564     }
3565
3566     return 0;
3567 }
3568
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. This is a thin wrapper: the similarly named
 * postcopy_ram_incoming_init() in postcopy-ram does the work, and its
 * return value is passed through unchanged.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
3584
3585 /**
3586  * ram_load_postcopy: load a page in postcopy case
3587  *
3588  * Returns 0 for success or -errno in case of error
3589  *
3590  * Called in postcopy mode by ram_load().
3591  * rcu_read_lock is taken prior to this being called.
3592  *
3593  * @f: QEMUFile where to send the data
3594  * @channel: the channel to use for loading
3595  */
3596 int ram_load_postcopy(QEMUFile *f, int channel)
3597 {
3598     int flags = 0, ret = 0;
3599     bool place_needed = false;
3600     bool matches_target_page_size = false;
3601     MigrationIncomingState *mis = migration_incoming_get_current();
3602     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3603
3604     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3605         ram_addr_t addr;
3606         void *page_buffer = NULL;
3607         void *place_source = NULL;
3608         RAMBlock *block = NULL;
3609         uint8_t ch;
3610         int len;
3611
3612         addr = qemu_get_be64(f);
3613
3614         /*
3615          * If qemu file error, we should stop here, and then "addr"
3616          * may be invalid
3617          */
3618         ret = qemu_file_get_error(f);
3619         if (ret) {
3620             break;
3621         }
3622
3623         flags = addr & ~TARGET_PAGE_MASK;
3624         addr &= TARGET_PAGE_MASK;
3625
3626         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3627         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3628                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3629             block = ram_block_from_stream(mis, f, flags, channel);
3630             if (!block) {
3631                 ret = -EINVAL;
3632                 break;
3633             }
3634
3635             /*
3636              * Relying on used_length is racy and can result in false positives.
3637              * We might place pages beyond used_length in case RAM was shrunk
3638              * while in postcopy, which is fine - trying to place via
3639              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3640              */
3641             if (!block->host || addr >= block->postcopy_length) {
3642                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3643                 ret = -EINVAL;
3644                 break;
3645             }
3646             tmp_page->target_pages++;
3647             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3648             /*
3649              * Postcopy requires that we place whole host pages atomically;
3650              * these may be huge pages for RAMBlocks that are backed by
3651              * hugetlbfs.
3652              * To make it atomic, the data is read into a temporary page
3653              * that's moved into place later.
3654              * The migration protocol uses,  possibly smaller, target-pages
3655              * however the source ensures it always sends all the components
3656              * of a host page in one chunk.
3657              */
3658             page_buffer = tmp_page->tmp_huge_page +
3659                           host_page_offset_from_ram_block_offset(block, addr);
3660             /* If all TP are zero then we can optimise the place */
3661             if (tmp_page->target_pages == 1) {
3662                 tmp_page->host_addr =
3663                     host_page_from_ram_block_offset(block, addr);
3664             } else if (tmp_page->host_addr !=
3665                        host_page_from_ram_block_offset(block, addr)) {
3666                 /* not the 1st TP within the HP */
3667                 error_report("Non-same host page detected on channel %d: "
3668                              "Target host page %p, received host page %p "
3669                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3670                              channel, tmp_page->host_addr,
3671                              host_page_from_ram_block_offset(block, addr),
3672                              block->idstr, addr, tmp_page->target_pages);
3673                 ret = -EINVAL;
3674                 break;
3675             }
3676
3677             /*
3678              * If it's the last part of a host page then we place the host
3679              * page
3680              */
3681             if (tmp_page->target_pages ==
3682                 (block->page_size / TARGET_PAGE_SIZE)) {
3683                 place_needed = true;
3684             }
3685             place_source = tmp_page->tmp_huge_page;
3686         }
3687
3688         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3689         case RAM_SAVE_FLAG_ZERO:
3690             ch = qemu_get_byte(f);
3691             /*
3692              * Can skip to set page_buffer when
3693              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3694              */
3695             if (ch || !matches_target_page_size) {
3696                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3697             }
3698             if (ch) {
3699                 tmp_page->all_zero = false;
3700             }
3701             break;
3702
3703         case RAM_SAVE_FLAG_PAGE:
3704             tmp_page->all_zero = false;
3705             if (!matches_target_page_size) {
3706                 /* For huge pages, we always use temporary buffer */
3707                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3708             } else {
3709                 /*
3710                  * For small pages that matches target page size, we
3711                  * avoid the qemu_file copy.  Instead we directly use
3712                  * the buffer of QEMUFile to place the page.  Note: we
3713                  * cannot do any QEMUFile operation before using that
3714                  * buffer to make sure the buffer is valid when
3715                  * placing the page.
3716                  */
3717                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3718                                          TARGET_PAGE_SIZE);
3719             }
3720             break;
3721         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3722             tmp_page->all_zero = false;
3723             len = qemu_get_be32(f);
3724             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3725                 error_report("Invalid compressed data length: %d", len);
3726                 ret = -EINVAL;
3727                 break;
3728             }
3729             decompress_data_with_multi_threads(f, page_buffer, len);
3730             break;
3731         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3732             multifd_recv_sync_main();
3733             break;
3734         case RAM_SAVE_FLAG_EOS:
3735             /* normal exit */
3736             if (migrate_multifd_flush_after_each_section()) {
3737                 multifd_recv_sync_main();
3738             }
3739             break;
3740         default:
3741             error_report("Unknown combination of migration flags: 0x%x"
3742                          " (postcopy mode)", flags);
3743             ret = -EINVAL;
3744             break;
3745         }
3746
3747         /* Got the whole host page, wait for decompress before placing. */
3748         if (place_needed) {
3749             ret |= wait_for_decompress_done();
3750         }
3751
3752         /* Detect for any possible file errors */
3753         if (!ret && qemu_file_get_error(f)) {
3754             ret = qemu_file_get_error(f);
3755         }
3756
3757         if (!ret && place_needed) {
3758             if (tmp_page->all_zero) {
3759                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3760             } else {
3761                 ret = postcopy_place_page(mis, tmp_page->host_addr,
3762                                           place_source, block);
3763             }
3764             place_needed = false;
3765             postcopy_temp_page_reset(tmp_page);
3766         }
3767     }
3768
3769     return ret;
3770 }
3771
3772 static bool postcopy_is_running(void)
3773 {
3774     PostcopyState ps = postcopy_state_get();
3775     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3776 }
3777
3778 /*
3779  * Flush content of RAM cache into SVM's memory.
3780  * Only flush the pages that be dirtied by PVM or SVM or both.
3781  */
3782 void colo_flush_ram_cache(void)
3783 {
3784     RAMBlock *block = NULL;
3785     void *dst_host;
3786     void *src_host;
3787     unsigned long offset = 0;
3788
3789     memory_global_dirty_log_sync(false);
3790     qemu_mutex_lock(&ram_state->bitmap_mutex);
3791     WITH_RCU_READ_LOCK_GUARD() {
3792         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3793             ramblock_sync_dirty_bitmap(ram_state, block);
3794         }
3795     }
3796
3797     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3798     WITH_RCU_READ_LOCK_GUARD() {
3799         block = QLIST_FIRST_RCU(&ram_list.blocks);
3800
3801         while (block) {
3802             unsigned long num = 0;
3803
3804             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3805             if (!offset_in_ramblock(block,
3806                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3807                 offset = 0;
3808                 num = 0;
3809                 block = QLIST_NEXT_RCU(block, next);
3810             } else {
3811                 unsigned long i = 0;
3812
3813                 for (i = 0; i < num; i++) {
3814                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3815                 }
3816                 dst_host = block->host
3817                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3818                 src_host = block->colo_cache
3819                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3820                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3821                 offset += num;
3822             }
3823         }
3824     }
3825     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3826     trace_colo_flush_ram_cache_end();
3827 }
3828
3829 /**
3830  * ram_load_precopy: load pages in precopy case
3831  *
3832  * Returns 0 for success or -errno in case of error
3833  *
3834  * Called in precopy mode by ram_load().
3835  * rcu_read_lock is taken prior to this being called.
3836  *
3837  * @f: QEMUFile where to send the data
3838  */
3839 static int ram_load_precopy(QEMUFile *f)
3840 {
3841     MigrationIncomingState *mis = migration_incoming_get_current();
3842     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3843     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3844     bool postcopy_advised = migration_incoming_postcopy_advised();
3845     if (!migrate_compress()) {
3846         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3847     }
3848
3849     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3850         ram_addr_t addr, total_ram_bytes;
3851         void *host = NULL, *host_bak = NULL;
3852         uint8_t ch;
3853
3854         /*
3855          * Yield periodically to let main loop run, but an iteration of
3856          * the main loop is expensive, so do it each some iterations
3857          */
3858         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3859             aio_co_schedule(qemu_get_current_aio_context(),
3860                             qemu_coroutine_self());
3861             qemu_coroutine_yield();
3862         }
3863         i++;
3864
3865         addr = qemu_get_be64(f);
3866         flags = addr & ~TARGET_PAGE_MASK;
3867         addr &= TARGET_PAGE_MASK;
3868
3869         if (flags & invalid_flags) {
3870             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3871                 error_report("Received an unexpected compressed page");
3872             }
3873
3874             ret = -EINVAL;
3875             break;
3876         }
3877
3878         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3879                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3880             RAMBlock *block = ram_block_from_stream(mis, f, flags,
3881                                                     RAM_CHANNEL_PRECOPY);
3882
3883             host = host_from_ram_block_offset(block, addr);
3884             /*
3885              * After going into COLO stage, we should not load the page
3886              * into SVM's memory directly, we put them into colo_cache firstly.
3887              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3888              * Previously, we copied all these memory in preparing stage of COLO
3889              * while we need to stop VM, which is a time-consuming process.
3890              * Here we optimize it by a trick, back-up every page while in
3891              * migration process while COLO is enabled, though it affects the
3892              * speed of the migration, but it obviously reduce the downtime of
3893              * back-up all SVM'S memory in COLO preparing stage.
3894              */
3895             if (migration_incoming_colo_enabled()) {
3896                 if (migration_incoming_in_colo_state()) {
3897                     /* In COLO stage, put all pages into cache temporarily */
3898                     host = colo_cache_from_block_offset(block, addr, true);
3899                 } else {
3900                    /*
3901                     * In migration stage but before COLO stage,
3902                     * Put all pages into both cache and SVM's memory.
3903                     */
3904                     host_bak = colo_cache_from_block_offset(block, addr, false);
3905                 }
3906             }
3907             if (!host) {
3908                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3909                 ret = -EINVAL;
3910                 break;
3911             }
3912             if (!migration_incoming_in_colo_state()) {
3913                 ramblock_recv_bitmap_set(block, host);
3914             }
3915
3916             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3917         }
3918
3919         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3920         case RAM_SAVE_FLAG_MEM_SIZE:
3921             /* Synchronize RAM block list */
3922             total_ram_bytes = addr;
3923             while (!ret && total_ram_bytes) {
3924                 RAMBlock *block;
3925                 char id[256];
3926                 ram_addr_t length;
3927
3928                 len = qemu_get_byte(f);
3929                 qemu_get_buffer(f, (uint8_t *)id, len);
3930                 id[len] = 0;
3931                 length = qemu_get_be64(f);
3932
3933                 block = qemu_ram_block_by_name(id);
3934                 if (block && !qemu_ram_is_migratable(block)) {
3935                     error_report("block %s should not be migrated !", id);
3936                     ret = -EINVAL;
3937                 } else if (block) {
3938                     if (length != block->used_length) {
3939                         Error *local_err = NULL;
3940
3941                         ret = qemu_ram_resize(block, length,
3942                                               &local_err);
3943                         if (local_err) {
3944                             error_report_err(local_err);
3945                         }
3946                     }
3947                     /* For postcopy we need to check hugepage sizes match */
3948                     if (postcopy_advised && migrate_postcopy_ram() &&
3949                         block->page_size != qemu_host_page_size) {
3950                         uint64_t remote_page_size = qemu_get_be64(f);
3951                         if (remote_page_size != block->page_size) {
3952                             error_report("Mismatched RAM page size %s "
3953                                          "(local) %zd != %" PRId64,
3954                                          id, block->page_size,
3955                                          remote_page_size);
3956                             ret = -EINVAL;
3957                         }
3958                     }
3959                     if (migrate_ignore_shared()) {
3960                         hwaddr addr = qemu_get_be64(f);
3961                         if (ramblock_is_ignored(block) &&
3962                             block->mr->addr != addr) {
3963                             error_report("Mismatched GPAs for block %s "
3964                                          "%" PRId64 "!= %" PRId64,
3965                                          id, (uint64_t)addr,
3966                                          (uint64_t)block->mr->addr);
3967                             ret = -EINVAL;
3968                         }
3969                     }
3970                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3971                                           block->idstr);
3972                 } else {
3973                     error_report("Unknown ramblock \"%s\", cannot "
3974                                  "accept migration", id);
3975                     ret = -EINVAL;
3976                 }
3977
3978                 total_ram_bytes -= length;
3979             }
3980             break;
3981
3982         case RAM_SAVE_FLAG_ZERO:
3983             ch = qemu_get_byte(f);
3984             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3985             break;
3986
3987         case RAM_SAVE_FLAG_PAGE:
3988             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3989             break;
3990
3991         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3992             len = qemu_get_be32(f);
3993             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3994                 error_report("Invalid compressed data length: %d", len);
3995                 ret = -EINVAL;
3996                 break;
3997             }
3998             decompress_data_with_multi_threads(f, host, len);
3999             break;
4000
4001         case RAM_SAVE_FLAG_XBZRLE:
4002             if (load_xbzrle(f, addr, host) < 0) {
4003                 error_report("Failed to decompress XBZRLE page at "
4004                              RAM_ADDR_FMT, addr);
4005                 ret = -EINVAL;
4006                 break;
4007             }
4008             break;
4009         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4010             multifd_recv_sync_main();
4011             break;
4012         case RAM_SAVE_FLAG_EOS:
4013             /* normal exit */
4014             if (migrate_multifd_flush_after_each_section()) {
4015                 multifd_recv_sync_main();
4016             }
4017             break;
4018         case RAM_SAVE_FLAG_HOOK:
4019             ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4020             break;
4021         default:
4022             error_report("Unknown combination of migration flags: 0x%x", flags);
4023             ret = -EINVAL;
4024         }
4025         if (!ret) {
4026             ret = qemu_file_get_error(f);
4027         }
4028         if (!ret && host_bak) {
4029             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4030         }
4031     }
4032
4033     ret |= wait_for_decompress_done();
4034     return ret;
4035 }
4036
4037 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4038 {
4039     int ret = 0;
4040     static uint64_t seq_iter;
4041     /*
4042      * If system is running in postcopy mode, page inserts to host memory must
4043      * be atomic
4044      */
4045     bool postcopy_running = postcopy_is_running();
4046
4047     seq_iter++;
4048
4049     if (version_id != 4) {
4050         return -EINVAL;
4051     }
4052
4053     /*
4054      * This RCU critical section can be very long running.
4055      * When RCU reclaims in the code start to become numerous,
4056      * it will be necessary to reduce the granularity of this
4057      * critical section.
4058      */
4059     WITH_RCU_READ_LOCK_GUARD() {
4060         if (postcopy_running) {
4061             /*
4062              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4063              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4064              * service fast page faults.
4065              */
4066             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4067         } else {
4068             ret = ram_load_precopy(f);
4069         }
4070     }
4071     trace_ram_load_complete(ret, seq_iter);
4072
4073     return ret;
4074 }
4075
4076 static bool ram_has_postcopy(void *opaque)
4077 {
4078     RAMBlock *rb;
4079     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4080         if (ramblock_is_pmem(rb)) {
4081             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4082                          "is not supported now!", rb->idstr, rb->host);
4083             return false;
4084         }
4085     }
4086
4087     return migrate_postcopy_ram();
4088 }
4089
4090 /* Sync all the dirty bitmap with destination VM.  */
4091 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4092 {
4093     RAMBlock *block;
4094     QEMUFile *file = s->to_dst_file;
4095     int ramblock_count = 0;
4096
4097     trace_ram_dirty_bitmap_sync_start();
4098
4099     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4100         qemu_savevm_send_recv_bitmap(file, block->idstr);
4101         trace_ram_dirty_bitmap_request(block->idstr);
4102         ramblock_count++;
4103     }
4104
4105     trace_ram_dirty_bitmap_sync_wait();
4106
4107     /* Wait until all the ramblocks' dirty bitmap synced */
4108     while (ramblock_count--) {
4109         qemu_sem_wait(&s->rp_state.rp_sem);
4110     }
4111
4112     trace_ram_dirty_bitmap_sync_complete();
4113
4114     return 0;
4115 }
4116
4117 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4118 {
4119     qemu_sem_post(&s->rp_state.rp_sem);
4120 }
4121
4122 /*
4123  * Read the received bitmap, revert it as the initial dirty bitmap.
4124  * This is only used when the postcopy migration is paused but wants
4125  * to resume from a middle point.
4126  */
4127 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4128 {
4129     int ret = -EINVAL;
4130     /* from_dst_file is always valid because we're within rp_thread */
4131     QEMUFile *file = s->rp_state.from_dst_file;
4132     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4133     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4134     uint64_t size, end_mark;
4135
4136     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4137
4138     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4139         error_report("%s: incorrect state %s", __func__,
4140                      MigrationStatus_str(s->state));
4141         return -EINVAL;
4142     }
4143
4144     /*
4145      * Note: see comments in ramblock_recv_bitmap_send() on why we
4146      * need the endianness conversion, and the paddings.
4147      */
4148     local_size = ROUND_UP(local_size, 8);
4149
4150     /* Add paddings */
4151     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4152
4153     size = qemu_get_be64(file);
4154
4155     /* The size of the bitmap should match with our ramblock */
4156     if (size != local_size) {
4157         error_report("%s: ramblock '%s' bitmap size mismatch "
4158                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4159                      block->idstr, size, local_size);
4160         ret = -EINVAL;
4161         goto out;
4162     }
4163
4164     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4165     end_mark = qemu_get_be64(file);
4166
4167     ret = qemu_file_get_error(file);
4168     if (ret || size != local_size) {
4169         error_report("%s: read bitmap failed for ramblock '%s': %d"
4170                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4171                      __func__, block->idstr, ret, local_size, size);
4172         ret = -EIO;
4173         goto out;
4174     }
4175
4176     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4177         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4178                      __func__, block->idstr, end_mark);
4179         ret = -EINVAL;
4180         goto out;
4181     }
4182
4183     /*
4184      * Endianness conversion. We are during postcopy (though paused).
4185      * The dirty bitmap won't change. We can directly modify it.
4186      */
4187     bitmap_from_le(block->bmap, le_bitmap, nbits);
4188
4189     /*
4190      * What we received is "received bitmap". Revert it as the initial
4191      * dirty bitmap for this ramblock.
4192      */
4193     bitmap_complement(block->bmap, block->bmap, nbits);
4194
4195     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4196     ramblock_dirty_bitmap_clear_discarded_pages(block);
4197
4198     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4199     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4200
4201     /*
4202      * We succeeded to sync bitmap for current ramblock. If this is
4203      * the last one to sync, we need to notify the main send thread.
4204      */
4205     ram_dirty_bitmap_reload_notify(s);
4206
4207     ret = 0;
4208 out:
4209     g_free(le_bitmap);
4210     return ret;
4211 }
4212
4213 static int ram_resume_prepare(MigrationState *s, void *opaque)
4214 {
4215     RAMState *rs = *(RAMState **)opaque;
4216     int ret;
4217
4218     ret = ram_dirty_bitmap_sync_all(s, rs);
4219     if (ret) {
4220         return ret;
4221     }
4222
4223     ram_state_resume_prepare(rs, s->to_dst_file);
4224
4225     return 0;
4226 }
4227
4228 void postcopy_preempt_shutdown_file(MigrationState *s)
4229 {
4230     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4231     qemu_fflush(s->postcopy_qemufile_src);
4232 }
4233
4234 static SaveVMHandlers savevm_ram_handlers = {
4235     .save_setup = ram_save_setup,
4236     .save_live_iterate = ram_save_iterate,
4237     .save_live_complete_postcopy = ram_save_complete,
4238     .save_live_complete_precopy = ram_save_complete,
4239     .has_postcopy = ram_has_postcopy,
4240     .state_pending_exact = ram_state_pending_exact,
4241     .state_pending_estimate = ram_state_pending_estimate,
4242     .load_state = ram_load,
4243     .save_cleanup = ram_save_cleanup,
4244     .load_setup = ram_load_setup,
4245     .load_cleanup = ram_load_cleanup,
4246     .resume_prepare = ram_resume_prepare,
4247 };
4248
4249 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4250                                       size_t old_size, size_t new_size)
4251 {
4252     PostcopyState ps = postcopy_state_get();
4253     ram_addr_t offset;
4254     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4255     Error *err = NULL;
4256
4257     if (ramblock_is_ignored(rb)) {
4258         return;
4259     }
4260
4261     if (!migration_is_idle()) {
4262         /*
4263          * Precopy code on the source cannot deal with the size of RAM blocks
4264          * changing at random points in time - especially after sending the
4265          * RAM block sizes in the migration stream, they must no longer change.
4266          * Abort and indicate a proper reason.
4267          */
4268         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4269         migration_cancel(err);
4270         error_free(err);
4271     }
4272
4273     switch (ps) {
4274     case POSTCOPY_INCOMING_ADVISE:
4275         /*
4276          * Update what ram_postcopy_incoming_init()->init_range() does at the
4277          * time postcopy was advised. Syncing RAM blocks with the source will
4278          * result in RAM resizes.
4279          */
4280         if (old_size < new_size) {
4281             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4282                 error_report("RAM block '%s' discard of resized RAM failed",
4283                              rb->idstr);
4284             }
4285         }
4286         rb->postcopy_length = new_size;
4287         break;
4288     case POSTCOPY_INCOMING_NONE:
4289     case POSTCOPY_INCOMING_RUNNING:
4290     case POSTCOPY_INCOMING_END:
4291         /*
4292          * Once our guest is running, postcopy does no longer care about
4293          * resizes. When growing, the new memory was not available on the
4294          * source, no handler needed.
4295          */
4296         break;
4297     default:
4298         error_report("RAM block '%s' resized during postcopy state: %d",
4299                      rb->idstr, ps);
4300         exit(-1);
4301     }
4302 }
4303
4304 static RAMBlockNotifier ram_mig_ram_notifier = {
4305     .ram_block_resized = ram_mig_ram_block_resized,
4306 };
4307
4308 void ram_mig_init(void)
4309 {
4310     qemu_mutex_init(&XBZRLE.lock);
4311     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4312     ram_block_notifier_add(&ram_mig_ram_notifier);
4313 }