migration: Split save_live_pending() into state_pending_*
[qemu/armbru.git] / migration / ram.c
blob 56ff9cd29d927f164792ce67be198238cac4f74f
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
67 /***********************************************************/
68 /* ram save/restore */
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71 * worked for pages that were filled with the same char. We switched
72 * it to only search for the zero value, and renamed it to avoid
73 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO 0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE 0x08
80 #define RAM_SAVE_FLAG_EOS 0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE 0x40
83 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
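/*
 * These flags are OR'ed into the page-aligned offset that save_page_header()
 * puts on the wire, so they occupy the low bits left free by the page
 * alignment (see e.g. RAM_SAVE_FLAG_ZERO in save_zero_page_to_file()).
 */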
86 XBZRLECacheStats xbzrle_counters;
88 /* used by the search for pages to send */
89 struct PageSearchStatus {
90 /* The migration channel used for a specific host page */
91 QEMUFile *pss_channel;
92 /* Last block from where we have sent data */
93 RAMBlock *last_sent_block;
94 /* Current block being searched */
95 RAMBlock *block;
96 /* Current page to search from */
97 unsigned long page;
98 /* Set once we wrap around */
99 bool complete_round;
100 /* Whether we're sending a host page */
101 bool host_page_sending;
102 /* The start/end of current host page. Invalid if host_page_sending==false */
103 unsigned long host_page_start;
104 unsigned long host_page_end;
106 typedef struct PageSearchStatus PageSearchStatus;
108 /* This struct contains the XBZRLE cache and a static page
109 used by the compression */
110 static struct {
111 /* buffer used for XBZRLE encoding */
112 uint8_t *encoded_buf;
113 /* buffer for storing page content */
114 uint8_t *current_buf;
115 /* Cache for XBZRLE, Protected by lock. */
116 PageCache *cache;
117 QemuMutex lock;
118 /* it will store a page full of zeros */
119 uint8_t *zero_target_page;
120 /* buffer used for XBZRLE decoding */
121 uint8_t *decoded_buf;
122 } XBZRLE;
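/*
 * Helpers to take/release the XBZRLE cache lock. They are no-ops when
 * XBZRLE is not enabled, so callers may use them unconditionally.
 */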
124 static void XBZRLE_cache_lock(void)
126 if (migrate_use_xbzrle()) {
127 qemu_mutex_lock(&XBZRLE.lock);
131 static void XBZRLE_cache_unlock(void)
133 if (migrate_use_xbzrle()) {
134 qemu_mutex_unlock(&XBZRLE.lock);
139 * xbzrle_cache_resize: resize the xbzrle cache
141 * This function is called from migrate_params_apply in main
142 * thread, possibly while a migration is in progress. A running
143 * migration may be using the cache and might finish during this call,
144 * hence changes to the cache are protected by the XBZRLE.lock mutex.
146 * Returns 0 for success or -1 for error
148 * @new_size: new cache size
149 * @errp: set *errp if the check failed, with reason
151 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
153 PageCache *new_cache;
154 int64_t ret = 0;
156 /* Check for truncation */
157 if (new_size != (size_t)new_size) {
158 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
159 "exceeding address space");
160 return -1;
163 if (new_size == migrate_xbzrle_cache_size()) {
164 /* nothing to do */
165 return 0;
168 XBZRLE_cache_lock();
170 if (XBZRLE.cache != NULL) {
171 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
172 if (!new_cache) {
173 ret = -1;
174 goto out;
177 cache_fini(XBZRLE.cache);
178 XBZRLE.cache = new_cache;
180 out:
181 XBZRLE_cache_unlock();
182 return ret;
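/*
 * postcopy_preempt_active: true when the postcopy-preempt capability is
 * enabled and we are already in the postcopy phase.
 */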
185 static bool postcopy_preempt_active(void)
187 return migrate_postcopy_preempt() && migration_in_postcopy();
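/*
 * ramblock_is_ignored: true for blocks that must be skipped by RAM
 * migration, either because the block is not migratable at all, or
 * because ignore-shared is enabled and the block is shared.
 */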
190 bool ramblock_is_ignored(RAMBlock *block)
192 return !qemu_ram_is_migratable(block) ||
193 (migrate_ignore_shared() && qemu_ram_is_shared(block));
196 #undef RAMBLOCK_FOREACH
198 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
200 RAMBlock *block;
201 int ret = 0;
203 RCU_READ_LOCK_GUARD();
205 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
206 ret = func(block, opaque);
207 if (ret) {
208 break;
211 return ret;
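/*
 * ramblock_recv_map_init: allocate the receivedmap bitmap of every
 * migratable RAMBlock; it is consulted by the ramblock_recv_bitmap_*
 * helpers below to track which target pages have been received.
 */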
214 static void ramblock_recv_map_init(void)
216 RAMBlock *rb;
218 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
219 assert(!rb->receivedmap);
220 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
224 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
226 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
227 rb->receivedmap);
230 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
232 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
235 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
237 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
240 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
241 size_t nr)
243 bitmap_set_atomic(rb->receivedmap,
244 ramblock_recv_bitmap_offset(host_addr, rb),
245 nr);
248 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
251 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
253 * Returns the number of bytes sent (>0) on success, or <0 on error.
255 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
256 const char *block_name)
258 RAMBlock *block = qemu_ram_block_by_name(block_name);
259 unsigned long *le_bitmap, nbits;
260 uint64_t size;
262 if (!block) {
263 error_report("%s: invalid block name: %s", __func__, block_name);
264 return -1;
267 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
270 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
271 * machines we may need 4 more bytes for padding (see below
272 * comment). So extend it a bit beforehand.
274 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
277 * Always use little endian when sending the bitmap. This is
278 * required when source and destination VMs are not using the
279 * same endianness. (Note: big endian won't work.)
281 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
283 /* Size of the bitmap, in bytes */
284 size = DIV_ROUND_UP(nbits, 8);
287 * size is always aligned to 8 bytes for 64bit machines, but it
288 * may not be true for 32bit machines. We need this padding to
289 * make sure the migration can survive even between 32bit and
290 * 64bit machines.
292 size = ROUND_UP(size, 8);
294 qemu_put_be64(file, size);
295 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
297 * Mark as an end, in case the middle part is screwed up due to
298 * some "mysterious" reason.
300 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
301 qemu_fflush(file);
303 g_free(le_bitmap);
305 if (qemu_file_get_error(file)) {
306 return qemu_file_get_error(file);
309 return size + sizeof(size);
313 * An outstanding page request, on the source, having been received
314 * and queued
316 struct RAMSrcPageRequest {
317 RAMBlock *rb;
318 hwaddr offset;
319 hwaddr len;
321 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
324 /* State of RAM for migration */
325 struct RAMState {
327 * PageSearchStatus structures for the channels when sending pages.
328 * Protected by the bitmap_mutex.
330 PageSearchStatus pss[RAM_CHANNEL_MAX];
331 /* UFFD file descriptor, used in 'write-tracking' migration */
332 int uffdio_fd;
333 /* Last block that we have visited searching for dirty pages */
334 RAMBlock *last_seen_block;
335 /* Last dirty target page we have sent */
336 ram_addr_t last_page;
337 /* last ram version we have seen */
338 uint32_t last_version;
339 /* How many times we have dirtied too many pages */
340 int dirty_rate_high_cnt;
341 /* these variables are used for bitmap sync */
342 /* last time we did a full bitmap_sync */
343 int64_t time_last_bitmap_sync;
344 /* bytes transferred at start_time */
345 uint64_t bytes_xfer_prev;
346 /* number of dirty pages since start_time */
347 uint64_t num_dirty_pages_period;
348 /* xbzrle misses since the beginning of the period */
349 uint64_t xbzrle_cache_miss_prev;
350 /* Amount of xbzrle pages since the beginning of the period */
351 uint64_t xbzrle_pages_prev;
352 /* Amount of xbzrle encoded bytes since the beginning of the period */
353 uint64_t xbzrle_bytes_prev;
354 /* Start using XBZRLE (e.g., after the first round). */
355 bool xbzrle_enabled;
356 /* Are we on the last stage of migration */
357 bool last_stage;
358 /* compression statistics since the beginning of the period */
359 /* number of times there was no free thread to compress data */
360 uint64_t compress_thread_busy_prev;
361 /* amount of bytes after compression */
362 uint64_t compressed_size_prev;
363 /* amount of compressed pages */
364 uint64_t compress_pages_prev;
366 /* total handled target pages at the beginning of period */
367 uint64_t target_page_count_prev;
368 /* total handled target pages since start */
369 uint64_t target_page_count;
370 /* number of dirty bits in the bitmap */
371 uint64_t migration_dirty_pages;
373 * Protects:
374 * - dirty/clear bitmap
375 * - migration_dirty_pages
376 * - pss structures
378 QemuMutex bitmap_mutex;
379 /* The RAMBlock used in the last src_page_requests */
380 RAMBlock *last_req_rb;
381 /* Queue of outstanding page requests from the destination */
382 QemuMutex src_page_req_mutex;
383 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
385 typedef struct RAMState RAMState;
387 static RAMState *ram_state;
389 static NotifierWithReturnList precopy_notifier_list;
391 /* Whether postcopy has queued requests? */
392 static bool postcopy_has_request(RAMState *rs)
394 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
397 void precopy_infrastructure_init(void)
399 notifier_with_return_list_init(&precopy_notifier_list);
402 void precopy_add_notifier(NotifierWithReturn *n)
404 notifier_with_return_list_add(&precopy_notifier_list, n);
407 void precopy_remove_notifier(NotifierWithReturn *n)
409 notifier_with_return_remove(n);
412 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
414 PrecopyNotifyData pnd;
415 pnd.reason = reason;
416 pnd.errp = errp;
418 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
421 uint64_t ram_bytes_remaining(void)
423 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
424 0;
428 * NOTE: not all stats in ram_counters are used in reality. See comments
429 * for struct MigrationAtomicStats. The ultimate result of ram migration
430 * counters will be a merged version with both ram_counters and the atomic
431 * fields in ram_atomic_counters.
433 MigrationStats ram_counters;
434 MigrationAtomicStats ram_atomic_counters;
436 void ram_transferred_add(uint64_t bytes)
438 if (runstate_is_running()) {
439 ram_counters.precopy_bytes += bytes;
440 } else if (migration_in_postcopy()) {
441 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
442 } else {
443 ram_counters.downtime_bytes += bytes;
445 stat64_add(&ram_atomic_counters.transferred, bytes);
448 void dirty_sync_missed_zero_copy(void)
450 ram_counters.dirty_sync_missed_zero_copy++;
453 CompressionStats compression_counters;
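/*
 * Per-thread state of one compression worker: the page to compress
 * (block + offset), synchronization flags, and a private zlib stream
 * plus bounce buffer.
 */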
455 struct CompressParam {
456 bool done;
457 bool quit;
458 bool zero_page;
459 QEMUFile *file;
460 QemuMutex mutex;
461 QemuCond cond;
462 RAMBlock *block;
463 ram_addr_t offset;
465 /* internally used fields */
466 z_stream stream;
467 uint8_t *originbuf;
469 typedef struct CompressParam CompressParam;
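/*
 * Per-thread state of one decompression worker on the destination:
 * the destination host pointer, the compressed input buffer and a
 * private zlib stream.
 */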
471 struct DecompressParam {
472 bool done;
473 bool quit;
474 QemuMutex mutex;
475 QemuCond cond;
476 void *des;
477 uint8_t *compbuf;
478 int len;
479 z_stream stream;
481 typedef struct DecompressParam DecompressParam;
483 static CompressParam *comp_param;
484 static QemuThread *compress_threads;
485 /* comp_done_cond is used to wake up the migration thread when
486 * one of the compression threads has finished the compression.
487 * comp_done_lock is used together with comp_done_cond.
489 static QemuMutex comp_done_lock;
490 static QemuCond comp_done_cond;
492 static QEMUFile *decomp_file;
493 static DecompressParam *decomp_param;
494 static QemuThread *decompress_threads;
495 static QemuMutex decomp_done_lock;
496 static QemuCond decomp_done_cond;
498 static int ram_save_host_page_urgent(PageSearchStatus *pss);
500 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
501 ram_addr_t offset, uint8_t *source_buf);
503 /* NOTE: page is the PFN not real ram_addr_t. */
504 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
506 pss->block = rb;
507 pss->page = page;
508 pss->complete_round = false;
512 * Check whether two PSSs are actively sending the same page. Return true
513 * if it is, false otherwise.
515 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
517 return pss1->host_page_sending && pss2->host_page_sending &&
518 (pss1->host_page_start == pss2->host_page_start);
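/*
 * do_data_compress: body of a compression worker thread. It waits for a
 * page to be posted in its CompressParam, compresses it into the
 * per-thread QEMUFile and signals comp_done_cond when done.
 */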
521 static void *do_data_compress(void *opaque)
523 CompressParam *param = opaque;
524 RAMBlock *block;
525 ram_addr_t offset;
526 bool zero_page;
528 qemu_mutex_lock(&param->mutex);
529 while (!param->quit) {
530 if (param->block) {
531 block = param->block;
532 offset = param->offset;
533 param->block = NULL;
534 qemu_mutex_unlock(&param->mutex);
536 zero_page = do_compress_ram_page(param->file, &param->stream,
537 block, offset, param->originbuf);
539 qemu_mutex_lock(&comp_done_lock);
540 param->done = true;
541 param->zero_page = zero_page;
542 qemu_cond_signal(&comp_done_cond);
543 qemu_mutex_unlock(&comp_done_lock);
545 qemu_mutex_lock(&param->mutex);
546 } else {
547 qemu_cond_wait(&param->cond, &param->mutex);
550 qemu_mutex_unlock(&param->mutex);
552 return NULL;
555 static void compress_threads_save_cleanup(void)
557 int i, thread_count;
559 if (!migrate_use_compression() || !comp_param) {
560 return;
563 thread_count = migrate_compress_threads();
564 for (i = 0; i < thread_count; i++) {
566 * we use it as an indicator of whether the thread is
567 * properly initialized or not
569 if (!comp_param[i].file) {
570 break;
573 qemu_mutex_lock(&comp_param[i].mutex);
574 comp_param[i].quit = true;
575 qemu_cond_signal(&comp_param[i].cond);
576 qemu_mutex_unlock(&comp_param[i].mutex);
578 qemu_thread_join(compress_threads + i);
579 qemu_mutex_destroy(&comp_param[i].mutex);
580 qemu_cond_destroy(&comp_param[i].cond);
581 deflateEnd(&comp_param[i].stream);
582 g_free(comp_param[i].originbuf);
583 qemu_fclose(comp_param[i].file);
584 comp_param[i].file = NULL;
586 qemu_mutex_destroy(&comp_done_lock);
587 qemu_cond_destroy(&comp_done_cond);
588 g_free(compress_threads);
589 g_free(comp_param);
590 compress_threads = NULL;
591 comp_param = NULL;
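/*
 * compress_threads_save_setup: allocate and start the compression worker
 * threads. Returns 0 on success, -1 on error; partially created state is
 * torn down via compress_threads_save_cleanup().
 */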
594 static int compress_threads_save_setup(void)
596 int i, thread_count;
598 if (!migrate_use_compression()) {
599 return 0;
601 thread_count = migrate_compress_threads();
602 compress_threads = g_new0(QemuThread, thread_count);
603 comp_param = g_new0(CompressParam, thread_count);
604 qemu_cond_init(&comp_done_cond);
605 qemu_mutex_init(&comp_done_lock);
606 for (i = 0; i < thread_count; i++) {
607 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
608 if (!comp_param[i].originbuf) {
609 goto exit;
612 if (deflateInit(&comp_param[i].stream,
613 migrate_compress_level()) != Z_OK) {
614 g_free(comp_param[i].originbuf);
615 goto exit;
618 /* comp_param[i].file is just used as a dummy buffer to save data,
619 * set its ops to empty.
621 comp_param[i].file = qemu_file_new_output(
622 QIO_CHANNEL(qio_channel_null_new()));
623 comp_param[i].done = true;
624 comp_param[i].quit = false;
625 qemu_mutex_init(&comp_param[i].mutex);
626 qemu_cond_init(&comp_param[i].cond);
627 qemu_thread_create(compress_threads + i, "compress",
628 do_data_compress, comp_param + i,
629 QEMU_THREAD_JOINABLE);
631 return 0;
633 exit:
634 compress_threads_save_cleanup();
635 return -1;
639 * save_page_header: write page header to wire
641 * If this is the 1st block, it also writes the block identification
643 * Returns the number of bytes written
645 * @pss: current PSS channel status
646 * @block: block that contains the page we want to send
647 * @offset: offset inside the block for the page
648 * in the lower bits, it contains flags
650 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
651 ram_addr_t offset)
653 size_t size, len;
654 bool same_block = (block == pss->last_sent_block);
655 QEMUFile *f = pss->pss_channel;
657 if (same_block) {
658 offset |= RAM_SAVE_FLAG_CONTINUE;
660 qemu_put_be64(f, offset);
661 size = 8;
663 if (!same_block) {
664 len = strlen(block->idstr);
665 qemu_put_byte(f, len);
666 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
667 size += 1 + len;
668 pss->last_sent_block = block;
670 return size;
674 * mig_throttle_guest_down: throttle down the guest
676 * Reduce amount of guest cpu execution to hopefully slow down memory
677 * writes. If guest dirty memory rate is reduced below the rate at
678 * which we can transfer pages to the destination then we should be
679 * able to complete migration. Some workloads dirty memory way too
680 * fast and will not effectively converge, even with auto-converge.
682 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
683 uint64_t bytes_dirty_threshold)
685 MigrationState *s = migrate_get_current();
686 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
687 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
688 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
689 int pct_max = s->parameters.max_cpu_throttle;
691 uint64_t throttle_now = cpu_throttle_get_percentage();
692 uint64_t cpu_now, cpu_ideal, throttle_inc;
694 /* We have not started throttling yet. Let's start it. */
695 if (!cpu_throttle_active()) {
696 cpu_throttle_set(pct_initial);
697 } else {
698 /* Throttling already on, just increase the rate */
699 if (!pct_tailslow) {
700 throttle_inc = pct_increment;
701 } else {
702 /* Compute the ideal CPU percentage used by Guest, which may
703 * make the dirty rate match the dirty rate threshold. */
704 cpu_now = 100 - throttle_now;
705 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
706 bytes_dirty_period);
707 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
709 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
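/*
 * mig_throttle_counter_reset: start a new auto-converge accounting period
 * (sync timestamp, dirty page count and bytes transferred so far).
 */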
713 void mig_throttle_counter_reset(void)
715 RAMState *rs = ram_state;
717 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
718 rs->num_dirty_pages_period = 0;
719 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
723 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
725 * @rs: current RAM state
726 * @current_addr: address for the zero page
728 * Update the xbzrle cache to reflect a page that's been sent as all 0.
729 * The important thing is that a stale (not-yet-0'd) page be replaced
730 * by the new data.
731 * As a bonus, if the page wasn't in the cache it gets added so that
732 * when a small write is made into the 0'd page it gets XBZRLE sent.
734 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
736 /* We don't care if this fails to allocate a new cache page
737 * as long as it updates an old one */
738 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
739 ram_counters.dirty_sync_count);
742 #define ENCODING_FLAG_XBZRLE 0x1
745 * save_xbzrle_page: compress and send current page
747 * Returns: 1 means that we wrote the page
748 * 0 means that page is identical to the one already sent
749 * -1 means that xbzrle would be longer than normal
751 * @rs: current RAM state
752 * @pss: current PSS channel
753 * @current_data: pointer to the address of the page contents
754 * @current_addr: addr of the page
755 * @block: block that contains the page we want to send
756 * @offset: offset inside the block for the page
758 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
759 uint8_t **current_data, ram_addr_t current_addr,
760 RAMBlock *block, ram_addr_t offset)
762 int encoded_len = 0, bytes_xbzrle;
763 uint8_t *prev_cached_page;
764 QEMUFile *file = pss->pss_channel;
766 if (!cache_is_cached(XBZRLE.cache, current_addr,
767 ram_counters.dirty_sync_count)) {
768 xbzrle_counters.cache_miss++;
769 if (!rs->last_stage) {
770 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
771 ram_counters.dirty_sync_count) == -1) {
772 return -1;
773 } else {
774 /* update *current_data when the page has been
775 inserted into cache */
776 *current_data = get_cached_data(XBZRLE.cache, current_addr);
779 return -1;
783 * Reaching here means the page has hit the xbzrle cache, no matter what
784 * encoding result it is (normal encoding, overflow or skipping the page),
785 * count the page as encoded. This is used to calculate the encoding rate.
787 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
788 * 2nd page turns out to be skipped (i.e. no new bytes written to the
789 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
790 * skipped page included. In this way, the encoding rate can tell if the
791 * guest page is good for xbzrle encoding.
793 xbzrle_counters.pages++;
794 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
796 /* save current buffer into memory */
797 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
799 /* XBZRLE encoding (if there is no overflow) */
800 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
801 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
802 TARGET_PAGE_SIZE);
805 * Update the cache contents, so that it corresponds to the data
806 * sent, in all cases except where we skip the page.
808 if (!rs->last_stage && encoded_len != 0) {
809 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
811 * In the case where we couldn't compress, ensure that the caller
812 * sends the data from the cache, since the guest might have
813 * changed the RAM since we copied it.
815 *current_data = prev_cached_page;
818 if (encoded_len == 0) {
819 trace_save_xbzrle_page_skipping();
820 return 0;
821 } else if (encoded_len == -1) {
822 trace_save_xbzrle_page_overflow();
823 xbzrle_counters.overflow++;
824 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
825 return -1;
828 /* Send XBZRLE based compressed page */
829 bytes_xbzrle = save_page_header(pss, block,
830 offset | RAM_SAVE_FLAG_XBZRLE);
831 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
832 qemu_put_be16(file, encoded_len);
833 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
834 bytes_xbzrle += encoded_len + 1 + 2;
836 * Like compressed_size (please see update_compress_thread_counts),
837 * the xbzrle encoded bytes don't count the 8 byte header with
838 * RAM_SAVE_FLAG_CONTINUE.
840 xbzrle_counters.bytes += bytes_xbzrle - 8;
841 ram_transferred_add(bytes_xbzrle);
843 return 1;
847 * pss_find_next_dirty: find the next dirty page of current ramblock
849 * This function updates pss->page to point to the next dirty page index
850 * within the ramblock to migrate, or the end of ramblock when nothing
851 * is found. Note that when pss->host_page_sending==true it means we're
852 * in the middle of sending a host page, so we won't look for dirty pages
853 * outside the host page boundary.
855 * @pss: the current page search status
857 static void pss_find_next_dirty(PageSearchStatus *pss)
859 RAMBlock *rb = pss->block;
860 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
861 unsigned long *bitmap = rb->bmap;
863 if (ramblock_is_ignored(rb)) {
864 /* Points directly to the end, so we know no dirty page */
865 pss->page = size;
866 return;
870 * If during sending a host page, only look for dirty pages within the
871 * current host page being sent.
873 if (pss->host_page_sending) {
874 assert(pss->host_page_end);
875 size = MIN(size, pss->host_page_end);
878 pss->page = find_next_bit(bitmap, size, pss->page);
881 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
882 unsigned long page)
884 uint8_t shift;
885 hwaddr size, start;
887 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
888 return;
891 shift = rb->clear_bmap_shift;
893 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
894 * can make things easier sometimes since then the start address
895 * of the small chunk will always be aligned to 64 pages, so the
896 * bitmap will always be aligned to unsigned long. We should
897 * even be able to remove this restriction but I'm simply
898 * keeping it.
900 assert(shift >= 6);
902 size = 1ULL << (TARGET_PAGE_BITS + shift);
903 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
904 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
905 memory_region_clear_dirty_bitmap(rb->mr, start, size);
908 static void
909 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
910 unsigned long start,
911 unsigned long npages)
913 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
914 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
915 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
918 * Clear pages from start to start + npages - 1, so the end boundary is
919 * exclusive.
921 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
922 migration_clear_memory_region_dirty_bitmap(rb, i);
927 * colo_bitmap_find_dirty: find contiguous dirty pages from start
929 * Returns the page offset within the memory region of the start of the
930 * contiguous dirty pages
932 * @rs: current RAM state
933 * @rb: RAMBlock where to search for dirty pages
934 * @start: page where we start the search
935 * @num: the number of contiguous dirty pages
937 static inline
938 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
939 unsigned long start, unsigned long *num)
941 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
942 unsigned long *bitmap = rb->bmap;
943 unsigned long first, next;
945 *num = 0;
947 if (ramblock_is_ignored(rb)) {
948 return size;
951 first = find_next_bit(bitmap, size, start);
952 if (first >= size) {
953 return first;
955 next = find_next_zero_bit(bitmap, size, first + 1);
956 assert(next >= first);
957 *num = next - first;
958 return first;
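/*
 * migration_bitmap_clear_dirty: clear the dirty bit of one page in the
 * RAMBlock bitmap, clearing the backing memory-region dirty bitmap chunk
 * first if needed. Returns whether the page was dirty before clearing.
 */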
961 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
962 RAMBlock *rb,
963 unsigned long page)
965 bool ret;
968 * Clear dirty bitmap if needed. This _must_ be called before we
969 * send any of the pages in the chunk, because we need to make sure
970 * we can capture further page content changes when we sync the dirty
971 * log the next time. So as long as we are going to send any of
972 * the pages in the chunk we clear the remote dirty bitmap for all.
973 * Clearing it earlier won't be a problem, but clearing it too late will.
975 migration_clear_memory_region_dirty_bitmap(rb, page);
977 ret = test_and_clear_bit(page, rb->bmap);
978 if (ret) {
979 rs->migration_dirty_pages--;
982 return ret;
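/*
 * dirty_bitmap_clear_section: RamDiscardManager replay callback that drops
 * the dirty bits of one discarded section and accumulates the number of
 * cleared bits into *opaque.
 */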
985 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
986 void *opaque)
988 const hwaddr offset = section->offset_within_region;
989 const hwaddr size = int128_get64(section->size);
990 const unsigned long start = offset >> TARGET_PAGE_BITS;
991 const unsigned long npages = size >> TARGET_PAGE_BITS;
992 RAMBlock *rb = section->mr->ram_block;
993 uint64_t *cleared_bits = opaque;
996 * We don't grab ram_state->bitmap_mutex because we expect to run
997 * only when starting migration or during postcopy recovery where
998 * we don't have concurrent access.
1000 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1001 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1003 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1004 bitmap_clear(rb->bmap, start, npages);
1008 * Exclude all dirty pages from migration that fall into a discarded range as
1009 * managed by a RamDiscardManager responsible for the mapped memory region of
1010 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1012 * Discarded pages ("logically unplugged") have undefined content and must
1013 * not get migrated, because even reading these pages for migration might
1014 * result in undesired behavior.
1016 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1018 * Note: The result is only stable while migrating (precopy/postcopy).
1020 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1022 uint64_t cleared_bits = 0;
1024 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1025 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1026 MemoryRegionSection section = {
1027 .mr = rb->mr,
1028 .offset_within_region = 0,
1029 .size = int128_make64(qemu_ram_get_used_length(rb)),
1032 ram_discard_manager_replay_discarded(rdm, &section,
1033 dirty_bitmap_clear_section,
1034 &cleared_bits);
1036 return cleared_bits;
1040 * Check if a host-page aligned page falls into a discarded range as managed by
1041 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1043 * Note: The result is only stable while migrating (precopy/postcopy).
1045 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1047 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1048 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1049 MemoryRegionSection section = {
1050 .mr = rb->mr,
1051 .offset_within_region = start,
1052 .size = int128_make64(qemu_ram_pagesize(rb)),
1055 return !ram_discard_manager_is_populated(rdm, &section);
1057 return false;
1060 /* Called with RCU critical section */
1061 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1063 uint64_t new_dirty_pages =
1064 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1066 rs->migration_dirty_pages += new_dirty_pages;
1067 rs->num_dirty_pages_period += new_dirty_pages;
1071 * ram_pagesize_summary: calculate all the pagesizes of a VM
1073 * Returns a summary bitmap of the page sizes of all RAMBlocks
1075 * For VMs with just normal pages this is equivalent to the host page
1076 * size. If it has some huge pages then it's the OR of all the
1077 * different page sizes.
1079 uint64_t ram_pagesize_summary(void)
1081 RAMBlock *block;
1082 uint64_t summary = 0;
1084 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1085 summary |= block->page_size;
1088 return summary;
1091 uint64_t ram_get_total_transferred_pages(void)
1093 return stat64_get(&ram_atomic_counters.normal) +
1094 stat64_get(&ram_atomic_counters.duplicate) +
1095 compression_counters.pages + xbzrle_counters.pages;
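/*
 * migration_update_rates: recompute the per-period statistics (dirty page
 * rate, xbzrle cache-miss and encoding rates, compression rates) at the
 * end of a bitmap sync period.
 */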
1098 static void migration_update_rates(RAMState *rs, int64_t end_time)
1100 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1101 double compressed_size;
1103 /* calculate period counters */
1104 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1105 / (end_time - rs->time_last_bitmap_sync);
1107 if (!page_count) {
1108 return;
1111 if (migrate_use_xbzrle()) {
1112 double encoded_size, unencoded_size;
1114 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1115 rs->xbzrle_cache_miss_prev) / page_count;
1116 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1117 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1118 TARGET_PAGE_SIZE;
1119 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1120 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1121 xbzrle_counters.encoding_rate = 0;
1122 } else {
1123 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1125 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1126 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1129 if (migrate_use_compression()) {
1130 compression_counters.busy_rate = (double)(compression_counters.busy -
1131 rs->compress_thread_busy_prev) / page_count;
1132 rs->compress_thread_busy_prev = compression_counters.busy;
1134 compressed_size = compression_counters.compressed_size -
1135 rs->compressed_size_prev;
1136 if (compressed_size) {
1137 double uncompressed_size = (compression_counters.pages -
1138 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1140 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1141 compression_counters.compression_rate =
1142 uncompressed_size / compressed_size;
1144 rs->compress_pages_prev = compression_counters.pages;
1145 rs->compressed_size_prev = compression_counters.compressed_size;
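/*
 * migration_trigger_throttle: start or increase CPU throttling when the
 * guest dirties memory faster than we transfer it (auto-converge).
 */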
1150 static void migration_trigger_throttle(RAMState *rs)
1152 MigrationState *s = migrate_get_current();
1153 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1154 uint64_t bytes_xfer_period =
1155 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1156 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1157 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1159 /* During block migration the auto-converge logic incorrectly detects
1160 * that ram migration makes no progress. Avoid this by disabling the
1161 * throttling logic during the bulk phase of block migration. */
1162 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1163 /* The following detection logic can be refined later. For now:
1164 Check to see if the ratio between dirtied bytes and the approx.
1165 amount of bytes that just got transferred since the last time
1166 we were in this routine reaches the threshold. If that happens
1167 twice, start or increase throttling. */
1169 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1170 (++rs->dirty_rate_high_cnt >= 2)) {
1171 trace_migration_throttle();
1172 rs->dirty_rate_high_cnt = 0;
1173 mig_throttle_guest_down(bytes_dirty_period,
1174 bytes_dirty_threshold);
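/*
 * migration_bitmap_sync: synchronize the dirty log from the memory core
 * into the per-RAMBlock migration bitmaps and update the period counters.
 */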
1179 static void migration_bitmap_sync(RAMState *rs)
1181 RAMBlock *block;
1182 int64_t end_time;
1184 ram_counters.dirty_sync_count++;
1186 if (!rs->time_last_bitmap_sync) {
1187 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1190 trace_migration_bitmap_sync_start();
1191 memory_global_dirty_log_sync();
1193 qemu_mutex_lock(&rs->bitmap_mutex);
1194 WITH_RCU_READ_LOCK_GUARD() {
1195 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1196 ramblock_sync_dirty_bitmap(rs, block);
1198 ram_counters.remaining = ram_bytes_remaining();
1200 qemu_mutex_unlock(&rs->bitmap_mutex);
1202 memory_global_after_dirty_log_sync();
1203 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1205 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1207 /* more than 1 second = 1000 milliseconds */
1208 if (end_time > rs->time_last_bitmap_sync + 1000) {
1209 migration_trigger_throttle(rs);
1211 migration_update_rates(rs, end_time);
1213 rs->target_page_count_prev = rs->target_page_count;
1215 /* reset period counters */
1216 rs->time_last_bitmap_sync = end_time;
1217 rs->num_dirty_pages_period = 0;
1218 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
1220 if (migrate_use_events()) {
1221 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1225 static void migration_bitmap_sync_precopy(RAMState *rs)
1227 Error *local_err = NULL;
1230 * The current notifier usage is just an optimization for migration, so we
1231 * don't stop the normal migration process in the error case.
1233 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1234 error_report_err(local_err);
1235 local_err = NULL;
1238 migration_bitmap_sync(rs);
1240 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1241 error_report_err(local_err);
1245 void ram_release_page(const char *rbname, uint64_t offset)
1247 if (!migrate_release_ram() || !migration_in_postcopy()) {
1248 return;
1251 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1255 * save_zero_page_to_file: send the zero page to the file
1257 * Returns the size of the data written to the file; 0 means the page is
1258 * not a zero page
1260 * @pss: current PSS channel
1261 * @block: block that contains the page we want to send
1262 * @offset: offset inside the block for the page
1264 static int save_zero_page_to_file(PageSearchStatus *pss,
1265 RAMBlock *block, ram_addr_t offset)
1267 uint8_t *p = block->host + offset;
1268 QEMUFile *file = pss->pss_channel;
1269 int len = 0;
1271 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1272 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
1273 qemu_put_byte(file, 0);
1274 len += 1;
1275 ram_release_page(block->idstr, offset);
1277 return len;
1281 * save_zero_page: send the zero page to the stream
1283 * Returns the number of pages written.
1285 * @pss: current PSS channel
1286 * @block: block that contains the page we want to send
1287 * @offset: offset inside the block for the page
1289 static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
1290 ram_addr_t offset)
1292 int len = save_zero_page_to_file(pss, block, offset);
1294 if (len) {
1295 stat64_add(&ram_atomic_counters.duplicate, 1);
1296 ram_transferred_add(len);
1297 return 1;
1299 return -1;
1303 * @pages: the number of pages written by the control path,
1304 * < 0 - error
1305 * > 0 - number of pages written
1307 * Return true if the page has been saved, otherwise return false.
1309 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1310 ram_addr_t offset, int *pages)
1312 uint64_t bytes_xmit = 0;
1313 int ret;
1315 *pages = -1;
1316 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1317 TARGET_PAGE_SIZE, &bytes_xmit);
1318 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1319 return false;
1322 if (bytes_xmit) {
1323 ram_transferred_add(bytes_xmit);
1324 *pages = 1;
1327 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1328 return true;
1331 if (bytes_xmit > 0) {
1332 stat64_add(&ram_atomic_counters.normal, 1);
1333 } else if (bytes_xmit == 0) {
1334 stat64_add(&ram_atomic_counters.duplicate, 1);
1337 return true;
1341 * directly send the page to the stream
1343 * Returns the number of pages written.
1345 * @pss: current PSS channel
1346 * @block: block that contains the page we want to send
1347 * @offset: offset inside the block for the page
1348 * @buf: the page to be sent
1349 * @async: send the page asynchronously
1351 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1352 ram_addr_t offset, uint8_t *buf, bool async)
1354 QEMUFile *file = pss->pss_channel;
1356 ram_transferred_add(save_page_header(pss, block,
1357 offset | RAM_SAVE_FLAG_PAGE));
1358 if (async) {
1359 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1360 migrate_release_ram() &&
1361 migration_in_postcopy());
1362 } else {
1363 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1365 ram_transferred_add(TARGET_PAGE_SIZE);
1366 stat64_add(&ram_atomic_counters.normal, 1);
1367 return 1;
1371 * ram_save_page: send the given page to the stream
1373 * Returns the number of pages written.
1374 * < 0 - error
1375 * >=0 - Number of pages written - this might legally be 0
1376 * if xbzrle noticed the page was the same.
1378 * @rs: current RAM state
1379 * @block: block that contains the page we want to send
1380 * @offset: offset inside the block for the page
1382 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1384 int pages = -1;
1385 uint8_t *p;
1386 bool send_async = true;
1387 RAMBlock *block = pss->block;
1388 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1389 ram_addr_t current_addr = block->offset + offset;
1391 p = block->host + offset;
1392 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1394 XBZRLE_cache_lock();
1395 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1396 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1397 block, offset);
1398 if (!rs->last_stage) {
1399 /* Can't send this cached data async, since the cache page
1400 * might get updated before it gets to the wire
1402 send_async = false;
1406 /* XBZRLE overflow or normal page */
1407 if (pages == -1) {
1408 pages = save_normal_page(pss, block, offset, p, send_async);
1411 XBZRLE_cache_unlock();
1413 return pages;
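/*
 * ram_save_multifd_page: queue the page on a multifd channel instead of
 * sending it through the main migration stream. Returns 1 on success,
 * -1 on error.
 */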
1416 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1417 ram_addr_t offset)
1419 if (multifd_queue_page(file, block, offset) < 0) {
1420 return -1;
1422 stat64_add(&ram_atomic_counters.normal, 1);
1424 return 1;
1427 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1428 ram_addr_t offset, uint8_t *source_buf)
1430 RAMState *rs = ram_state;
1431 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1432 uint8_t *p = block->host + offset;
1433 int ret;
1435 if (save_zero_page_to_file(pss, block, offset)) {
1436 return true;
1439 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1442 * copy it to an internal buffer to avoid it being modified by the VM,
1443 * so that we can catch errors during compression and
1444 * decompression
1446 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1447 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1448 if (ret < 0) {
1449 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1450 error_report("compressed data failed!");
1452 return false;
1455 static void
1456 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1458 ram_transferred_add(bytes_xmit);
1460 if (param->zero_page) {
1461 stat64_add(&ram_atomic_counters.duplicate, 1);
1462 return;
1465 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1466 compression_counters.compressed_size += bytes_xmit - 8;
1467 compression_counters.pages++;
1470 static bool save_page_use_compression(RAMState *rs);
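/*
 * flush_compressed_data: wait for every compression thread to finish its
 * current page and push any buffered compressed data to the migration
 * stream.
 */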
1472 static void flush_compressed_data(RAMState *rs)
1474 MigrationState *ms = migrate_get_current();
1475 int idx, len, thread_count;
1477 if (!save_page_use_compression(rs)) {
1478 return;
1480 thread_count = migrate_compress_threads();
1482 qemu_mutex_lock(&comp_done_lock);
1483 for (idx = 0; idx < thread_count; idx++) {
1484 while (!comp_param[idx].done) {
1485 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1488 qemu_mutex_unlock(&comp_done_lock);
1490 for (idx = 0; idx < thread_count; idx++) {
1491 qemu_mutex_lock(&comp_param[idx].mutex);
1492 if (!comp_param[idx].quit) {
1493 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1495 * it's safe to fetch zero_page without holding comp_done_lock
1496 * as there is no further request submitted to the thread,
1497 * i.e., the thread should be waiting for a request at this point.
1499 update_compress_thread_counts(&comp_param[idx], len);
1501 qemu_mutex_unlock(&comp_param[idx].mutex);
1505 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1506 ram_addr_t offset)
1508 param->block = block;
1509 param->offset = offset;
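/*
 * compress_page_with_multi_thread: hand the page to an idle compression
 * thread, or wait for one when 'compress-wait-thread' is set. Returns 1
 * if the page was queued, -1 if the caller should send it as a normal page.
 */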
1512 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1514 int idx, thread_count, bytes_xmit = -1, pages = -1;
1515 bool wait = migrate_compress_wait_thread();
1516 MigrationState *ms = migrate_get_current();
1518 thread_count = migrate_compress_threads();
1519 qemu_mutex_lock(&comp_done_lock);
1520 retry:
1521 for (idx = 0; idx < thread_count; idx++) {
1522 if (comp_param[idx].done) {
1523 comp_param[idx].done = false;
1524 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1525 comp_param[idx].file);
1526 qemu_mutex_lock(&comp_param[idx].mutex);
1527 set_compress_params(&comp_param[idx], block, offset);
1528 qemu_cond_signal(&comp_param[idx].cond);
1529 qemu_mutex_unlock(&comp_param[idx].mutex);
1530 pages = 1;
1531 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1532 break;
1537 * wait for a free thread if the user specifies 'compress-wait-thread',
1538 * otherwise we will post the page out in the main thread as a normal page.
1540 if (pages < 0 && wait) {
1541 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542 goto retry;
1544 qemu_mutex_unlock(&comp_done_lock);
1546 return pages;
1550 * find_dirty_block: find the next dirty page and update any state
1551 * associated with the search process.
1553 * Returns true if a page is found
1555 * @rs: current RAM state
1556 * @pss: data about the state of the current dirty page scan
1557 * @again: set to false if the search has scanned the whole of RAM
1559 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1561 /* Update pss->page for the next dirty bit in ramblock */
1562 pss_find_next_dirty(pss);
1564 if (pss->complete_round && pss->block == rs->last_seen_block &&
1565 pss->page >= rs->last_page) {
1567 * We've been once around the RAM and haven't found anything.
1568 * Give up.
1570 *again = false;
1571 return false;
1573 if (!offset_in_ramblock(pss->block,
1574 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1575 /* Didn't find anything in this RAM Block */
1576 pss->page = 0;
1577 pss->block = QLIST_NEXT_RCU(pss->block, next);
1578 if (!pss->block) {
1580 * If memory migration starts over, we will meet a dirtied page
1581 * which may still exist in the compression threads' ring, so we
1582 * should flush the compressed data to make sure the new page
1583 * is not overwritten by the old one in the destination.
1585 * Also, if xbzrle is on, stop using data compression at this
1586 * point. In theory, xbzrle can do better than compression.
1588 flush_compressed_data(rs);
1590 /* Hit the end of the list */
1591 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1592 /* Flag that we've looped */
1593 pss->complete_round = true;
1594 /* After the first round, enable XBZRLE. */
1595 if (migrate_use_xbzrle()) {
1596 rs->xbzrle_enabled = true;
1599 /* Didn't find anything this time, but try again on the new block */
1600 *again = true;
1601 return false;
1602 } else {
1603 /* Can go around again, but... */
1604 *again = true;
1605 /* We've found something so probably don't need to */
1606 return true;
1611 * unqueue_page: gets a page off the queue
1613 * Helper for 'get_queued_page' - gets a page off the queue
1615 * Returns the block of the page (or NULL if none available)
1617 * @rs: current RAM state
1618 * @offset: used to return the offset within the RAMBlock
1620 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1622 struct RAMSrcPageRequest *entry;
1623 RAMBlock *block = NULL;
1625 if (!postcopy_has_request(rs)) {
1626 return NULL;
1629 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1632 * This should _never_ change even after we take the lock, because no one
1633 * should be taking anything off the request list other than us.
1635 assert(postcopy_has_request(rs));
1637 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1638 block = entry->rb;
1639 *offset = entry->offset;
1641 if (entry->len > TARGET_PAGE_SIZE) {
1642 entry->len -= TARGET_PAGE_SIZE;
1643 entry->offset += TARGET_PAGE_SIZE;
1644 } else {
1645 memory_region_unref(block->mr);
1646 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1647 g_free(entry);
1648 migration_consume_urgent_request();
1651 return block;
1654 #if defined(__linux__)
1656 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1657 * is found, return RAM block pointer and page offset
1659 * Returns pointer to the RAMBlock containing faulting page,
1660 * NULL if no write faults are pending
1662 * @rs: current RAM state
1663 * @offset: page offset from the beginning of the block
1665 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1667 struct uffd_msg uffd_msg;
1668 void *page_address;
1669 RAMBlock *block;
1670 int res;
1672 if (!migrate_background_snapshot()) {
1673 return NULL;
1676 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1677 if (res <= 0) {
1678 return NULL;
1681 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1682 block = qemu_ram_block_from_host(page_address, false, offset);
1683 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1684 return block;
1688 * ram_save_release_protection: release UFFD write protection after
1689 * a range of pages has been saved
1691 * @rs: current RAM state
1692 * @pss: page-search-status structure
1693 * @start_page: index of the first page in the range relative to pss->block
1695 * Returns 0 on success, negative value in case of an error
1697 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1698 unsigned long start_page)
1700 int res = 0;
1702 /* Check if page is from UFFD-managed region. */
1703 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1704 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1705 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1707 /* Flush async buffers before un-protect. */
1708 qemu_fflush(pss->pss_channel);
1709 /* Un-protect memory range. */
1710 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1711 false, false);
1714 return res;
1717 /* ram_write_tracking_available: check if kernel supports required UFFD features
1719 * Returns true if it does, false otherwise
1721 bool ram_write_tracking_available(void)
1723 uint64_t uffd_features;
1724 int res;
1726 res = uffd_query_features(&uffd_features);
1727 return (res == 0 &&
1728 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1731 /* ram_write_tracking_compatible: check if guest configuration is
1732 * compatible with 'write-tracking'
1734 * Returns true if compatible, false otherwise
1736 bool ram_write_tracking_compatible(void)
1738 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1739 int uffd_fd;
1740 RAMBlock *block;
1741 bool ret = false;
1743 /* Open UFFD file descriptor */
1744 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1745 if (uffd_fd < 0) {
1746 return false;
1749 RCU_READ_LOCK_GUARD();
1751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1752 uint64_t uffd_ioctls;
1754 /* Nothing to do with read-only and MMIO-writable regions */
1755 if (block->mr->readonly || block->mr->rom_device) {
1756 continue;
1758 /* Try to register block memory via UFFD-IO to track writes */
1759 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1760 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1761 goto out;
1763 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1764 goto out;
1767 ret = true;
1769 out:
1770 uffd_close_fd(uffd_fd);
1771 return ret;
1774 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1775 ram_addr_t size)
1778 * We read one byte of each page; this will preallocate page tables if
1779 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1780 * where no page was populated yet. This might require adaptation when
1781 * supporting other mappings, like shmem.
1783 for (; offset < size; offset += block->page_size) {
1784 char tmp = *((char *)block->host + offset);
1786 /* Don't optimize the read out */
1787 asm volatile("" : "+r" (tmp));
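/*
 * populate_read_section: RamDiscardManager replay callback that populates
 * one populated section of a RAMBlock via populate_read_range().
 */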
1791 static inline int populate_read_section(MemoryRegionSection *section,
1792 void *opaque)
1794 const hwaddr size = int128_get64(section->size);
1795 hwaddr offset = section->offset_within_region;
1796 RAMBlock *block = section->mr->ram_block;
1798 populate_read_range(block, offset, size);
1799 return 0;
1803 * ram_block_populate_read: preallocate page tables and populate pages in the
1804 * RAM block by reading a byte of each page.
1806 * Since it's solely used for the userfault_fd WP feature, here we just
1807 * hardcode the page size to qemu_real_host_page_size.
1809 * @block: RAM block to populate
1811 static void ram_block_populate_read(RAMBlock *rb)
1814 * Skip populating all pages that fall into a discarded range as managed by
1815 * a RamDiscardManager responsible for the mapped memory region of the
1816 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1817 * must not get populated automatically. We don't have to track
1818 * modifications via userfaultfd WP reliably, because these pages will
1819 * not be part of the migration stream either way -- see
1820 * ramblock_dirty_bitmap_exclude_discarded_pages().
1822 * Note: The result is only stable while migrating (precopy/postcopy).
1824 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1825 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1826 MemoryRegionSection section = {
1827 .mr = rb->mr,
1828 .offset_within_region = 0,
1829 .size = rb->mr->size,
1832 ram_discard_manager_replay_populated(rdm, &section,
1833 populate_read_section, NULL);
1834 } else {
1835 populate_read_range(rb, 0, rb->used_length);
1840 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1842 void ram_write_tracking_prepare(void)
1844 RAMBlock *block;
1846 RCU_READ_LOCK_GUARD();
1848 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849 /* Nothing to do with read-only and MMIO-writable regions */
1850 if (block->mr->readonly || block->mr->rom_device) {
1851 continue;
1855 * Populate pages of the RAM block before enabling userfault_fd
1856 * write protection.
1858 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1859 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1860 * pages with pte_none() entries in page table.
1862 ram_block_populate_read(block);
1867 * ram_write_tracking_start: start UFFD-WP memory tracking
1869 * Returns 0 for success or negative value in case of error
1871 int ram_write_tracking_start(void)
1873 int uffd_fd;
1874 RAMState *rs = ram_state;
1875 RAMBlock *block;
1877 /* Open UFFD file descriptor */
1878 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1879 if (uffd_fd < 0) {
1880 return uffd_fd;
1882 rs->uffdio_fd = uffd_fd;
1884 RCU_READ_LOCK_GUARD();
1886 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1887 /* Nothing to do with read-only and MMIO-writable regions */
1888 if (block->mr->readonly || block->mr->rom_device) {
1889 continue;
1892 /* Register block memory with UFFD to track writes */
1893 if (uffd_register_memory(rs->uffdio_fd, block->host,
1894 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1895 goto fail;
1897 /* Apply UFFD write protection to the block memory range */
1898 if (uffd_change_protection(rs->uffdio_fd, block->host,
1899 block->max_length, true, false)) {
1900 goto fail;
1902 block->flags |= RAM_UF_WRITEPROTECT;
1903 memory_region_ref(block->mr);
1905 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1906 block->host, block->max_length);
1909 return 0;
1911 fail:
1912 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1914 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1915 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1916 continue;
1919 * In case some memory block failed to be write-protected
1920 * remove protection and unregister all succeeded RAM blocks
1922 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1923 false, false);
1924 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1925 /* Cleanup flags and remove reference */
1926 block->flags &= ~RAM_UF_WRITEPROTECT;
1927 memory_region_unref(block->mr);
1930 uffd_close_fd(uffd_fd);
1931 rs->uffdio_fd = -1;
1932 return -1;
1936 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1938 void ram_write_tracking_stop(void)
1940 RAMState *rs = ram_state;
1941 RAMBlock *block;
1943 RCU_READ_LOCK_GUARD();
1945 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1946 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1947 continue;
1949 /* Remove protection and unregister all affected RAM blocks */
1950 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1951 false, false);
1952 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1954 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1955 block->host, block->max_length);
1957 /* Cleanup flags and remove reference */
1958 block->flags &= ~RAM_UF_WRITEPROTECT;
1959 memory_region_unref(block->mr);
1962 /* Finally close UFFD file descriptor */
1963 uffd_close_fd(rs->uffdio_fd);
1964 rs->uffdio_fd = -1;
1967 #else
1968 /* No target OS support, stubs just fail or ignore */
1970 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1972 (void) rs;
1973 (void) offset;
1975 return NULL;
1978 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1979 unsigned long start_page)
1981 (void) rs;
1982 (void) pss;
1983 (void) start_page;
1985 return 0;
1988 bool ram_write_tracking_available(void)
1990 return false;
1993 bool ram_write_tracking_compatible(void)
1995 assert(0);
1996 return false;
1999 int ram_write_tracking_start(void)
2001 assert(0);
2002 return -1;
2005 void ram_write_tracking_stop(void)
2007 assert(0);
2009 #endif /* defined(__linux__) */
2012 * get_queued_page: unqueue a page from the postcopy requests
2014 * Skips pages that are already sent (!dirty)
2016 * Returns true if a queued page is found
2018 * @rs: current RAM state
2019 * @pss: data about the state of the current dirty page scan
2021 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2023 RAMBlock *block;
2024 ram_addr_t offset;
2025 bool dirty;
2027 do {
2028 block = unqueue_page(rs, &offset);
2030 * We're sending this page, and since it's postcopy nothing else
2031 * will dirty it, and we must make sure it doesn't get sent again
2032 * even if this queue request was received after the background
2033 * search already sent it.
2035 if (block) {
2036 unsigned long page;
2038 page = offset >> TARGET_PAGE_BITS;
2039 dirty = test_bit(page, block->bmap);
2040 if (!dirty) {
2041 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2042 page);
2043 } else {
2044 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2048 } while (block && !dirty);
2050 if (!block) {
2052 * Poll write faults too if background snapshot is enabled; that's
2053 * when vCPUs get blocked by write-protected pages.
2055 block = poll_fault_page(rs, &offset);
2058 if (block) {
2060 * We want the background search to continue from the queued page
2061 * since the guest is likely to want other pages near to the page
2062 * it just requested.
2064 pss->block = block;
2065 pss->page = offset >> TARGET_PAGE_BITS;
2068 * This unqueued page would break the "one round" check, even if it
2069 * is really rare.
2071 pss->complete_round = false;
2074 return !!block;
2078 * migration_page_queue_free: drop any remaining pages in the ram
2079 * request queue
2081 * It should be empty at the end anyway, but in error cases there may
2082 * be some left; in that case, we drop them.
2085 static void migration_page_queue_free(RAMState *rs)
2087 struct RAMSrcPageRequest *mspr, *next_mspr;
2088 /* This queue should generally be empty - but in the case of a failed
2089 * migration it might have some entries left over.
2091 RCU_READ_LOCK_GUARD();
2092 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2093 memory_region_unref(mspr->rb->mr);
2094 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2095 g_free(mspr);
2100 * ram_save_queue_pages: queue the page for transmission
2102 * A request from postcopy destination for example.
2104 * Returns zero on success or negative on error
2106 * @rbname: Name of the RAMBlock of the request. NULL means the
2107 * same as the last one.
2108 * @start: starting address from the start of the RAMBlock
2109 * @len: length (in bytes) to send
2111 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2113 RAMBlock *ramblock;
2114 RAMState *rs = ram_state;
2116 ram_counters.postcopy_requests++;
2117 RCU_READ_LOCK_GUARD();
2119 if (!rbname) {
2120 /* Reuse last RAMBlock */
2121 ramblock = rs->last_req_rb;
2123 if (!ramblock) {
2125 * Shouldn't happen, we can't reuse the last RAMBlock if
2126 * it's the 1st request.
2128 error_report("ram_save_queue_pages no previous block");
2129 return -1;
2131 } else {
2132 ramblock = qemu_ram_block_by_name(rbname);
2134 if (!ramblock) {
2135 /* We shouldn't be asked for a non-existent RAMBlock */
2136 error_report("ram_save_queue_pages no block '%s'", rbname);
2137 return -1;
2139 rs->last_req_rb = ramblock;
2141 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2142 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2143 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2144 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2145 __func__, start, len, ramblock->used_length);
2146 return -1;
2150 * When with postcopy preempt, we send back the page directly in the
2151 * rp-return thread.
2153 if (postcopy_preempt_active()) {
2154 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2155 size_t page_size = qemu_ram_pagesize(ramblock);
2156 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2157 int ret = 0;
2159 qemu_mutex_lock(&rs->bitmap_mutex);
2161 pss_init(pss, ramblock, page_start);
2163 * Always use the preempt channel, and make sure it's there. It's
2164 * safe to access without lock, because when rp-thread is running
2165 * we should be the only one who operates on the qemufile
2167 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2168 assert(pss->pss_channel);
2171 * It must be one host page or a multiple of the host page size. Just
2172 * assert; if something is wrong we're mostly split-brain anyway.
2174 assert(len % page_size == 0);
2175 while (len) {
2176 if (ram_save_host_page_urgent(pss)) {
2177 error_report("%s: ram_save_host_page_urgent() failed: "
2178 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2179 __func__, ramblock->idstr, start);
2180 ret = -1;
2181 break;
2184 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2185 * will automatically be moved and point to the next host page
2186 * we're going to send, so no need to update here.
2188 * Normally QEMU never sends >1 host page in requests, so
2189 * logically we don't even need that as the loop should only
2190 * run once, but just to be consistent.
2192 len -= page_size;
2194 qemu_mutex_unlock(&rs->bitmap_mutex);
2196 return ret;
2199 struct RAMSrcPageRequest *new_entry =
2200 g_new0(struct RAMSrcPageRequest, 1);
2201 new_entry->rb = ramblock;
2202 new_entry->offset = start;
2203 new_entry->len = len;
2205 memory_region_ref(ramblock->mr);
2206 qemu_mutex_lock(&rs->src_page_req_mutex);
2207 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2208 migration_make_urgent_request();
2209 qemu_mutex_unlock(&rs->src_page_req_mutex);
2211 return 0;
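/*
 * Usage sketch (hypothetical values): when the destination faults on a
 * page of RAMBlock "pc.ram" during postcopy, the source's return-path
 * thread ends up doing something equivalent to
 *
 *     ram_save_queue_pages("pc.ram", host_page_aligned_offset,
 *                          qemu_ram_pagesize(ramblock));
 *
 * i.e. roughly one host page per request.  Without preemption the page
 * is queued and later picked up by get_queued_page(); with preemption it
 * is sent right away on the postcopy channel as shown above.
 */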
2214 static bool save_page_use_compression(RAMState *rs)
2216 if (!migrate_use_compression()) {
2217 return false;
2221 * If xbzrle is enabled (e.g., after first round of migration), stop
2222 * using the data compression. In theory, xbzrle can do better than
2223 * compression.
2225 if (rs->xbzrle_enabled) {
2226 return false;
2229 return true;
2233 * try to compress the page before posting it out, return true if the page
2234 * has been properly handled by compression, otherwise needs other
2235 * paths to handle it
2237 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2238 RAMBlock *block, ram_addr_t offset)
2240 if (!save_page_use_compression(rs)) {
2241 return false;
2245 * When starting the process of a new block, the first page of
2246 * the block should be sent out before other pages in the same
2247 * block, and all the pages in the last block should have been sent
2248 * out. Keeping this order is important, because the 'cont' flag
2249 * is used to avoid resending the block name.
2251 * We post the first page as a normal page because compression takes
2252 * a lot of CPU resources.
2254 if (block != pss->last_sent_block) {
2255 flush_compressed_data(rs);
2256 return false;
2259 if (compress_page_with_multi_thread(block, offset) > 0) {
2260 return true;
2263 compression_counters.busy++;
2264 return false;
2268 * ram_save_target_page: save one target page
2270 * Returns the number of pages written
2272 * @rs: current RAM state
2273 * @pss: data about the page we want to send
2275 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2277 RAMBlock *block = pss->block;
2278 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2279 int res;
2281 if (control_save_page(pss, block, offset, &res)) {
2282 return res;
2285 if (save_compress_page(rs, pss, block, offset)) {
2286 return 1;
2289 res = save_zero_page(pss, block, offset);
2290 if (res > 0) {
2291 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2292 * page would be stale
2294 if (rs->xbzrle_enabled) {
2295 XBZRLE_cache_lock();
2296 xbzrle_cache_zero_page(rs, block->offset + offset);
2297 XBZRLE_cache_unlock();
2299 return res;
2303 * Do not use multifd in postcopy as one whole host page should be
2304 * placed at a time. Postcopy requires atomic updates of pages, so even
2305 * if host page size == guest page size the running destination guest
2306 * may still see partially copied pages, which is data corruption.
2308 if (migrate_use_multifd() && !migration_in_postcopy()) {
2309 return ram_save_multifd_page(pss->pss_channel, block, offset);
2312 return ram_save_page(rs, pss);
2315 /* Should be called before sending a host page */
2316 static void pss_host_page_prepare(PageSearchStatus *pss)
2318 /* How many guest pages are there in one host page? */
2319 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2321 pss->host_page_sending = true;
2322 if (guest_pfns <= 1) {
2324 * This covers both when guest psize == host psize, or when guest
2325 * has larger psize than the host (guest_pfns==0).
2327 * For the latter, we always send one whole guest page per
2328 * iteration of the host page (example: an Alpha VM on x86 host
2329 * will have guest psize 8K while host psize 4K).
2331 pss->host_page_start = pss->page;
2332 pss->host_page_end = pss->page + 1;
2333 } else {
2335 * The host page spans over multiple guest pages, we send them
2336 * within the same host page iteration.
2338 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2339 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
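/*
 * Worked example: with a 2 MiB hugepage-backed block and 4 KiB target
 * pages, guest_pfns = 512.  For pss->page = 1000 this gives
 *
 *     host_page_start = ROUND_DOWN(1000, 512) = 512
 *     host_page_end   = ROUND_UP(1001, 512)   = 1024
 *
 * so the sending loop covers target pages [512, 1024), i.e. exactly one
 * host page.
 */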
2344 * Whether the page pointed by PSS is within the host page being sent.
2345 * Must be called after a previous pss_host_page_prepare().
2347 static bool pss_within_range(PageSearchStatus *pss)
2349 ram_addr_t ram_addr;
2351 assert(pss->host_page_sending);
2353 /* Over host-page boundary? */
2354 if (pss->page >= pss->host_page_end) {
2355 return false;
2358 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2360 return offset_in_ramblock(pss->block, ram_addr);
2363 static void pss_host_page_finish(PageSearchStatus *pss)
2365 pss->host_page_sending = false;
2366 /* This is not needed, but just to reset it */
2367 pss->host_page_start = pss->host_page_end = 0;
2371 * Send an urgent host page specified by `pss'. Must be called with
2372 * bitmap_mutex held.
2374 * Returns 0 if saving the host page succeeded, negative otherwise.
2376 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2378 bool page_dirty, sent = false;
2379 RAMState *rs = ram_state;
2380 int ret = 0;
2382 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2383 pss_host_page_prepare(pss);
2386 * If precopy is sending the same page, let it be done in precopy, or
2387 * we could send the same page in two channels and none of them will
2388 * receive the whole page.
2390 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2391 trace_postcopy_preempt_hit(pss->block->idstr,
2392 pss->page << TARGET_PAGE_BITS);
2393 return 0;
2396 do {
2397 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2399 if (page_dirty) {
2400 /* Be strict about the return code; it must be exactly 1 (one page saved) */
2401 if (ram_save_target_page(rs, pss) != 1) {
2402 error_report_once("%s: ram_save_target_page failed", __func__);
2403 ret = -1;
2404 goto out;
2406 sent = true;
2408 pss_find_next_dirty(pss);
2409 } while (pss_within_range(pss));
2410 out:
2411 pss_host_page_finish(pss);
2412 /* For urgent requests, flush immediately if sent */
2413 if (sent) {
2414 qemu_fflush(pss->pss_channel);
2416 return ret;
2420 * ram_save_host_page: save a whole host page
2422 * Starting at *offset send pages up to the end of the current host
2423 * page. It's valid for the initial offset to point into the middle of
2424 * a host page in which case the remainder of the hostpage is sent.
2425 * Only dirty target pages are sent. Note that the host page size may
2426 * be a huge page for this block.
2428 * The saving stops at the boundary of the used_length of the block
2429 * if the RAMBlock isn't a multiple of the host page size.
2431 * The caller must hold ram_state.bitmap_mutex when calling this
2432 * function. Note that this function can temporarily release the lock, but
2433 * it makes sure the lock is held again before it returns.
2435 * Returns the number of pages written or negative on error
2437 * @rs: current RAM state
2438 * @pss: data about the page we want to send
2440 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2442 bool page_dirty, preempt_active = postcopy_preempt_active();
2443 int tmppages, pages = 0;
2444 size_t pagesize_bits =
2445 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2446 unsigned long start_page = pss->page;
2447 int res;
2449 if (ramblock_is_ignored(pss->block)) {
2450 error_report("block %s should not be migrated !", pss->block->idstr);
2451 return 0;
2454 /* Update host page boundary information */
2455 pss_host_page_prepare(pss);
2457 do {
2458 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2460 /* Check if the page is dirty and, if so, send it */
2461 if (page_dirty) {
2463 * Properly yield the lock only in postcopy preempt mode
2464 * because both migration thread and rp-return thread can
2465 * operate on the bitmaps.
2467 if (preempt_active) {
2468 qemu_mutex_unlock(&rs->bitmap_mutex);
2470 tmppages = ram_save_target_page(rs, pss);
2471 if (tmppages >= 0) {
2472 pages += tmppages;
2474 * Allow rate limiting to happen in the middle of huge pages if
2475 * something is sent in the current iteration.
2477 if (pagesize_bits > 1 && tmppages > 0) {
2478 migration_rate_limit();
2481 if (preempt_active) {
2482 qemu_mutex_lock(&rs->bitmap_mutex);
2484 } else {
2485 tmppages = 0;
2488 if (tmppages < 0) {
2489 pss_host_page_finish(pss);
2490 return tmppages;
2493 pss_find_next_dirty(pss);
2494 } while (pss_within_range(pss));
2496 pss_host_page_finish(pss);
2498 res = ram_save_release_protection(rs, pss, start_page);
2499 return (res < 0 ? res : pages);
2503 * ram_find_and_save_block: finds a dirty page and sends it to f
2505 * Called within an RCU critical section.
2507 * Returns the number of pages written where zero means no dirty pages,
2508 * or negative on error
2510 * @rs: current RAM state
2512 * On systems where host-page-size > target-page-size it will send all the
2513 * pages in a host page that are dirty.
2515 static int ram_find_and_save_block(RAMState *rs)
2517 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2518 int pages = 0;
2519 bool again, found;
2521 /* No dirty page as there is zero RAM */
2522 if (!ram_bytes_total()) {
2523 return pages;
2527 * Always keep last_seen_block/last_page valid during this procedure,
2528 * because find_dirty_block() relies on these values (e.g., we compare
2529 * last_seen_block with pss.block to see whether we searched all the
2530 * ramblocks) to detect the completion of migration. Having a NULL
2531 * last_seen_block can cause the loop below to run forever.
2533 if (!rs->last_seen_block) {
2534 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2535 rs->last_page = 0;
2538 pss_init(pss, rs->last_seen_block, rs->last_page);
2540 do {
2541 again = true;
2542 found = get_queued_page(rs, pss);
2544 if (!found) {
2545 /* priority queue empty, so just search for something dirty */
2546 found = find_dirty_block(rs, pss, &again);
2549 if (found) {
2550 pages = ram_save_host_page(rs, pss);
2552 } while (!pages && again);
2554 rs->last_seen_block = pss->block;
2555 rs->last_page = pss->page;
2557 return pages;
2560 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2562 uint64_t pages = size / TARGET_PAGE_SIZE;
2564 if (zero) {
2565 stat64_add(&ram_atomic_counters.duplicate, pages);
2566 } else {
2567 stat64_add(&ram_atomic_counters.normal, pages);
2568 ram_transferred_add(size);
2569 qemu_file_credit_transfer(f, size);
2573 static uint64_t ram_bytes_total_common(bool count_ignored)
2575 RAMBlock *block;
2576 uint64_t total = 0;
2578 RCU_READ_LOCK_GUARD();
2580 if (count_ignored) {
2581 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2582 total += block->used_length;
2584 } else {
2585 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2586 total += block->used_length;
2589 return total;
2592 uint64_t ram_bytes_total(void)
2594 return ram_bytes_total_common(false);
2597 static void xbzrle_load_setup(void)
2599 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2602 static void xbzrle_load_cleanup(void)
2604 g_free(XBZRLE.decoded_buf);
2605 XBZRLE.decoded_buf = NULL;
2608 static void ram_state_cleanup(RAMState **rsp)
2610 if (*rsp) {
2611 migration_page_queue_free(*rsp);
2612 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2613 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2614 g_free(*rsp);
2615 *rsp = NULL;
2619 static void xbzrle_cleanup(void)
2621 XBZRLE_cache_lock();
2622 if (XBZRLE.cache) {
2623 cache_fini(XBZRLE.cache);
2624 g_free(XBZRLE.encoded_buf);
2625 g_free(XBZRLE.current_buf);
2626 g_free(XBZRLE.zero_target_page);
2627 XBZRLE.cache = NULL;
2628 XBZRLE.encoded_buf = NULL;
2629 XBZRLE.current_buf = NULL;
2630 XBZRLE.zero_target_page = NULL;
2632 XBZRLE_cache_unlock();
2635 static void ram_save_cleanup(void *opaque)
2637 RAMState **rsp = opaque;
2638 RAMBlock *block;
2640 /* We don't use dirty log with background snapshots */
2641 if (!migrate_background_snapshot()) {
2642 /* The caller holds the iothread lock or is in a BH, so there is
2643 * no write race against the migration bitmap.
2645 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2647 * do not stop dirty log without starting it, since
2648 * memory_global_dirty_log_stop will assert that
2649 * memory_global_dirty_log_start/stop are used in pairs
2651 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2655 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2656 g_free(block->clear_bmap);
2657 block->clear_bmap = NULL;
2658 g_free(block->bmap);
2659 block->bmap = NULL;
2662 xbzrle_cleanup();
2663 compress_threads_save_cleanup();
2664 ram_state_cleanup(rsp);
2667 static void ram_state_reset(RAMState *rs)
2669 int i;
2671 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2672 rs->pss[i].last_sent_block = NULL;
2675 rs->last_seen_block = NULL;
2676 rs->last_page = 0;
2677 rs->last_version = ram_list.version;
2678 rs->xbzrle_enabled = false;
2681 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2683 /* **** functions for postcopy ***** */
2685 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2687 struct RAMBlock *block;
2689 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2690 unsigned long *bitmap = block->bmap;
2691 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2692 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2694 while (run_start < range) {
2695 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2696 ram_discard_range(block->idstr,
2697 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2698 ((ram_addr_t)(run_end - run_start))
2699 << TARGET_PAGE_BITS);
2700 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2706 * postcopy_send_discard_bm_ram: discard a RAMBlock
2708 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2710 * @ms: current migration state
2711 * @block: RAMBlock to discard
2713 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2715 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2716 unsigned long current;
2717 unsigned long *bitmap = block->bmap;
2719 for (current = 0; current < end; ) {
2720 unsigned long one = find_next_bit(bitmap, end, current);
2721 unsigned long zero, discard_length;
2723 if (one >= end) {
2724 break;
2727 zero = find_next_zero_bit(bitmap, end, one + 1);
2729 if (zero >= end) {
2730 discard_length = end - one;
2731 } else {
2732 discard_length = zero - one;
2734 postcopy_discard_send_range(ms, one, discard_length);
2735 current = one + discard_length;
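/*
 * Worked example: for a bitmap 0b00110110 (bit 0 == page 0), the loop
 * above emits postcopy_discard_send_range(ms, 1, 2) for pages 1-2 and
 * postcopy_discard_send_range(ms, 4, 2) for pages 4-5; every maximal run
 * of set (dirty) bits becomes one (start, length) discard range.
 */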
2739 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2742 * postcopy_each_ram_send_discard: discard all RAMBlocks
2744 * Utility for the outgoing postcopy code.
2745 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2746 * passing it bitmap indexes and name.
2747 * (qemu_ram_foreach_block ends up passing unscaled lengths
2748 * which would mean postcopy code would have to deal with target page)
2750 * @ms: current migration state
2752 static void postcopy_each_ram_send_discard(MigrationState *ms)
2754 struct RAMBlock *block;
2756 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2757 postcopy_discard_send_init(ms, block->idstr);
2760 * Deal with TPS != HPS and huge pages. It discards any partially sent
2761 * host-page size chunks and marks any partially dirty host-page size
2762 * chunks as all dirty. In this case the host page is the host page
2763 * for the particular RAMBlock, i.e. it might be a huge page.
2765 postcopy_chunk_hostpages_pass(ms, block);
2768 * Postcopy sends chunks of bitmap over the wire, but it
2769 * just needs indexes at this point, which avoids it having
2770 * target-page-specific code.
2772 postcopy_send_discard_bm_ram(ms, block);
2773 postcopy_discard_send_finish(ms);
2778 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2780 * Helper for postcopy_chunk_hostpages; it's called twice to
2781 * canonicalize the two bitmaps, that are similar, but one is
2782 * inverted.
2784 * Postcopy requires that all target pages in a hostpage are dirty or
2785 * clean, not a mix. This function canonicalizes the bitmaps.
2787 * @ms: current migration state
2788 * @block: block that contains the page we want to canonicalize
2790 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2792 RAMState *rs = ram_state;
2793 unsigned long *bitmap = block->bmap;
2794 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2795 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2796 unsigned long run_start;
2798 if (block->page_size == TARGET_PAGE_SIZE) {
2799 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2800 return;
2803 /* Find a dirty page */
2804 run_start = find_next_bit(bitmap, pages, 0);
2806 while (run_start < pages) {
2809 * If the start of this run of pages is in the middle of a host
2810 * page, then we need to fixup this host page.
2812 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2813 /* Find the end of this run */
2814 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2816 * If the end isn't at the start of a host page, then the
2817 * run doesn't finish at the end of a host page
2818 * and we need to discard.
2822 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2823 unsigned long page;
2824 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2825 host_ratio);
2826 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2828 /* Clean up the bitmap */
2829 for (page = fixup_start_addr;
2830 page < fixup_start_addr + host_ratio; page++) {
2832 * Remark them as dirty, updating the count for any pages
2833 * that weren't previously dirty.
2835 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2839 /* Find the next dirty page for the next iteration */
2840 run_start = find_next_bit(bitmap, pages, run_start);
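/*
 * Worked example (host_ratio = 4, i.e. 16 KiB host pages with 4 KiB
 * target pages): if only target page 5 is dirty, run_start = 5 is not
 * host-page aligned, so pages 4-7 all get marked dirty.  After this pass
 * every host page is either fully dirty or fully clean, which is what
 * the discard logic above relies on.
 */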
2845 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2847 * Transmit the set of pages to be discarded after precopy to the target;
2848 * these are pages that:
2849 * a) Have been previously transmitted but are now dirty again
2850 * b) Pages that have never been transmitted, this ensures that
2851 * any pages on the destination that have been mapped by background
2852 * tasks get discarded (transparent huge pages is the specific concern)
2853 * Hopefully this is pretty sparse
2855 * @ms: current migration state
2857 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2859 RAMState *rs = ram_state;
2861 RCU_READ_LOCK_GUARD();
2863 /* This should be our last sync, the src is now paused */
2864 migration_bitmap_sync(rs);
2866 /* Easiest way to make sure we don't resume in the middle of a host-page */
2867 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2868 rs->last_seen_block = NULL;
2869 rs->last_page = 0;
2871 postcopy_each_ram_send_discard(ms);
2873 trace_ram_postcopy_send_discard_bitmap();
2877 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2879 * Returns zero on success
2881 * @rbname: name of the RAMBlock of the request. NULL means the
2882 * same as the last one.
2883 * @start: RAMBlock starting page
2884 * @length: RAMBlock size
2886 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2888 trace_ram_discard_range(rbname, start, length);
2890 RCU_READ_LOCK_GUARD();
2891 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2893 if (!rb) {
2894 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2895 return -1;
2899 * On source VM, we don't need to update the received bitmap since
2900 * we don't even have one.
2902 if (rb->receivedmap) {
2903 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2904 length >> qemu_target_page_bits());
2907 return ram_block_discard_range(rb, start, length);
2911 * For every allocation, we try not to crash the VM if the
2912 * allocation fails.
2914 static int xbzrle_init(void)
2916 Error *local_err = NULL;
2918 if (!migrate_use_xbzrle()) {
2919 return 0;
2922 XBZRLE_cache_lock();
2924 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2925 if (!XBZRLE.zero_target_page) {
2926 error_report("%s: Error allocating zero page", __func__);
2927 goto err_out;
2930 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2931 TARGET_PAGE_SIZE, &local_err);
2932 if (!XBZRLE.cache) {
2933 error_report_err(local_err);
2934 goto free_zero_page;
2937 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2938 if (!XBZRLE.encoded_buf) {
2939 error_report("%s: Error allocating encoded_buf", __func__);
2940 goto free_cache;
2943 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2944 if (!XBZRLE.current_buf) {
2945 error_report("%s: Error allocating current_buf", __func__);
2946 goto free_encoded_buf;
2949 /* We are all good */
2950 XBZRLE_cache_unlock();
2951 return 0;
2953 free_encoded_buf:
2954 g_free(XBZRLE.encoded_buf);
2955 XBZRLE.encoded_buf = NULL;
2956 free_cache:
2957 cache_fini(XBZRLE.cache);
2958 XBZRLE.cache = NULL;
2959 free_zero_page:
2960 g_free(XBZRLE.zero_target_page);
2961 XBZRLE.zero_target_page = NULL;
2962 err_out:
2963 XBZRLE_cache_unlock();
2964 return -ENOMEM;
2967 static int ram_state_init(RAMState **rsp)
2969 *rsp = g_try_new0(RAMState, 1);
2971 if (!*rsp) {
2972 error_report("%s: Init ramstate fail", __func__);
2973 return -1;
2976 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2977 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2978 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2981 * Count the total number of pages used by ram blocks not including any
2982 * gaps due to alignment or unplugs.
2983 * This must match with the initial values of dirty bitmap.
2985 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2986 ram_state_reset(*rsp);
2988 return 0;
2991 static void ram_list_init_bitmaps(void)
2993 MigrationState *ms = migrate_get_current();
2994 RAMBlock *block;
2995 unsigned long pages;
2996 uint8_t shift;
2998 /* Skip setting bitmap if there is no RAM */
2999 if (ram_bytes_total()) {
3000 shift = ms->clear_bitmap_shift;
3001 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3002 error_report("clear_bitmap_shift (%u) too big, using "
3003 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3004 shift = CLEAR_BITMAP_SHIFT_MAX;
3005 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3006 error_report("clear_bitmap_shift (%u) too small, using "
3007 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3008 shift = CLEAR_BITMAP_SHIFT_MIN;
3011 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3012 pages = block->max_length >> TARGET_PAGE_BITS;
3014 * The initial dirty bitmap for migration must be set with all
3015 * ones to make sure we'll migrate every guest RAM page to
3016 * destination.
3017 * Here we set RAMBlock.bmap all to 1 because when we restart a
3018 * new migration after a failed one, ram_list.
3019 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
3020 * guest memory.
3022 block->bmap = bitmap_new(pages);
3023 bitmap_set(block->bmap, 0, pages);
3024 block->clear_bmap_shift = shift;
3025 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
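/*
 * Sizing sketch (assuming 4 KiB target pages and, hypothetically, the
 * default clear_bitmap_shift of 18): a 4 GiB RAMBlock has 1M target
 * pages, so block->bmap needs 1M bits (128 KiB), while block->clear_bmap
 * needs only 1M >> 18 = 4 bits -- one per 1 GiB chunk whose dirty log
 * clearing is deferred until that chunk is first sent.
 */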
3030 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3032 unsigned long pages;
3033 RAMBlock *rb;
3035 RCU_READ_LOCK_GUARD();
3037 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3038 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3039 rs->migration_dirty_pages -= pages;
3043 static void ram_init_bitmaps(RAMState *rs)
3045 /* For memory_global_dirty_log_start below. */
3046 qemu_mutex_lock_iothread();
3047 qemu_mutex_lock_ramlist();
3049 WITH_RCU_READ_LOCK_GUARD() {
3050 ram_list_init_bitmaps();
3051 /* We don't use dirty log with background snapshots */
3052 if (!migrate_background_snapshot()) {
3053 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3054 migration_bitmap_sync_precopy(rs);
3057 qemu_mutex_unlock_ramlist();
3058 qemu_mutex_unlock_iothread();
3061 * After an eventual first bitmap sync, fixup the initial bitmap
3062 * containing all 1s to exclude any discarded pages from migration.
3064 migration_bitmap_clear_discarded_pages(rs);
3067 static int ram_init_all(RAMState **rsp)
3069 if (ram_state_init(rsp)) {
3070 return -1;
3073 if (xbzrle_init()) {
3074 ram_state_cleanup(rsp);
3075 return -1;
3078 ram_init_bitmaps(*rsp);
3080 return 0;
3083 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3085 RAMBlock *block;
3086 uint64_t pages = 0;
3089 * Postcopy is not using xbzrle/compression, so no need for that.
3090 * Also, since the source is already halted, we don't need to care
3091 * about dirty page logging either.
3094 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3095 pages += bitmap_count_one(block->bmap,
3096 block->used_length >> TARGET_PAGE_BITS);
3099 /* This may not be aligned with current bitmaps. Recalculate. */
3100 rs->migration_dirty_pages = pages;
3102 ram_state_reset(rs);
3104 /* Update RAMState cache of output QEMUFile */
3105 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3107 trace_ram_state_resume_prepare(pages);
3111 * This function clears bits of the free pages reported by the caller from the
3112 * migration dirty bitmap. @addr is the host address corresponding to the
3113 * start of the continuous guest free pages, and @len is the total bytes of
3114 * those pages.
3116 void qemu_guest_free_page_hint(void *addr, size_t len)
3118 RAMBlock *block;
3119 ram_addr_t offset;
3120 size_t used_len, start, npages;
3121 MigrationState *s = migrate_get_current();
3123 /* This function is currently expected to be used during live migration */
3124 if (!migration_is_setup_or_active(s->state)) {
3125 return;
3128 for (; len > 0; len -= used_len, addr += used_len) {
3129 block = qemu_ram_block_from_host(addr, false, &offset);
3130 if (unlikely(!block || offset >= block->used_length)) {
3132 * The implementation might not support RAMBlock resize during
3133 * live migration, but it could happen in theory with future
3134 * updates. So we add a check here to capture that case.
3136 error_report_once("%s unexpected error", __func__);
3137 return;
3140 if (len <= block->used_length - offset) {
3141 used_len = len;
3142 } else {
3143 used_len = block->used_length - offset;
3146 start = offset >> TARGET_PAGE_BITS;
3147 npages = used_len >> TARGET_PAGE_BITS;
3149 qemu_mutex_lock(&ram_state->bitmap_mutex);
3151 * The skipped free pages are equivalent to having been sent, from
3152 * clear_bmap's perspective, so clear the bits from the memory region
3153 * bitmap which are initially set. Otherwise those skipped pages will be
3154 * sent in the next round after syncing from the memory region bitmap.
3156 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3157 ram_state->migration_dirty_pages -=
3158 bitmap_count_one_with_offset(block->bmap, start, npages);
3159 bitmap_clear(block->bmap, start, npages);
3160 qemu_mutex_unlock(&ram_state->bitmap_mutex);
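/*
 * Worked example (4 KiB target pages, hypothetical values): a hint with
 * addr = block->host + 0x20000 and len = 0x8000 gives offset = 0x20000,
 * start = 0x20 and npages = 8, so bits 32..39 are cleared from both the
 * memory region bitmap and block->bmap, and migration_dirty_pages drops
 * by however many of those bits were still set.
 */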
3165 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3166 * long-running RCU critical section. When RCU reclaims in the code
3167 * start to become numerous it will be necessary to reduce the
3168 * granularity of these critical sections.
3172 * ram_save_setup: Setup RAM for migration
3174 * Returns zero to indicate success and negative for error
3176 * @f: QEMUFile where to send the data
3177 * @opaque: RAMState pointer
3179 static int ram_save_setup(QEMUFile *f, void *opaque)
3181 RAMState **rsp = opaque;
3182 RAMBlock *block;
3183 int ret;
3185 if (compress_threads_save_setup()) {
3186 return -1;
3189 /* migration has already setup the bitmap, reuse it. */
3190 if (!migration_in_colo_state()) {
3191 if (ram_init_all(rsp) != 0) {
3192 compress_threads_save_cleanup();
3193 return -1;
3196 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3198 WITH_RCU_READ_LOCK_GUARD() {
3199 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3201 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3202 qemu_put_byte(f, strlen(block->idstr));
3203 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3204 qemu_put_be64(f, block->used_length);
3205 if (migrate_postcopy_ram() && block->page_size !=
3206 qemu_host_page_size) {
3207 qemu_put_be64(f, block->page_size);
3209 if (migrate_ignore_shared()) {
3210 qemu_put_be64(f, block->mr->addr);
3215 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3216 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3218 ret = multifd_send_sync_main(f);
3219 if (ret < 0) {
3220 return ret;
3223 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3224 qemu_fflush(f);
3226 return 0;
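/*
 * On-the-wire sketch of the setup section produced above (illustrative,
 * all integers big-endian):
 *
 *     be64   total ram size | RAM_SAVE_FLAG_MEM_SIZE
 *     for each migratable block:
 *         u8     strlen(idstr)
 *         byte[] idstr (not NUL terminated)
 *         be64   used_length
 *         [be64  page_size]   only with postcopy-ram and non-host page size
 *         [be64  mr->addr]    only with x-ignore-shared
 *     be64   RAM_SAVE_FLAG_EOS
 */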
3230 * ram_save_iterate: iterative stage for migration
3232 * Returns zero to indicate success and negative for error
3234 * @f: QEMUFile where to send the data
3235 * @opaque: RAMState pointer
3237 static int ram_save_iterate(QEMUFile *f, void *opaque)
3239 RAMState **temp = opaque;
3240 RAMState *rs = *temp;
3241 int ret = 0;
3242 int i;
3243 int64_t t0;
3244 int done = 0;
3246 if (blk_mig_bulk_active()) {
3247 /* Avoid transferring ram during bulk phase of block migration as
3248 * the bulk phase will usually take a long time and transferring
3249 * ram updates during that time is pointless. */
3250 goto out;
3254 * We'll take this lock a little bit long, but it's okay for two reasons.
3255 * Firstly, the only other possible thread to take it is the one that calls
3256 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3257 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3258 * guarantees that we release it on a regular basis.
3260 qemu_mutex_lock(&rs->bitmap_mutex);
3261 WITH_RCU_READ_LOCK_GUARD() {
3262 if (ram_list.version != rs->last_version) {
3263 ram_state_reset(rs);
3266 /* Read version before ram_list.blocks */
3267 smp_rmb();
3269 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3271 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3272 i = 0;
3273 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3274 postcopy_has_request(rs)) {
3275 int pages;
3277 if (qemu_file_get_error(f)) {
3278 break;
3281 pages = ram_find_and_save_block(rs);
3282 /* no more pages to send */
3283 if (pages == 0) {
3284 done = 1;
3285 break;
3288 if (pages < 0) {
3289 qemu_file_set_error(f, pages);
3290 break;
3293 rs->target_page_count += pages;
3296 * During postcopy, it is necessary to make sure one whole host
3297 * page is sent in one chunk.
3299 if (migrate_postcopy_ram()) {
3300 flush_compressed_data(rs);
3304 * We want to check in the 1st loop, just in case it was the 1st
3305 * time and we had to sync the dirty bitmap.
3306 * qemu_clock_get_ns() is a bit expensive, so we only check every
3307 * few iterations.
3309 if ((i & 63) == 0) {
3310 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3311 1000000;
3312 if (t1 > MAX_WAIT) {
3313 trace_ram_save_iterate_big_wait(t1, i);
3314 break;
3317 i++;
3320 qemu_mutex_unlock(&rs->bitmap_mutex);
3323 * Must occur before EOS (or any QEMUFile operation)
3324 * because of RDMA protocol.
3326 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3328 out:
3329 if (ret >= 0
3330 && migration_is_setup_or_active(migrate_get_current()->state)) {
3331 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3332 if (ret < 0) {
3333 return ret;
3336 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3337 qemu_fflush(f);
3338 ram_transferred_add(8);
3340 ret = qemu_file_get_error(f);
3342 if (ret < 0) {
3343 return ret;
3346 return done;
3350 * ram_save_complete: function called to send the remaining amount of ram
3352 * Returns zero to indicate success or negative on error
3354 * Called with iothread lock
3356 * @f: QEMUFile where to send the data
3357 * @opaque: RAMState pointer
3359 static int ram_save_complete(QEMUFile *f, void *opaque)
3361 RAMState **temp = opaque;
3362 RAMState *rs = *temp;
3363 int ret = 0;
3365 rs->last_stage = !migration_in_colo_state();
3367 WITH_RCU_READ_LOCK_GUARD() {
3368 if (!migration_in_postcopy()) {
3369 migration_bitmap_sync_precopy(rs);
3372 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3374 /* try transferring iterative blocks of memory */
3376 /* flush all remaining blocks regardless of rate limiting */
3377 qemu_mutex_lock(&rs->bitmap_mutex);
3378 while (true) {
3379 int pages;
3381 pages = ram_find_and_save_block(rs);
3383 /* no more blocks to send */
3383 if (pages == 0) {
3384 break;
3386 if (pages < 0) {
3387 ret = pages;
3388 break;
3391 qemu_mutex_unlock(&rs->bitmap_mutex);
3393 flush_compressed_data(rs);
3394 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3397 if (ret < 0) {
3398 return ret;
3401 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3402 if (ret < 0) {
3403 return ret;
3406 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3407 qemu_fflush(f);
3409 return 0;
3412 static void ram_state_pending_estimate(void *opaque, uint64_t max_size,
3413 uint64_t *res_precopy_only,
3414 uint64_t *res_compatible,
3415 uint64_t *res_postcopy_only)
3417 RAMState **temp = opaque;
3418 RAMState *rs = *temp;
3420 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3422 if (migrate_postcopy_ram()) {
3423 /* We can do postcopy, and all the data is postcopiable */
3424 *res_postcopy_only += remaining_size;
3425 } else {
3426 *res_precopy_only += remaining_size;
3430 static void ram_state_pending_exact(void *opaque, uint64_t max_size,
3431 uint64_t *res_precopy_only,
3432 uint64_t *res_compatible,
3433 uint64_t *res_postcopy_only)
3435 RAMState **temp = opaque;
3436 RAMState *rs = *temp;
3438 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3440 if (!migration_in_postcopy()) {
3441 qemu_mutex_lock_iothread();
3442 WITH_RCU_READ_LOCK_GUARD() {
3443 migration_bitmap_sync_precopy(rs);
3445 qemu_mutex_unlock_iothread();
3446 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3449 if (migrate_postcopy_ram()) {
3450 /* We can do postcopy, and all the data is postcopiable */
3451 *res_compatible += remaining_size;
3452 } else {
3453 *res_precopy_only += remaining_size;
3457 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3459 unsigned int xh_len;
3460 int xh_flags;
3461 uint8_t *loaded_data;
3463 /* extract RLE header */
3464 xh_flags = qemu_get_byte(f);
3465 xh_len = qemu_get_be16(f);
3467 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3468 error_report("Failed to load XBZRLE page - wrong compression!");
3469 return -1;
3472 if (xh_len > TARGET_PAGE_SIZE) {
3473 error_report("Failed to load XBZRLE page - len overflow!");
3474 return -1;
3476 loaded_data = XBZRLE.decoded_buf;
3477 /* load data and decode */
3478 /* it can change loaded_data to point to an internal buffer */
3479 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3481 /* decode RLE */
3482 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3483 TARGET_PAGE_SIZE) == -1) {
3484 error_report("Failed to load XBZRLE page - decode error!");
3485 return -1;
3488 return 0;
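/*
 * Wire-format sketch of an XBZRLE page as parsed above (illustrative):
 *
 *     u8           xh_flags   must be ENCODING_FLAG_XBZRLE
 *     be16         xh_len     length of the encoded delta, <= TARGET_PAGE_SIZE
 *     byte[xh_len]            delta against the previous page contents
 *
 * xbzrle_decode_buffer() patches the existing page in place, which is
 * why the destination must already hold the previously sent version of
 * the page.
 */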
3492 * ram_block_from_stream: read a RAMBlock id from the migration stream
3494 * Must be called from within a rcu critical section.
3496 * Returns a pointer from within the RCU-protected ram_list.
3498 * @mis: the migration incoming state pointer
3499 * @f: QEMUFile where to read the data from
3500 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3501 * @channel: the channel we're using
3503 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3504 QEMUFile *f, int flags,
3505 int channel)
3507 RAMBlock *block = mis->last_recv_block[channel];
3508 char id[256];
3509 uint8_t len;
3511 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3512 if (!block) {
3513 error_report("Ack, bad migration stream!");
3514 return NULL;
3516 return block;
3519 len = qemu_get_byte(f);
3520 qemu_get_buffer(f, (uint8_t *)id, len);
3521 id[len] = 0;
3523 block = qemu_ram_block_by_name(id);
3524 if (!block) {
3525 error_report("Can't find block %s", id);
3526 return NULL;
3529 if (ramblock_is_ignored(block)) {
3530 error_report("block %s should not be migrated !", id);
3531 return NULL;
3534 mis->last_recv_block[channel] = block;
3536 return block;
3539 static inline void *host_from_ram_block_offset(RAMBlock *block,
3540 ram_addr_t offset)
3542 if (!offset_in_ramblock(block, offset)) {
3543 return NULL;
3546 return block->host + offset;
3549 static void *host_page_from_ram_block_offset(RAMBlock *block,
3550 ram_addr_t offset)
3552 /* Note: Explicitly no check against offset_in_ramblock(). */
3553 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3554 block->page_size);
3557 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3558 ram_addr_t offset)
3560 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3563 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3564 ram_addr_t offset, bool record_bitmap)
3566 if (!offset_in_ramblock(block, offset)) {
3567 return NULL;
3569 if (!block->colo_cache) {
3570 error_report("%s: colo_cache is NULL in block :%s",
3571 __func__, block->idstr);
3572 return NULL;
3576 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3577 * It helps us decide which pages in the ram cache should be flushed
3578 * into the VM's RAM later.
3580 if (record_bitmap &&
3581 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3582 ram_state->migration_dirty_pages++;
3584 return block->colo_cache + offset;
3588 * ram_handle_compressed: handle the zero page case
3590 * If a page (or a whole RDMA chunk) has been
3591 * determined to be zero, then zap it.
3593 * @host: host address for the zero page
3594 * @ch: what the page is filled from. We only support zero
3595 * @size: size of the zero page
3597 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3599 if (ch != 0 || !buffer_is_zero(host, size)) {
3600 memset(host, ch, size);
3604 /* return the size after decompression, or negative value on error */
3605 static int
3606 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3607 const uint8_t *source, size_t source_len)
3609 int err;
3611 err = inflateReset(stream);
3612 if (err != Z_OK) {
3613 return -1;
3616 stream->avail_in = source_len;
3617 stream->next_in = (uint8_t *)source;
3618 stream->avail_out = dest_len;
3619 stream->next_out = dest;
3621 err = inflate(stream, Z_NO_FLUSH);
3622 if (err != Z_STREAM_END) {
3623 return -1;
3626 return stream->total_out;
3629 static void *do_data_decompress(void *opaque)
3631 DecompressParam *param = opaque;
3632 unsigned long pagesize;
3633 uint8_t *des;
3634 int len, ret;
3636 qemu_mutex_lock(&param->mutex);
3637 while (!param->quit) {
3638 if (param->des) {
3639 des = param->des;
3640 len = param->len;
3641 param->des = 0;
3642 qemu_mutex_unlock(&param->mutex);
3644 pagesize = TARGET_PAGE_SIZE;
3646 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3647 param->compbuf, len);
3648 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3649 error_report("decompress data failed");
3650 qemu_file_set_error(decomp_file, ret);
3653 qemu_mutex_lock(&decomp_done_lock);
3654 param->done = true;
3655 qemu_cond_signal(&decomp_done_cond);
3656 qemu_mutex_unlock(&decomp_done_lock);
3658 qemu_mutex_lock(&param->mutex);
3659 } else {
3660 qemu_cond_wait(&param->cond, &param->mutex);
3663 qemu_mutex_unlock(&param->mutex);
3665 return NULL;
3668 static int wait_for_decompress_done(void)
3670 int idx, thread_count;
3672 if (!migrate_use_compression()) {
3673 return 0;
3676 thread_count = migrate_decompress_threads();
3677 qemu_mutex_lock(&decomp_done_lock);
3678 for (idx = 0; idx < thread_count; idx++) {
3679 while (!decomp_param[idx].done) {
3680 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3683 qemu_mutex_unlock(&decomp_done_lock);
3684 return qemu_file_get_error(decomp_file);
3687 static void compress_threads_load_cleanup(void)
3689 int i, thread_count;
3691 if (!migrate_use_compression()) {
3692 return;
3694 thread_count = migrate_decompress_threads();
3695 for (i = 0; i < thread_count; i++) {
3697 * we use it as an indicator of whether the thread was
3698 * properly initialized or not
3700 if (!decomp_param[i].compbuf) {
3701 break;
3704 qemu_mutex_lock(&decomp_param[i].mutex);
3705 decomp_param[i].quit = true;
3706 qemu_cond_signal(&decomp_param[i].cond);
3707 qemu_mutex_unlock(&decomp_param[i].mutex);
3709 for (i = 0; i < thread_count; i++) {
3710 if (!decomp_param[i].compbuf) {
3711 break;
3714 qemu_thread_join(decompress_threads + i);
3715 qemu_mutex_destroy(&decomp_param[i].mutex);
3716 qemu_cond_destroy(&decomp_param[i].cond);
3717 inflateEnd(&decomp_param[i].stream);
3718 g_free(decomp_param[i].compbuf);
3719 decomp_param[i].compbuf = NULL;
3721 g_free(decompress_threads);
3722 g_free(decomp_param);
3723 decompress_threads = NULL;
3724 decomp_param = NULL;
3725 decomp_file = NULL;
3728 static int compress_threads_load_setup(QEMUFile *f)
3730 int i, thread_count;
3732 if (!migrate_use_compression()) {
3733 return 0;
3736 thread_count = migrate_decompress_threads();
3737 decompress_threads = g_new0(QemuThread, thread_count);
3738 decomp_param = g_new0(DecompressParam, thread_count);
3739 qemu_mutex_init(&decomp_done_lock);
3740 qemu_cond_init(&decomp_done_cond);
3741 decomp_file = f;
3742 for (i = 0; i < thread_count; i++) {
3743 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3744 goto exit;
3747 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3748 qemu_mutex_init(&decomp_param[i].mutex);
3749 qemu_cond_init(&decomp_param[i].cond);
3750 decomp_param[i].done = true;
3751 decomp_param[i].quit = false;
3752 qemu_thread_create(decompress_threads + i, "decompress",
3753 do_data_decompress, decomp_param + i,
3754 QEMU_THREAD_JOINABLE);
3756 return 0;
3757 exit:
3758 compress_threads_load_cleanup();
3759 return -1;
3762 static void decompress_data_with_multi_threads(QEMUFile *f,
3763 void *host, int len)
3765 int idx, thread_count;
3767 thread_count = migrate_decompress_threads();
3768 QEMU_LOCK_GUARD(&decomp_done_lock);
3769 while (true) {
3770 for (idx = 0; idx < thread_count; idx++) {
3771 if (decomp_param[idx].done) {
3772 decomp_param[idx].done = false;
3773 qemu_mutex_lock(&decomp_param[idx].mutex);
3774 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3775 decomp_param[idx].des = host;
3776 decomp_param[idx].len = len;
3777 qemu_cond_signal(&decomp_param[idx].cond);
3778 qemu_mutex_unlock(&decomp_param[idx].mutex);
3779 break;
3782 if (idx < thread_count) {
3783 break;
3784 } else {
3785 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3790 static void colo_init_ram_state(void)
3792 ram_state_init(&ram_state);
3796 * COLO cache: this is for the secondary VM; we cache the whole
3797 * memory of the secondary VM. The global lock must be held
3798 * to call this helper.
3800 int colo_init_ram_cache(void)
3802 RAMBlock *block;
3804 WITH_RCU_READ_LOCK_GUARD() {
3805 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3806 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3807 NULL, false, false);
3808 if (!block->colo_cache) {
3809 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3810 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3811 block->used_length);
3812 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3813 if (block->colo_cache) {
3814 qemu_anon_ram_free(block->colo_cache, block->used_length);
3815 block->colo_cache = NULL;
3818 return -errno;
3820 if (!machine_dump_guest_core(current_machine)) {
3821 qemu_madvise(block->colo_cache, block->used_length,
3822 QEMU_MADV_DONTDUMP);
3828 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3829 * to decide which pages in the cache should be flushed into the SVM's RAM.
3830 * Here we use the same name 'ram_bitmap' as for migration.
3832 if (ram_bytes_total()) {
3833 RAMBlock *block;
3835 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3836 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3837 block->bmap = bitmap_new(pages);
3841 colo_init_ram_state();
3842 return 0;
3845 /* TODO: duplicated with ram_init_bitmaps */
3846 void colo_incoming_start_dirty_log(void)
3848 RAMBlock *block = NULL;
3849 /* For memory_global_dirty_log_start below. */
3850 qemu_mutex_lock_iothread();
3851 qemu_mutex_lock_ramlist();
3853 memory_global_dirty_log_sync();
3854 WITH_RCU_READ_LOCK_GUARD() {
3855 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3856 ramblock_sync_dirty_bitmap(ram_state, block);
3857 /* Discard this dirty bitmap record */
3858 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3860 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3862 ram_state->migration_dirty_pages = 0;
3863 qemu_mutex_unlock_ramlist();
3864 qemu_mutex_unlock_iothread();
3867 /* The global lock must be held to call this helper */
3868 void colo_release_ram_cache(void)
3870 RAMBlock *block;
3872 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3873 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3874 g_free(block->bmap);
3875 block->bmap = NULL;
3878 WITH_RCU_READ_LOCK_GUARD() {
3879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3880 if (block->colo_cache) {
3881 qemu_anon_ram_free(block->colo_cache, block->used_length);
3882 block->colo_cache = NULL;
3886 ram_state_cleanup(&ram_state);
3890 * ram_load_setup: Setup RAM for migration incoming side
3892 * Returns zero to indicate success and negative for error
3894 * @f: QEMUFile where to receive the data
3895 * @opaque: RAMState pointer
3897 static int ram_load_setup(QEMUFile *f, void *opaque)
3899 if (compress_threads_load_setup(f)) {
3900 return -1;
3903 xbzrle_load_setup();
3904 ramblock_recv_map_init();
3906 return 0;
3909 static int ram_load_cleanup(void *opaque)
3911 RAMBlock *rb;
3913 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3914 qemu_ram_block_writeback(rb);
3917 xbzrle_load_cleanup();
3918 compress_threads_load_cleanup();
3920 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3921 g_free(rb->receivedmap);
3922 rb->receivedmap = NULL;
3925 return 0;
3929 * ram_postcopy_incoming_init: allocate postcopy data structures
3931 * Returns 0 for success and negative if there was an error
3933 * @mis: current migration incoming state
3935 * Allocate data structures etc needed by incoming migration with
3936 * postcopy-ram. postcopy-ram's similarly named
3937 * postcopy_ram_incoming_init does the work.
3939 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3941 return postcopy_ram_incoming_init(mis);
3945 * ram_load_postcopy: load a page in postcopy case
3947 * Returns 0 for success or -errno in case of error
3949 * Called in postcopy mode by ram_load().
3950 * rcu_read_lock is taken prior to this being called.
3952 * @f: QEMUFile where to send the data
3953 * @channel: the channel to use for loading
3955 int ram_load_postcopy(QEMUFile *f, int channel)
3957 int flags = 0, ret = 0;
3958 bool place_needed = false;
3959 bool matches_target_page_size = false;
3960 MigrationIncomingState *mis = migration_incoming_get_current();
3961 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3963 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3964 ram_addr_t addr;
3965 void *page_buffer = NULL;
3966 void *place_source = NULL;
3967 RAMBlock *block = NULL;
3968 uint8_t ch;
3969 int len;
3971 addr = qemu_get_be64(f);
3974 * If there is a QEMUFile error, we should stop here; "addr"
3975 * may then be invalid
3977 ret = qemu_file_get_error(f);
3978 if (ret) {
3979 break;
3982 flags = addr & ~TARGET_PAGE_MASK;
3983 addr &= TARGET_PAGE_MASK;
3985 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3986 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3987 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3988 block = ram_block_from_stream(mis, f, flags, channel);
3989 if (!block) {
3990 ret = -EINVAL;
3991 break;
3995 * Relying on used_length is racy and can result in false positives.
3996 * We might place pages beyond used_length in case RAM was shrunk
3997 * while in postcopy, which is fine - trying to place via
3998 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4000 if (!block->host || addr >= block->postcopy_length) {
4001 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4002 ret = -EINVAL;
4003 break;
4005 tmp_page->target_pages++;
4006 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4008 * Postcopy requires that we place whole host pages atomically;
4009 * these may be huge pages for RAMBlocks that are backed by
4010 * hugetlbfs.
4011 * To make it atomic, the data is read into a temporary page
4012 * that's moved into place later.
4013 * The migration protocol uses possibly smaller target pages;
4014 * however, the source ensures it always sends all the components
4015 * of a host page in one chunk.
4017 page_buffer = tmp_page->tmp_huge_page +
4018 host_page_offset_from_ram_block_offset(block, addr);
4019 /* If all TP are zero then we can optimise the place */
4020 if (tmp_page->target_pages == 1) {
4021 tmp_page->host_addr =
4022 host_page_from_ram_block_offset(block, addr);
4023 } else if (tmp_page->host_addr !=
4024 host_page_from_ram_block_offset(block, addr)) {
4025 /* not the 1st TP within the HP */
4026 error_report("Non-same host page detected on channel %d: "
4027 "Target host page %p, received host page %p "
4028 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4029 channel, tmp_page->host_addr,
4030 host_page_from_ram_block_offset(block, addr),
4031 block->idstr, addr, tmp_page->target_pages);
4032 ret = -EINVAL;
4033 break;
4037 * If it's the last part of a host page then we place the host
4038 * page
4040 if (tmp_page->target_pages ==
4041 (block->page_size / TARGET_PAGE_SIZE)) {
4042 place_needed = true;
4044 place_source = tmp_page->tmp_huge_page;
4047 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4048 case RAM_SAVE_FLAG_ZERO:
4049 ch = qemu_get_byte(f);
4051 * We can skip setting page_buffer when
4052 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4054 if (ch || !matches_target_page_size) {
4055 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4057 if (ch) {
4058 tmp_page->all_zero = false;
4060 break;
4062 case RAM_SAVE_FLAG_PAGE:
4063 tmp_page->all_zero = false;
4064 if (!matches_target_page_size) {
4065 /* For huge pages, we always use temporary buffer */
4066 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4067 } else {
4069                  * For small pages that match the target page size, we
4070                  * avoid the qemu_file copy.  Instead we use QEMUFile's
4071                  * internal buffer directly to place the page.  Note: we
4072                  * must not perform any QEMUFile operation before using
4073                  * that buffer, to make sure it is still valid when
4074                  * placing the page.
4076 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4077 TARGET_PAGE_SIZE);
4079 break;
4080 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4081 tmp_page->all_zero = false;
4082 len = qemu_get_be32(f);
4083 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4084 error_report("Invalid compressed data length: %d", len);
4085 ret = -EINVAL;
4086 break;
4088 decompress_data_with_multi_threads(f, page_buffer, len);
4089 break;
4091 case RAM_SAVE_FLAG_EOS:
4092 /* normal exit */
4093 multifd_recv_sync_main();
4094 break;
4095 default:
4096 error_report("Unknown combination of migration flags: 0x%x"
4097 " (postcopy mode)", flags);
4098 ret = -EINVAL;
4099 break;
4102 /* Got the whole host page, wait for decompress before placing. */
4103 if (place_needed) {
4104 ret |= wait_for_decompress_done();
4107         /* Check for any possible file errors */
4108 if (!ret && qemu_file_get_error(f)) {
4109 ret = qemu_file_get_error(f);
4112 if (!ret && place_needed) {
4113 if (tmp_page->all_zero) {
4114 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4115 } else {
4116 ret = postcopy_place_page(mis, tmp_page->host_addr,
4117 place_source, block);
4119 place_needed = false;
4120 postcopy_temp_page_reset(tmp_page);
4124 return ret;
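/* Returns true if the incoming side has been advised of postcopy (postcopy
 * state is at least ADVISE) and postcopy has not ended yet. */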
4127 static bool postcopy_is_advised(void)
4129 PostcopyState ps = postcopy_state_get();
4130 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
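/* Returns true once the incoming side has entered the postcopy listening
 * phase and postcopy has not ended yet. */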
4133 static bool postcopy_is_running(void)
4135 PostcopyState ps = postcopy_state_get();
4136 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4140  * Flush the contents of the RAM cache into the SVM's memory.
4141  * Only flush pages that have been dirtied by the PVM, the SVM, or both.
4143 void colo_flush_ram_cache(void)
4145 RAMBlock *block = NULL;
4146 void *dst_host;
4147 void *src_host;
4148 unsigned long offset = 0;
4150 memory_global_dirty_log_sync();
4151 WITH_RCU_READ_LOCK_GUARD() {
4152 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4153 ramblock_sync_dirty_bitmap(ram_state, block);
4157 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4158 WITH_RCU_READ_LOCK_GUARD() {
4159 block = QLIST_FIRST_RCU(&ram_list.blocks);
4161 while (block) {
4162 unsigned long num = 0;
4164 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4165 if (!offset_in_ramblock(block,
4166 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4167 offset = 0;
4168 num = 0;
4169 block = QLIST_NEXT_RCU(block, next);
4170 } else {
4171 unsigned long i = 0;
4173 for (i = 0; i < num; i++) {
4174 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4176 dst_host = block->host
4177 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4178 src_host = block->colo_cache
4179 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4180 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4181 offset += num;
4185 trace_colo_flush_ram_cache_end();
4189 * ram_load_precopy: load pages in precopy case
4191 * Returns 0 for success or -errno in case of error
4193 * Called in precopy mode by ram_load().
4194 * rcu_read_lock is taken prior to this being called.
4196  * @f: QEMUFile to receive the data from
4198 static int ram_load_precopy(QEMUFile *f)
4200 MigrationIncomingState *mis = migration_incoming_get_current();
4201 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4202     /* ADVISE is sent earlier; it indicates the source has the postcopy capability enabled */
4203 bool postcopy_advised = postcopy_is_advised();
4204 if (!migrate_use_compression()) {
4205 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4208 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4209 ram_addr_t addr, total_ram_bytes;
4210 void *host = NULL, *host_bak = NULL;
4211 uint8_t ch;
4214          * Yield periodically to let the main loop run, but an iteration of
4215          * the main loop is expensive, so only do it every so many iterations.
4217 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4218 aio_co_schedule(qemu_get_current_aio_context(),
4219 qemu_coroutine_self());
4220 qemu_coroutine_yield();
4222 i++;
4224 addr = qemu_get_be64(f);
4225 flags = addr & ~TARGET_PAGE_MASK;
4226 addr &= TARGET_PAGE_MASK;
4228 if (flags & invalid_flags) {
4229 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4230 error_report("Received an unexpected compressed page");
4233 ret = -EINVAL;
4234 break;
4237 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4238 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4239 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4240 RAM_CHANNEL_PRECOPY);
4242 host = host_from_ram_block_offset(block, addr);
4244              * After entering the COLO stage, we should not load pages into
4245              * the SVM's memory directly; we put them into colo_cache first.
4246              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4247              * Previously, we copied all of this memory in the COLO preparation
4248              * stage, which required stopping the VM and was time-consuming.
4249              * Here we optimize it by backing up every page during the migration
4250              * process while COLO is enabled.  Although this slows the migration
4251              * down somewhat, it clearly reduces the downtime of backing up all
4252              * of the SVM's memory in the COLO preparation stage.
4254 if (migration_incoming_colo_enabled()) {
4255 if (migration_incoming_in_colo_state()) {
4256 /* In COLO stage, put all pages into cache temporarily */
4257 host = colo_cache_from_block_offset(block, addr, true);
4258 } else {
4260                      * In the migration stage but before the COLO stage,
4261                      * put all pages into both the cache and the SVM's memory.
4263 host_bak = colo_cache_from_block_offset(block, addr, false);
4266 if (!host) {
4267 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4268 ret = -EINVAL;
4269 break;
4271 if (!migration_incoming_in_colo_state()) {
4272 ramblock_recv_bitmap_set(block, host);
4275 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4278 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4279 case RAM_SAVE_FLAG_MEM_SIZE:
4280 /* Synchronize RAM block list */
4281 total_ram_bytes = addr;
4282 while (!ret && total_ram_bytes) {
4283 RAMBlock *block;
4284 char id[256];
4285 ram_addr_t length;
4287 len = qemu_get_byte(f);
4288 qemu_get_buffer(f, (uint8_t *)id, len);
4289 id[len] = 0;
4290 length = qemu_get_be64(f);
4292 block = qemu_ram_block_by_name(id);
4293 if (block && !qemu_ram_is_migratable(block)) {
4294                     error_report("block %s should not be migrated!", id);
4295 ret = -EINVAL;
4296 } else if (block) {
4297 if (length != block->used_length) {
4298 Error *local_err = NULL;
4300 ret = qemu_ram_resize(block, length,
4301 &local_err);
4302 if (local_err) {
4303 error_report_err(local_err);
4306                     /* For postcopy we need to check that hugepage sizes match */
4307 if (postcopy_advised && migrate_postcopy_ram() &&
4308 block->page_size != qemu_host_page_size) {
4309 uint64_t remote_page_size = qemu_get_be64(f);
4310 if (remote_page_size != block->page_size) {
4311 error_report("Mismatched RAM page size %s "
4312 "(local) %zd != %" PRId64,
4313 id, block->page_size,
4314 remote_page_size);
4315 ret = -EINVAL;
4318 if (migrate_ignore_shared()) {
4319 hwaddr addr = qemu_get_be64(f);
4320 if (ramblock_is_ignored(block) &&
4321 block->mr->addr != addr) {
4322 error_report("Mismatched GPAs for block %s "
4323                                          "%" PRId64 " != %" PRId64,
4324 id, (uint64_t)addr,
4325 (uint64_t)block->mr->addr);
4326 ret = -EINVAL;
4329 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4330 block->idstr);
4331 } else {
4332 error_report("Unknown ramblock \"%s\", cannot "
4333 "accept migration", id);
4334 ret = -EINVAL;
4337 total_ram_bytes -= length;
4339 break;
4341 case RAM_SAVE_FLAG_ZERO:
4342 ch = qemu_get_byte(f);
4343 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4344 break;
4346 case RAM_SAVE_FLAG_PAGE:
4347 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4348 break;
4350 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4351 len = qemu_get_be32(f);
4352 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4353 error_report("Invalid compressed data length: %d", len);
4354 ret = -EINVAL;
4355 break;
4357 decompress_data_with_multi_threads(f, host, len);
4358 break;
4360 case RAM_SAVE_FLAG_XBZRLE:
4361 if (load_xbzrle(f, addr, host) < 0) {
4362 error_report("Failed to decompress XBZRLE page at "
4363 RAM_ADDR_FMT, addr);
4364 ret = -EINVAL;
4365 break;
4367 break;
4368 case RAM_SAVE_FLAG_EOS:
4369 /* normal exit */
4370 multifd_recv_sync_main();
4371 break;
4372 default:
4373 if (flags & RAM_SAVE_FLAG_HOOK) {
4374 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4375 } else {
4376 error_report("Unknown combination of migration flags: 0x%x",
4377 flags);
4378 ret = -EINVAL;
4381 if (!ret) {
4382 ret = qemu_file_get_error(f);
4384 if (!ret && host_bak) {
4385 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4389 ret |= wait_for_decompress_done();
4390 return ret;
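/* Top-level RAM load handler: accepts only stream version 4 and dispatches
 * to ram_load_postcopy() when the destination is already running in postcopy
 * mode (page placement must then be atomic), otherwise to ram_load_precopy(). */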
4393 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4395 int ret = 0;
4396 static uint64_t seq_iter;
4398      * If the system is running in postcopy mode, page insertions into host
4399      * memory must be atomic.
4401 bool postcopy_running = postcopy_is_running();
4403 seq_iter++;
4405 if (version_id != 4) {
4406 return -EINVAL;
4410      * This RCU critical section can be very long-running.
4411      * If RCU reclamations triggered by this code become numerous,
4412      * it will be necessary to reduce the granularity of this
4413      * critical section.
4415 WITH_RCU_READ_LOCK_GUARD() {
4416 if (postcopy_running) {
4418              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of a
4419              * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4420              * service fast page faults.
4422 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4423 } else {
4424 ret = ram_load_precopy(f);
4427 trace_ram_load_complete(ret, seq_iter);
4429 return ret;
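/* Report whether RAM migration can use postcopy; pmem (nvdimm) backed
 * RAMBlocks currently make it unavailable. */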
4432 static bool ram_has_postcopy(void *opaque)
4434 RAMBlock *rb;
4435 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4436 if (ramblock_is_pmem(rb)) {
4437             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4438                         "is not supported now!", rb->idstr, rb->host);
4439 return false;
4443 return migrate_postcopy_ram();
4446 /* Sync all the dirty bitmaps with the destination VM. */
4447 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4449 RAMBlock *block;
4450 QEMUFile *file = s->to_dst_file;
4451 int ramblock_count = 0;
4453 trace_ram_dirty_bitmap_sync_start();
4455 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4456 qemu_savevm_send_recv_bitmap(file, block->idstr);
4457 trace_ram_dirty_bitmap_request(block->idstr);
4458 ramblock_count++;
4461 trace_ram_dirty_bitmap_sync_wait();
4463     /* Wait until all the ramblocks' dirty bitmaps have been synced */
4464 while (ramblock_count--) {
4465 qemu_sem_wait(&s->rp_state.rp_sem);
4468 trace_ram_dirty_bitmap_sync_complete();
4470 return 0;
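/* Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose bitmap it requested. */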
4473 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4475 qemu_sem_post(&s->rp_state.rp_sem);
4479  * Read the received bitmap and invert it to form the initial dirty bitmap.
4480  * This is only used when a postcopy migration has been paused and wants
4481  * to resume from a middle point.
4483 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4485 int ret = -EINVAL;
4486 /* from_dst_file is always valid because we're within rp_thread */
4487 QEMUFile *file = s->rp_state.from_dst_file;
4488 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4489 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4490 uint64_t size, end_mark;
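    /*
     * For example, assuming 4 KiB target pages, a 1 GiB ramblock gives
     * nbits = 262144 and local_size = 32 KiB, already a multiple of 8
     * bytes, so the ROUND_UP below leaves it unchanged in that case.
     */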
4492 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4494 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4495 error_report("%s: incorrect state %s", __func__,
4496 MigrationStatus_str(s->state));
4497 return -EINVAL;
4501      * Note: see the comments in ramblock_recv_bitmap_send() on why we
4502      * need the endianness conversion and the padding.
4504     local_size = ROUND_UP(local_size, 8);
4507     /* Add padding */
4507 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4509 size = qemu_get_be64(file);
4511     /* The size of the bitmap should match our ramblock */
4512 if (size != local_size) {
4513 error_report("%s: ramblock '%s' bitmap size mismatch "
4514 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4515 block->idstr, size, local_size);
4516 ret = -EINVAL;
4517 goto out;
4520 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4521 end_mark = qemu_get_be64(file);
4523 ret = qemu_file_get_error(file);
4524 if (ret || size != local_size) {
4525 error_report("%s: read bitmap failed for ramblock '%s': %d"
4526 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4527 __func__, block->idstr, ret, local_size, size);
4528 ret = -EIO;
4529 goto out;
4532 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4533 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4534 __func__, block->idstr, end_mark);
4535 ret = -EINVAL;
4536 goto out;
4540      * Endianness conversion.  We are in postcopy (though paused), so
4541      * the dirty bitmap won't change and we can modify it directly.
4543 bitmap_from_le(block->bmap, le_bitmap, nbits);
4546      * What we received is the "received bitmap".  Invert it to form the
4547      * initial dirty bitmap for this ramblock.
4549 bitmap_complement(block->bmap, block->bmap, nbits);
4551 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4552 ramblock_dirty_bitmap_clear_discarded_pages(block);
4554 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4555 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4558      * We succeeded in syncing the bitmap for the current ramblock.  If this
4559      * is the last one to sync, we need to notify the main send thread.
4561 ram_dirty_bitmap_reload_notify(s);
4563 ret = 0;
4564 out:
4565 g_free(le_bitmap);
4566 return ret;
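/* Prepare for resuming a paused migration: re-sync the dirty bitmaps from
 * the destination and then refresh the RAMState for the resumed stream. */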
4569 static int ram_resume_prepare(MigrationState *s, void *opaque)
4571 RAMState *rs = *(RAMState **)opaque;
4572 int ret;
4574 ret = ram_dirty_bitmap_sync_all(s, rs);
4575 if (ret) {
4576 return ret;
4579 ram_state_resume_prepare(rs, s->to_dst_file);
4581 return 0;
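/* Put an EOS flag on the postcopy preempt channel and flush it, so the
 * receiving side sees a clean end of stream on that channel. */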
4584 void postcopy_preempt_shutdown_file(MigrationState *s)
4586 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4587 qemu_fflush(s->postcopy_qemufile_src);
4590 static SaveVMHandlers savevm_ram_handlers = {
4591 .save_setup = ram_save_setup,
4592 .save_live_iterate = ram_save_iterate,
4593 .save_live_complete_postcopy = ram_save_complete,
4594 .save_live_complete_precopy = ram_save_complete,
4595 .has_postcopy = ram_has_postcopy,
4596 .state_pending_exact = ram_state_pending_exact,
4597 .state_pending_estimate = ram_state_pending_estimate,
4598 .load_state = ram_load,
4599 .save_cleanup = ram_save_cleanup,
4600 .load_setup = ram_load_setup,
4601 .load_cleanup = ram_load_cleanup,
4602 .resume_prepare = ram_resume_prepare,
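/* RAMBlock resize notifier: cancels an in-progress migration on the source
 * (precopy cannot cope with resizes) and, while incoming postcopy has only
 * been advised, discards any newly added range and updates postcopy_length. */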
4605 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4606 size_t old_size, size_t new_size)
4608 PostcopyState ps = postcopy_state_get();
4609 ram_addr_t offset;
4610 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4611 Error *err = NULL;
4613 if (ramblock_is_ignored(rb)) {
4614 return;
4617 if (!migration_is_idle()) {
4619 * Precopy code on the source cannot deal with the size of RAM blocks
4620 * changing at random points in time - especially after sending the
4621 * RAM block sizes in the migration stream, they must no longer change.
4622 * Abort and indicate a proper reason.
4624 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4625 migration_cancel(err);
4626 error_free(err);
4629 switch (ps) {
4630 case POSTCOPY_INCOMING_ADVISE:
4632 * Update what ram_postcopy_incoming_init()->init_range() does at the
4633 * time postcopy was advised. Syncing RAM blocks with the source will
4634 * result in RAM resizes.
4636 if (old_size < new_size) {
4637 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4638 error_report("RAM block '%s' discard of resized RAM failed",
4639 rb->idstr);
4642 rb->postcopy_length = new_size;
4643 break;
4644 case POSTCOPY_INCOMING_NONE:
4645 case POSTCOPY_INCOMING_RUNNING:
4646 case POSTCOPY_INCOMING_END:
4648          * Once our guest is running, postcopy no longer cares about
4649          * resizes.  When growing, the new memory was not available on the
4650          * source, so no handling is needed.
4652 break;
4653 default:
4654 error_report("RAM block '%s' resized during postcopy state: %d",
4655 rb->idstr, ps);
4656 exit(-1);
4660 static RAMBlockNotifier ram_mig_ram_notifier = {
4661 .ram_block_resized = ram_mig_ram_block_resized,
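/* Register RAM migration: initialize the XBZRLE lock, register the "ram"
 * savevm handlers (section version 4) and the RAMBlock resize notifier. */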
4664 void ram_mig_init(void)
4666 qemu_mutex_init(&XBZRLE.lock);
4667 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4668 ram_block_notifier_add(&ram_mig_ram_notifier);