migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/cpu-throttle.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58 #include "sysemu/runstate.h"
  59
  60 #include "hw/boards.h" /* for machine_dump_guest_core() */
  61
  62 #if defined(__linux__)
  63 #include "qemu/userfaultfd.h"
  64 #endif /* defined(__linux__) */
  65
  66 /***********************************************************/
  67 /* ram save/restore */
  68
  69 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  70  * worked for pages that were filled with the same char.  We switched
  71  * it to only search for the zero value.  And to avoid confusion with
  72  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  73  */
  74
  75 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  76 #define RAM_SAVE_FLAG_ZERO     0x02
  77 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  78 #define RAM_SAVE_FLAG_PAGE     0x08
  79 #define RAM_SAVE_FLAG_EOS      0x10
  80 #define RAM_SAVE_FLAG_CONTINUE 0x20
  81 #define RAM_SAVE_FLAG_XBZRLE   0x40
  82 /* 0x80 is reserved in migration.h start with 0x100 next */
  83 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  84
  85 XBZRLECacheStats xbzrle_counters;
  86
  87 /* struct contains XBZRLE cache and a static page
  88    used by the compression */
  89 static struct {
  90     /* buffer used for XBZRLE encoding */
  91     uint8_t *encoded_buf;
  92     /* buffer for storing page content */
  93     uint8_t *current_buf;
  94     /* Cache for XBZRLE, Protected by lock. */
  95     PageCache *cache;
  96     QemuMutex lock;
  97     /* it will store a page full of zeros */
  98     uint8_t *zero_target_page;
  99     /* buffer used for XBZRLE decoding */
 100     uint8_t *decoded_buf;
 101 } XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle()) {
 106         qemu_mutex_lock(&XBZRLE.lock);
 107     }
 108 }
 109
 110 static void XBZRLE_cache_unlock(void)
 111 {
 112     if (migrate_use_xbzrle()) {
 113         qemu_mutex_unlock(&XBZRLE.lock);
 114     }
 115 }
 116
 117 /**
 118  * xbzrle_cache_resize: resize the xbzrle cache
 119  *
 120  * This function is called from migrate_params_apply in main
 121  * thread, possibly while a migration is in progress.  A running
 122  * migration may be using the cache and might finish during this call,
 123  * hence changes to the cache are protected by XBZRLE.lock().
 124  *
 125  * Returns 0 for success or -1 for error
 126  *
 127  * @new_size: new cache size
 128  * @errp: set *errp if the check failed, with reason
 129  */
 130 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 131 {
 132     PageCache *new_cache;
 133     int64_t ret = 0;
 134
 135     /* Check for truncation */
 136     if (new_size != (size_t)new_size) {
 137         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 138                    "exceeding address space");
 139         return -1;
 140     }
 141
 142     if (new_size == migrate_xbzrle_cache_size()) {
 143         /* nothing to do */
 144         return 0;
 145     }
 146
 147     XBZRLE_cache_lock();
 148
 149     if (XBZRLE.cache != NULL) {
 150         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 151         if (!new_cache) {
 152             ret = -1;
 153             goto out;
 154         }
 155
 156         cache_fini(XBZRLE.cache);
 157         XBZRLE.cache = new_cache;
 158     }
 159 out:
 160     XBZRLE_cache_unlock();
 161     return ret;
 162 }
 163
 164 bool ramblock_is_ignored(RAMBlock *block)
 165 {
 166     return !qemu_ram_is_migratable(block) ||
 167            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 168 }
 169
 170 #undef RAMBLOCK_FOREACH
 171
 172 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 173 {
 174     RAMBlock *block;
 175     int ret = 0;
 176
 177     RCU_READ_LOCK_GUARD();
 178
 179     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 180         ret = func(block, opaque);
 181         if (ret) {
 182             break;
 183         }
 184     }
 185     return ret;
 186 }
 187
 188 static void ramblock_recv_map_init(void)
 189 {
 190     RAMBlock *rb;
 191
 192     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 193         assert(!rb->receivedmap);
 194         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 195     }
 196 }
 197
 198 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 199 {
 200     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 201                     rb->receivedmap);
 202 }
 203
 204 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 205 {
 206     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 207 }
 208
 209 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 210 {
 211     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 212 }
 213
 214 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 215                                     size_t nr)
 216 {
 217     bitmap_set_atomic(rb->receivedmap,
 218                       ramblock_recv_bitmap_offset(host_addr, rb),
 219                       nr);
 220 }
 221
 222 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 223
 224 /*
 225  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 226  *
 227  * Returns >0 if success with sent bytes, or <0 if error.
 228  */
 229 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 230                                   const char *block_name)
 231 {
 232     RAMBlock *block = qemu_ram_block_by_name(block_name);
 233     unsigned long *le_bitmap, nbits;
 234     uint64_t size;
 235
 236     if (!block) {
 237         error_report("%s: invalid block name: %s", __func__, block_name);
 238         return -1;
 239     }
 240
 241     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 242
 243     /*
 244      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 245      * machines we may need 4 more bytes for padding (see below
 246      * comment). So extend it a bit before hand.
 247      */
 248     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 249
 250     /*
 251      * Always use little endian when sending the bitmap. This is
 252      * required that when source and destination VMs are not using the
 253      * same endianness. (Note: big endian won't work.)
 254      */
 255     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 256
 257     /* Size of the bitmap, in bytes */
 258     size = DIV_ROUND_UP(nbits, 8);
 259
 260     /*
 261      * size is always aligned to 8 bytes for 64bit machines, but it
 262      * may not be true for 32bit machines. We need this padding to
 263      * make sure the migration can survive even between 32bit and
 264      * 64bit machines.
 265      */
 266     size = ROUND_UP(size, 8);
 267
 268     qemu_put_be64(file, size);
 269     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 270     /*
 271      * Mark as an end, in case the middle part is screwed up due to
 272      * some "mysterious" reason.
 273      */
 274     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 275     qemu_fflush(file);
 276
 277     g_free(le_bitmap);
 278
 279     if (qemu_file_get_error(file)) {
 280         return qemu_file_get_error(file);
 281     }
 282
 283     return size + sizeof(size);
 284 }
 285
 286 /*
 287  * An outstanding page request, on the source, having been received
 288  * and queued
 289  */
 290 struct RAMSrcPageRequest {
 291     RAMBlock *rb;
 292     hwaddr    offset;
 293     hwaddr    len;
 294
 295     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 296 };
 297
 298 /* State of RAM for migration */
 299 struct RAMState {
 300     /* QEMUFile used for this migration */
 301     QEMUFile *f;
 302     /* UFFD file descriptor, used in 'write-tracking' migration */
 303     int uffdio_fd;
 304     /* Last block that we have visited searching for dirty pages */
 305     RAMBlock *last_seen_block;
 306     /* Last block from where we have sent data */
 307     RAMBlock *last_sent_block;
 308     /* Last dirty target page we have sent */
 309     ram_addr_t last_page;
 310     /* last ram version we have seen */
 311     uint32_t last_version;
 312     /* How many times we have dirty too many pages */
 313     int dirty_rate_high_cnt;
 314     /* these variables are used for bitmap sync */
 315     /* last time we did a full bitmap_sync */
 316     int64_t time_last_bitmap_sync;
 317     /* bytes transferred at start_time */
 318     uint64_t bytes_xfer_prev;
 319     /* number of dirty pages since start_time */
 320     uint64_t num_dirty_pages_period;
 321     /* xbzrle misses since the beginning of the period */
 322     uint64_t xbzrle_cache_miss_prev;
 323     /* Amount of xbzrle pages since the beginning of the period */
 324     uint64_t xbzrle_pages_prev;
 325     /* Amount of xbzrle encoded bytes since the beginning of the period */
 326     uint64_t xbzrle_bytes_prev;
 327     /* Start using XBZRLE (e.g., after the first round). */
 328     bool xbzrle_enabled;
 329     /* Are we on the last stage of migration */
 330     bool last_stage;
 331     /* compression statistics since the beginning of the period */
 332     /* amount of count that no free thread to compress data */
 333     uint64_t compress_thread_busy_prev;
 334     /* amount bytes after compression */
 335     uint64_t compressed_size_prev;
 336     /* amount of compressed pages */
 337     uint64_t compress_pages_prev;
 338
 339     /* total handled target pages at the beginning of period */
 340     uint64_t target_page_count_prev;
 341     /* total handled target pages since start */
 342     uint64_t target_page_count;
 343     /* number of dirty bits in the bitmap */
 344     uint64_t migration_dirty_pages;
 345     /* Protects modification of the bitmap and migration dirty pages */
 346     QemuMutex bitmap_mutex;
 347     /* The RAMBlock used in the last src_page_requests */
 348     RAMBlock *last_req_rb;
 349     /* Queue of outstanding page requests from the destination */
 350     QemuMutex src_page_req_mutex;
 351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 352 };
 353 typedef struct RAMState RAMState;
 354
 355 static RAMState *ram_state;
 356
 357 static NotifierWithReturnList precopy_notifier_list;
 358
 359 /* Whether postcopy has queued requests? */
 360 static bool postcopy_has_request(RAMState *rs)
 361 {
 362     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 363 }
 364
 365 void precopy_infrastructure_init(void)
 366 {
 367     notifier_with_return_list_init(&precopy_notifier_list);
 368 }
 369
 370 void precopy_add_notifier(NotifierWithReturn *n)
 371 {
 372     notifier_with_return_list_add(&precopy_notifier_list, n);
 373 }
 374
 375 void precopy_remove_notifier(NotifierWithReturn *n)
 376 {
 377     notifier_with_return_remove(n);
 378 }
 379
 380 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 381 {
 382     PrecopyNotifyData pnd;
 383     pnd.reason = reason;
 384     pnd.errp = errp;
 385
 386     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 387 }
 388
 389 uint64_t ram_bytes_remaining(void)
 390 {
 391     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 392                        0;
 393 }
 394
 395 MigrationStats ram_counters;
 396
 397 static void ram_transferred_add(uint64_t bytes)
 398 {
 399     if (runstate_is_running()) {
 400         ram_counters.precopy_bytes += bytes;
 401     } else if (migration_in_postcopy()) {
 402         ram_counters.postcopy_bytes += bytes;
 403     } else {
 404         ram_counters.downtime_bytes += bytes;
 405     }
 406     ram_counters.transferred += bytes;
 407 }
 408
 409 /* used by the search for pages to send */
 410 struct PageSearchStatus {
 411     /* Current block being searched */
 412     RAMBlock    *block;
 413     /* Current page to search from */
 414     unsigned long page;
 415     /* Set once we wrap around */
 416     bool         complete_round;
 417 };
 418 typedef struct PageSearchStatus PageSearchStatus;
 419
 420 CompressionStats compression_counters;
 421
 422 struct CompressParam {
 423     bool done;
 424     bool quit;
 425     bool zero_page;
 426     QEMUFile *file;
 427     QemuMutex mutex;
 428     QemuCond cond;
 429     RAMBlock *block;
 430     ram_addr_t offset;
 431
 432     /* internally used fields */
 433     z_stream stream;
 434     uint8_t *originbuf;
 435 };
 436 typedef struct CompressParam CompressParam;
 437
 438 struct DecompressParam {
 439     bool done;
 440     bool quit;
 441     QemuMutex mutex;
 442     QemuCond cond;
 443     void *des;
 444     uint8_t *compbuf;
 445     int len;
 446     z_stream stream;
 447 };
 448 typedef struct DecompressParam DecompressParam;
 449
 450 static CompressParam *comp_param;
 451 static QemuThread *compress_threads;
 452 /* comp_done_cond is used to wake up the migration thread when
 453  * one of the compression threads has finished the compression.
 454  * comp_done_lock is used to co-work with comp_done_cond.
 455  */
 456 static QemuMutex comp_done_lock;
 457 static QemuCond comp_done_cond;
 458 /* The empty QEMUFileOps will be used by file in CompressParam */
 459 static const QEMUFileOps empty_ops = { };
 460
 461 static QEMUFile *decomp_file;
 462 static DecompressParam *decomp_param;
 463 static QemuThread *decompress_threads;
 464 static QemuMutex decomp_done_lock;
 465 static QemuCond decomp_done_cond;
 466
 467 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 468                                  ram_addr_t offset, uint8_t *source_buf);
 469
 470 static void *do_data_compress(void *opaque)
 471 {
 472     CompressParam *param = opaque;
 473     RAMBlock *block;
 474     ram_addr_t offset;
 475     bool zero_page;
 476
 477     qemu_mutex_lock(&param->mutex);
 478     while (!param->quit) {
 479         if (param->block) {
 480             block = param->block;
 481             offset = param->offset;
 482             param->block = NULL;
 483             qemu_mutex_unlock(&param->mutex);
 484
 485             zero_page = do_compress_ram_page(param->file, &param->stream,
 486                                              block, offset, param->originbuf);
 487
 488             qemu_mutex_lock(&comp_done_lock);
 489             param->done = true;
 490             param->zero_page = zero_page;
 491             qemu_cond_signal(&comp_done_cond);
 492             qemu_mutex_unlock(&comp_done_lock);
 493
 494             qemu_mutex_lock(&param->mutex);
 495         } else {
 496             qemu_cond_wait(&param->cond, &param->mutex);
 497         }
 498     }
 499     qemu_mutex_unlock(&param->mutex);
 500
 501     return NULL;
 502 }
 503
 504 static void compress_threads_save_cleanup(void)
 505 {
 506     int i, thread_count;
 507
 508     if (!migrate_use_compression() || !comp_param) {
 509         return;
 510     }
 511
 512     thread_count = migrate_compress_threads();
 513     for (i = 0; i < thread_count; i++) {
 514         /*
 515          * we use it as a indicator which shows if the thread is
 516          * properly init'd or not
 517          */
 518         if (!comp_param[i].file) {
 519             break;
 520         }
 521
 522         qemu_mutex_lock(&comp_param[i].mutex);
 523         comp_param[i].quit = true;
 524         qemu_cond_signal(&comp_param[i].cond);
 525         qemu_mutex_unlock(&comp_param[i].mutex);
 526
 527         qemu_thread_join(compress_threads + i);
 528         qemu_mutex_destroy(&comp_param[i].mutex);
 529         qemu_cond_destroy(&comp_param[i].cond);
 530         deflateEnd(&comp_param[i].stream);
 531         g_free(comp_param[i].originbuf);
 532         qemu_fclose(comp_param[i].file);
 533         comp_param[i].file = NULL;
 534     }
 535     qemu_mutex_destroy(&comp_done_lock);
 536     qemu_cond_destroy(&comp_done_cond);
 537     g_free(compress_threads);
 538     g_free(comp_param);
 539     compress_threads = NULL;
 540     comp_param = NULL;
 541 }
 542
 543 static int compress_threads_save_setup(void)
 544 {
 545     int i, thread_count;
 546
 547     if (!migrate_use_compression()) {
 548         return 0;
 549     }
 550     thread_count = migrate_compress_threads();
 551     compress_threads = g_new0(QemuThread, thread_count);
 552     comp_param = g_new0(CompressParam, thread_count);
 553     qemu_cond_init(&comp_done_cond);
 554     qemu_mutex_init(&comp_done_lock);
 555     for (i = 0; i < thread_count; i++) {
 556         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 557         if (!comp_param[i].originbuf) {
 558             goto exit;
 559         }
 560
 561         if (deflateInit(&comp_param[i].stream,
 562                         migrate_compress_level()) != Z_OK) {
 563             g_free(comp_param[i].originbuf);
 564             goto exit;
 565         }
 566
 567         /* comp_param[i].file is just used as a dummy buffer to save data,
 568          * set its ops to empty.
 569          */
 570         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 571         comp_param[i].done = true;
 572         comp_param[i].quit = false;
 573         qemu_mutex_init(&comp_param[i].mutex);
 574         qemu_cond_init(&comp_param[i].cond);
 575         qemu_thread_create(compress_threads + i, "compress",
 576                            do_data_compress, comp_param + i,
 577                            QEMU_THREAD_JOINABLE);
 578     }
 579     return 0;
 580
 581 exit:
 582     compress_threads_save_cleanup();
 583     return -1;
 584 }
 585
 586 /**
 587  * save_page_header: write page header to wire
 588  *
 589  * If this is the 1st block, it also writes the block identification
 590  *
 591  * Returns the number of bytes written
 592  *
 593  * @f: QEMUFile where to send the data
 594  * @block: block that contains the page we want to send
 595  * @offset: offset inside the block for the page
 596  *          in the lower bits, it contains flags
 597  */
 598 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 599                                ram_addr_t offset)
 600 {
 601     size_t size, len;
 602
 603     if (block == rs->last_sent_block) {
 604         offset |= RAM_SAVE_FLAG_CONTINUE;
 605     }
 606     qemu_put_be64(f, offset);
 607     size = 8;
 608
 609     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 610         len = strlen(block->idstr);
 611         qemu_put_byte(f, len);
 612         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 613         size += 1 + len;
 614         rs->last_sent_block = block;
 615     }
 616     return size;
 617 }
 618
 619 /**
 620  * mig_throttle_guest_down: throttle down the guest
 621  *
 622  * Reduce amount of guest cpu execution to hopefully slow down memory
 623  * writes. If guest dirty memory rate is reduced below the rate at
 624  * which we can transfer pages to the destination then we should be
 625  * able to complete migration. Some workloads dirty memory way too
 626  * fast and will not effectively converge, even with auto-converge.
 627  */
 628 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 629                                     uint64_t bytes_dirty_threshold)
 630 {
 631     MigrationState *s = migrate_get_current();
 632     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 633     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 634     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 635     int pct_max = s->parameters.max_cpu_throttle;
 636
 637     uint64_t throttle_now = cpu_throttle_get_percentage();
 638     uint64_t cpu_now, cpu_ideal, throttle_inc;
 639
 640     /* We have not started throttling yet. Let's start it. */
 641     if (!cpu_throttle_active()) {
 642         cpu_throttle_set(pct_initial);
 643     } else {
 644         /* Throttling already on, just increase the rate */
 645         if (!pct_tailslow) {
 646             throttle_inc = pct_increment;
 647         } else {
 648             /* Compute the ideal CPU percentage used by Guest, which may
 649              * make the dirty rate match the dirty rate threshold. */
 650             cpu_now = 100 - throttle_now;
 651             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 652                         bytes_dirty_period);
 653             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 654         }
 655         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 656     }
 657 }
 658
 659 void mig_throttle_counter_reset(void)
 660 {
 661     RAMState *rs = ram_state;
 662
 663     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 664     rs->num_dirty_pages_period = 0;
 665     rs->bytes_xfer_prev = ram_counters.transferred;
 666 }
 667
 668 /**
 669  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 670  *
 671  * @rs: current RAM state
 672  * @current_addr: address for the zero page
 673  *
 674  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 675  * The important thing is that a stale (not-yet-0'd) page be replaced
 676  * by the new data.
 677  * As a bonus, if the page wasn't in the cache it gets added so that
 678  * when a small write is made into the 0'd page it gets XBZRLE sent.
 679  */
 680 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 681 {
 682     if (!rs->xbzrle_enabled) {
 683         return;
 684     }
 685
 686     /* We don't care if this fails to allocate a new cache page
 687      * as long as it updated an old one */
 688     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 689                  ram_counters.dirty_sync_count);
 690 }
 691
 692 #define ENCODING_FLAG_XBZRLE 0x1
 693
 694 /**
 695  * save_xbzrle_page: compress and send current page
 696  *
 697  * Returns: 1 means that we wrote the page
 698  *          0 means that page is identical to the one already sent
 699  *          -1 means that xbzrle would be longer than normal
 700  *
 701  * @rs: current RAM state
 702  * @current_data: pointer to the address of the page contents
 703  * @current_addr: addr of the page
 704  * @block: block that contains the page we want to send
 705  * @offset: offset inside the block for the page
 706  */
 707 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 708                             ram_addr_t current_addr, RAMBlock *block,
 709                             ram_addr_t offset)
 710 {
 711     int encoded_len = 0, bytes_xbzrle;
 712     uint8_t *prev_cached_page;
 713
 714     if (!cache_is_cached(XBZRLE.cache, current_addr,
 715                          ram_counters.dirty_sync_count)) {
 716         xbzrle_counters.cache_miss++;
 717         if (!rs->last_stage) {
 718             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 719                              ram_counters.dirty_sync_count) == -1) {
 720                 return -1;
 721             } else {
 722                 /* update *current_data when the page has been
 723                    inserted into cache */
 724                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 725             }
 726         }
 727         return -1;
 728     }
 729
 730     /*
 731      * Reaching here means the page has hit the xbzrle cache, no matter what
 732      * encoding result it is (normal encoding, overflow or skipping the page),
 733      * count the page as encoded. This is used to calculate the encoding rate.
 734      *
 735      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 736      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 737      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 738      * skipped page included. In this way, the encoding rate can tell if the
 739      * guest page is good for xbzrle encoding.
 740      */
 741     xbzrle_counters.pages++;
 742     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 743
 744     /* save current buffer into memory */
 745     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 746
 747     /* XBZRLE encoding (if there is no overflow) */
 748     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 749                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 750                                        TARGET_PAGE_SIZE);
 751
 752     /*
 753      * Update the cache contents, so that it corresponds to the data
 754      * sent, in all cases except where we skip the page.
 755      */
 756     if (!rs->last_stage && encoded_len != 0) {
 757         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 758         /*
 759          * In the case where we couldn't compress, ensure that the caller
 760          * sends the data from the cache, since the guest might have
 761          * changed the RAM since we copied it.
 762          */
 763         *current_data = prev_cached_page;
 764     }
 765
 766     if (encoded_len == 0) {
 767         trace_save_xbzrle_page_skipping();
 768         return 0;
 769     } else if (encoded_len == -1) {
 770         trace_save_xbzrle_page_overflow();
 771         xbzrle_counters.overflow++;
 772         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 773         return -1;
 774     }
 775
 776     /* Send XBZRLE based compressed page */
 777     bytes_xbzrle = save_page_header(rs, rs->f, block,
 778                                     offset | RAM_SAVE_FLAG_XBZRLE);
 779     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 780     qemu_put_be16(rs->f, encoded_len);
 781     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 782     bytes_xbzrle += encoded_len + 1 + 2;
 783     /*
 784      * Like compressed_size (please see update_compress_thread_counts),
 785      * the xbzrle encoded bytes don't count the 8 byte header with
 786      * RAM_SAVE_FLAG_CONTINUE.
 787      */
 788     xbzrle_counters.bytes += bytes_xbzrle - 8;
 789     ram_transferred_add(bytes_xbzrle);
 790
 791     return 1;
 792 }
 793
 794 /**
 795  * migration_bitmap_find_dirty: find the next dirty page from start
 796  *
 797  * Returns the page offset within memory region of the start of a dirty page
 798  *
 799  * @rs: current RAM state
 800  * @rb: RAMBlock where to search for dirty pages
 801  * @start: page where we start the search
 802  */
 803 static inline
 804 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 805                                           unsigned long start)
 806 {
 807     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 808     unsigned long *bitmap = rb->bmap;
 809
 810     if (ramblock_is_ignored(rb)) {
 811         return size;
 812     }
 813
 814     return find_next_bit(bitmap, size, start);
 815 }
 816
 817 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 818                                                        unsigned long page)
 819 {
 820     uint8_t shift;
 821     hwaddr size, start;
 822
 823     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 824         return;
 825     }
 826
 827     shift = rb->clear_bmap_shift;
 828     /*
 829      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 830      * can make things easier sometimes since then start address
 831      * of the small chunk will always be 64 pages aligned so the
 832      * bitmap will always be aligned to unsigned long. We should
 833      * even be able to remove this restriction but I'm simply
 834      * keeping it.
 835      */
 836     assert(shift >= 6);
 837
 838     size = 1ULL << (TARGET_PAGE_BITS + shift);
 839     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 840     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 841     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 842 }
 843
 844 static void
 845 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 846                                                  unsigned long start,
 847                                                  unsigned long npages)
 848 {
 849     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 850     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 851     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 852
 853     /*
 854      * Clear pages from start to start + npages - 1, so the end boundary is
 855      * exclusive.
 856      */
 857     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 858         migration_clear_memory_region_dirty_bitmap(rb, i);
 859     }
 860 }
 861
 862 /*
 863  * colo_bitmap_find_diry:find contiguous dirty pages from start
 864  *
 865  * Returns the page offset within memory region of the start of the contiguout
 866  * dirty page
 867  *
 868  * @rs: current RAM state
 869  * @rb: RAMBlock where to search for dirty pages
 870  * @start: page where we start the search
 871  * @num: the number of contiguous dirty pages
 872  */
 873 static inline
 874 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 875                                      unsigned long start, unsigned long *num)
 876 {
 877     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 878     unsigned long *bitmap = rb->bmap;
 879     unsigned long first, next;
 880
 881     *num = 0;
 882
 883     if (ramblock_is_ignored(rb)) {
 884         return size;
 885     }
 886
 887     first = find_next_bit(bitmap, size, start);
 888     if (first >= size) {
 889         return first;
 890     }
 891     next = find_next_zero_bit(bitmap, size, first + 1);
 892     assert(next >= first);
 893     *num = next - first;
 894     return first;
 895 }
 896
 897 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 898                                                 RAMBlock *rb,
 899                                                 unsigned long page)
 900 {
 901     bool ret;
 902
 903     /*
 904      * Clear dirty bitmap if needed.  This _must_ be called before we
 905      * send any of the page in the chunk because we need to make sure
 906      * we can capture further page content changes when we sync dirty
 907      * log the next time.  So as long as we are going to send any of
 908      * the page in the chunk we clear the remote dirty bitmap for all.
 909      * Clearing it earlier won't be a problem, but too late will.
 910      */
 911     migration_clear_memory_region_dirty_bitmap(rb, page);
 912
 913     ret = test_and_clear_bit(page, rb->bmap);
 914     if (ret) {
 915         rs->migration_dirty_pages--;
 916     }
 917
 918     return ret;
 919 }
 920
 921 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 922                                        void *opaque)
 923 {
 924     const hwaddr offset = section->offset_within_region;
 925     const hwaddr size = int128_get64(section->size);
 926     const unsigned long start = offset >> TARGET_PAGE_BITS;
 927     const unsigned long npages = size >> TARGET_PAGE_BITS;
 928     RAMBlock *rb = section->mr->ram_block;
 929     uint64_t *cleared_bits = opaque;
 930
 931     /*
 932      * We don't grab ram_state->bitmap_mutex because we expect to run
 933      * only when starting migration or during postcopy recovery where
 934      * we don't have concurrent access.
 935      */
 936     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 937         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 938     }
 939     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 940     bitmap_clear(rb->bmap, start, npages);
 941 }
 942
 943 /*
 944  * Exclude all dirty pages from migration that fall into a discarded range as
 945  * managed by a RamDiscardManager responsible for the mapped memory region of
 946  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 947  *
 948  * Discarded pages ("logically unplugged") have undefined content and must
 949  * not get migrated, because even reading these pages for migration might
 950  * result in undesired behavior.
 951  *
 952  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 953  *
 954  * Note: The result is only stable while migrating (precopy/postcopy).
 955  */
 956 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 957 {
 958     uint64_t cleared_bits = 0;
 959
 960     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 961         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 962         MemoryRegionSection section = {
 963             .mr = rb->mr,
 964             .offset_within_region = 0,
 965             .size = int128_make64(qemu_ram_get_used_length(rb)),
 966         };
 967
 968         ram_discard_manager_replay_discarded(rdm, &section,
 969                                              dirty_bitmap_clear_section,
 970                                              &cleared_bits);
 971     }
 972     return cleared_bits;
 973 }
 974
 975 /*
 976  * Check if a host-page aligned page falls into a discarded range as managed by
 977  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 978  *
 979  * Note: The result is only stable while migrating (precopy/postcopy).
 980  */
 981 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 982 {
 983     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 984         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 985         MemoryRegionSection section = {
 986             .mr = rb->mr,
 987             .offset_within_region = start,
 988             .size = int128_make64(qemu_ram_pagesize(rb)),
 989         };
 990
 991         return !ram_discard_manager_is_populated(rdm, &section);
 992     }
 993     return false;
 994 }
 995
 996 /* Called with RCU critical section */
 997 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 998 {
 999     uint64_t new_dirty_pages =
1000         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1001
1002     rs->migration_dirty_pages += new_dirty_pages;
1003     rs->num_dirty_pages_period += new_dirty_pages;
1004 }
1005
1006 /**
1007  * ram_pagesize_summary: calculate all the pagesizes of a VM
1008  *
1009  * Returns a summary bitmap of the page sizes of all RAMBlocks
1010  *
1011  * For VMs with just normal pages this is equivalent to the host page
1012  * size. If it's got some huge pages then it's the OR of all the
1013  * different page sizes.
1014  */
1015 uint64_t ram_pagesize_summary(void)
1016 {
1017     RAMBlock *block;
1018     uint64_t summary = 0;
1019
1020     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1021         summary |= block->page_size;
1022     }
1023
1024     return summary;
1025 }
1026
1027 uint64_t ram_get_total_transferred_pages(void)
1028 {
1029     return  ram_counters.normal + ram_counters.duplicate +
1030                 compression_counters.pages + xbzrle_counters.pages;
1031 }
1032
1033 static void migration_update_rates(RAMState *rs, int64_t end_time)
1034 {
1035     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1036     double compressed_size;
1037
1038     /* calculate period counters */
1039     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1040                 / (end_time - rs->time_last_bitmap_sync);
1041
1042     if (!page_count) {
1043         return;
1044     }
1045
1046     if (migrate_use_xbzrle()) {
1047         double encoded_size, unencoded_size;
1048
1049         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1050             rs->xbzrle_cache_miss_prev) / page_count;
1051         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1052         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1053                          TARGET_PAGE_SIZE;
1054         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1055         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1056             xbzrle_counters.encoding_rate = 0;
1057         } else {
1058             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1059         }
1060         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1061         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1062     }
1063
1064     if (migrate_use_compression()) {
1065         compression_counters.busy_rate = (double)(compression_counters.busy -
1066             rs->compress_thread_busy_prev) / page_count;
1067         rs->compress_thread_busy_prev = compression_counters.busy;
1068
1069         compressed_size = compression_counters.compressed_size -
1070                           rs->compressed_size_prev;
1071         if (compressed_size) {
1072             double uncompressed_size = (compression_counters.pages -
1073                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1074
1075             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1076             compression_counters.compression_rate =
1077                                         uncompressed_size / compressed_size;
1078
1079             rs->compress_pages_prev = compression_counters.pages;
1080             rs->compressed_size_prev = compression_counters.compressed_size;
1081         }
1082     }
1083 }
1084
1085 static void migration_trigger_throttle(RAMState *rs)
1086 {
1087     MigrationState *s = migrate_get_current();
1088     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1089
1090     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1091     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1092     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1093
1094     /* During block migration the auto-converge logic incorrectly detects
1095      * that ram migration makes no progress. Avoid this by disabling the
1096      * throttling logic during the bulk phase of block migration. */
1097     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1098         /* The following detection logic can be refined later. For now:
1099            Check to see if the ratio between dirtied bytes and the approx.
1100            amount of bytes that just got transferred since the last time
1101            we were in this routine reaches the threshold. If that happens
1102            twice, start or increase throttling. */
1103
1104         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1105             (++rs->dirty_rate_high_cnt >= 2)) {
1106             trace_migration_throttle();
1107             rs->dirty_rate_high_cnt = 0;
1108             mig_throttle_guest_down(bytes_dirty_period,
1109                                     bytes_dirty_threshold);
1110         }
1111     }
1112 }
1113
1114 static void migration_bitmap_sync(RAMState *rs)
1115 {
1116     RAMBlock *block;
1117     int64_t end_time;
1118
1119     ram_counters.dirty_sync_count++;
1120
1121     if (!rs->time_last_bitmap_sync) {
1122         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123     }
1124
1125     trace_migration_bitmap_sync_start();
1126     memory_global_dirty_log_sync();
1127
1128     qemu_mutex_lock(&rs->bitmap_mutex);
1129     WITH_RCU_READ_LOCK_GUARD() {
1130         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1131             ramblock_sync_dirty_bitmap(rs, block);
1132         }
1133         ram_counters.remaining = ram_bytes_remaining();
1134     }
1135     qemu_mutex_unlock(&rs->bitmap_mutex);
1136
1137     memory_global_after_dirty_log_sync();
1138     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1139
1140     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1141
1142     /* more than 1 second = 1000 millisecons */
1143     if (end_time > rs->time_last_bitmap_sync + 1000) {
1144         migration_trigger_throttle(rs);
1145
1146         migration_update_rates(rs, end_time);
1147
1148         rs->target_page_count_prev = rs->target_page_count;
1149
1150         /* reset period counters */
1151         rs->time_last_bitmap_sync = end_time;
1152         rs->num_dirty_pages_period = 0;
1153         rs->bytes_xfer_prev = ram_counters.transferred;
1154     }
1155     if (migrate_use_events()) {
1156         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1157     }
1158 }
1159
1160 static void migration_bitmap_sync_precopy(RAMState *rs)
1161 {
1162     Error *local_err = NULL;
1163
1164     /*
1165      * The current notifier usage is just an optimization to migration, so we
1166      * don't stop the normal migration process in the error case.
1167      */
1168     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1169         error_report_err(local_err);
1170         local_err = NULL;
1171     }
1172
1173     migration_bitmap_sync(rs);
1174
1175     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1176         error_report_err(local_err);
1177     }
1178 }
1179
1180 static void ram_release_page(const char *rbname, uint64_t offset)
1181 {
1182     if (!migrate_release_ram() || !migration_in_postcopy()) {
1183         return;
1184     }
1185
1186     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1187 }
1188
1189 /**
1190  * save_zero_page_to_file: send the zero page to the file
1191  *
1192  * Returns the size of data written to the file, 0 means the page is not
1193  * a zero page
1194  *
1195  * @rs: current RAM state
1196  * @file: the file where the data is saved
1197  * @block: block that contains the page we want to send
1198  * @offset: offset inside the block for the page
1199  */
1200 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1201                                   RAMBlock *block, ram_addr_t offset)
1202 {
1203     uint8_t *p = block->host + offset;
1204     int len = 0;
1205
1206     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1207         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1208         qemu_put_byte(file, 0);
1209         len += 1;
1210         ram_release_page(block->idstr, offset);
1211     }
1212     return len;
1213 }
1214
1215 /**
1216  * save_zero_page: send the zero page to the stream
1217  *
1218  * Returns the number of pages written.
1219  *
1220  * @rs: current RAM state
1221  * @block: block that contains the page we want to send
1222  * @offset: offset inside the block for the page
1223  */
1224 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1225 {
1226     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1227
1228     if (len) {
1229         ram_counters.duplicate++;
1230         ram_transferred_add(len);
1231         return 1;
1232     }
1233     return -1;
1234 }
1235
1236 /*
1237  * @pages: the number of pages written by the control path,
1238  *        < 0 - error
1239  *        > 0 - number of pages written
1240  *
1241  * Return true if the pages has been saved, otherwise false is returned.
1242  */
1243 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1244                               int *pages)
1245 {
1246     uint64_t bytes_xmit = 0;
1247     int ret;
1248
1249     *pages = -1;
1250     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1251                                 &bytes_xmit);
1252     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1253         return false;
1254     }
1255
1256     if (bytes_xmit) {
1257         ram_transferred_add(bytes_xmit);
1258         *pages = 1;
1259     }
1260
1261     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1262         return true;
1263     }
1264
1265     if (bytes_xmit > 0) {
1266         ram_counters.normal++;
1267     } else if (bytes_xmit == 0) {
1268         ram_counters.duplicate++;
1269     }
1270
1271     return true;
1272 }
1273
1274 /*
1275  * directly send the page to the stream
1276  *
1277  * Returns the number of pages written.
1278  *
1279  * @rs: current RAM state
1280  * @block: block that contains the page we want to send
1281  * @offset: offset inside the block for the page
1282  * @buf: the page to be sent
1283  * @async: send to page asyncly
1284  */
1285 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1286                             uint8_t *buf, bool async)
1287 {
1288     ram_transferred_add(save_page_header(rs, rs->f, block,
1289                                          offset | RAM_SAVE_FLAG_PAGE));
1290     if (async) {
1291         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1292                               migrate_release_ram() &
1293                               migration_in_postcopy());
1294     } else {
1295         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1296     }
1297     ram_transferred_add(TARGET_PAGE_SIZE);
1298     ram_counters.normal++;
1299     return 1;
1300 }
1301
1302 /**
1303  * ram_save_page: send the given page to the stream
1304  *
1305  * Returns the number of pages written.
1306  *          < 0 - error
1307  *          >=0 - Number of pages written - this might legally be 0
1308  *                if xbzrle noticed the page was the same.
1309  *
1310  * @rs: current RAM state
1311  * @block: block that contains the page we want to send
1312  * @offset: offset inside the block for the page
1313  */
1314 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1315 {
1316     int pages = -1;
1317     uint8_t *p;
1318     bool send_async = true;
1319     RAMBlock *block = pss->block;
1320     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1321     ram_addr_t current_addr = block->offset + offset;
1322
1323     p = block->host + offset;
1324     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1325
1326     XBZRLE_cache_lock();
1327     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1328         pages = save_xbzrle_page(rs, &p, current_addr, block,
1329                                  offset);
1330         if (!rs->last_stage) {
1331             /* Can't send this cached data async, since the cache page
1332              * might get updated before it gets to the wire
1333              */
1334             send_async = false;
1335         }
1336     }
1337
1338     /* XBZRLE overflow or normal page */
1339     if (pages == -1) {
1340         pages = save_normal_page(rs, block, offset, p, send_async);
1341     }
1342
1343     XBZRLE_cache_unlock();
1344
1345     return pages;
1346 }
1347
1348 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1349                                  ram_addr_t offset)
1350 {
1351     if (multifd_queue_page(rs->f, block, offset) < 0) {
1352         return -1;
1353     }
1354     ram_counters.normal++;
1355
1356     return 1;
1357 }
1358
1359 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1360                                  ram_addr_t offset, uint8_t *source_buf)
1361 {
1362     RAMState *rs = ram_state;
1363     uint8_t *p = block->host + offset;
1364     int ret;
1365
1366     if (save_zero_page_to_file(rs, f, block, offset)) {
1367         return true;
1368     }
1369
1370     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1371
1372     /*
1373      * copy it to a internal buffer to avoid it being modified by VM
1374      * so that we can catch up the error during compression and
1375      * decompression
1376      */
1377     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1378     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1379     if (ret < 0) {
1380         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1381         error_report("compressed data failed!");
1382     }
1383     return false;
1384 }
1385
1386 static void
1387 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1388 {
1389     ram_transferred_add(bytes_xmit);
1390
1391     if (param->zero_page) {
1392         ram_counters.duplicate++;
1393         return;
1394     }
1395
1396     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1397     compression_counters.compressed_size += bytes_xmit - 8;
1398     compression_counters.pages++;
1399 }
1400
1401 static bool save_page_use_compression(RAMState *rs);
1402
1403 static void flush_compressed_data(RAMState *rs)
1404 {
1405     int idx, len, thread_count;
1406
1407     if (!save_page_use_compression(rs)) {
1408         return;
1409     }
1410     thread_count = migrate_compress_threads();
1411
1412     qemu_mutex_lock(&comp_done_lock);
1413     for (idx = 0; idx < thread_count; idx++) {
1414         while (!comp_param[idx].done) {
1415             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1416         }
1417     }
1418     qemu_mutex_unlock(&comp_done_lock);
1419
1420     for (idx = 0; idx < thread_count; idx++) {
1421         qemu_mutex_lock(&comp_param[idx].mutex);
1422         if (!comp_param[idx].quit) {
1423             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1424             /*
1425              * it's safe to fetch zero_page without holding comp_done_lock
1426              * as there is no further request submitted to the thread,
1427              * i.e, the thread should be waiting for a request at this point.
1428              */
1429             update_compress_thread_counts(&comp_param[idx], len);
1430         }
1431         qemu_mutex_unlock(&comp_param[idx].mutex);
1432     }
1433 }
1434
1435 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1436                                        ram_addr_t offset)
1437 {
1438     param->block = block;
1439     param->offset = offset;
1440 }
1441
1442 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1443                                            ram_addr_t offset)
1444 {
1445     int idx, thread_count, bytes_xmit = -1, pages = -1;
1446     bool wait = migrate_compress_wait_thread();
1447
1448     thread_count = migrate_compress_threads();
1449     qemu_mutex_lock(&comp_done_lock);
1450 retry:
1451     for (idx = 0; idx < thread_count; idx++) {
1452         if (comp_param[idx].done) {
1453             comp_param[idx].done = false;
1454             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1455             qemu_mutex_lock(&comp_param[idx].mutex);
1456             set_compress_params(&comp_param[idx], block, offset);
1457             qemu_cond_signal(&comp_param[idx].cond);
1458             qemu_mutex_unlock(&comp_param[idx].mutex);
1459             pages = 1;
1460             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1461             break;
1462         }
1463     }
1464
1465     /*
1466      * wait for the free thread if the user specifies 'compress-wait-thread',
1467      * otherwise we will post the page out in the main thread as normal page.
1468      */
1469     if (pages < 0 && wait) {
1470         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1471         goto retry;
1472     }
1473     qemu_mutex_unlock(&comp_done_lock);
1474
1475     return pages;
1476 }
1477
1478 /**
1479  * find_dirty_block: find the next dirty page and update any state
1480  * associated with the search process.
1481  *
1482  * Returns true if a page is found
1483  *
1484  * @rs: current RAM state
1485  * @pss: data about the state of the current dirty page scan
1486  * @again: set to false if the search has scanned the whole of RAM
1487  */
1488 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1489 {
1490     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1491     if (pss->complete_round && pss->block == rs->last_seen_block &&
1492         pss->page >= rs->last_page) {
1493         /*
1494          * We've been once around the RAM and haven't found anything.
1495          * Give up.
1496          */
1497         *again = false;
1498         return false;
1499     }
1500     if (!offset_in_ramblock(pss->block,
1501                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1502         /* Didn't find anything in this RAM Block */
1503         pss->page = 0;
1504         pss->block = QLIST_NEXT_RCU(pss->block, next);
1505         if (!pss->block) {
1506             /*
1507              * If memory migration starts over, we will meet a dirtied page
1508              * which may still exists in compression threads's ring, so we
1509              * should flush the compressed data to make sure the new page
1510              * is not overwritten by the old one in the destination.
1511              *
1512              * Also If xbzrle is on, stop using the data compression at this
1513              * point. In theory, xbzrle can do better than compression.
1514              */
1515             flush_compressed_data(rs);
1516
1517             /* Hit the end of the list */
1518             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1519             /* Flag that we've looped */
1520             pss->complete_round = true;
1521             /* After the first round, enable XBZRLE. */
1522             if (migrate_use_xbzrle()) {
1523                 rs->xbzrle_enabled = true;
1524             }
1525         }
1526         /* Didn't find anything this time, but try again on the new block */
1527         *again = true;
1528         return false;
1529     } else {
1530         /* Can go around again, but... */
1531         *again = true;
1532         /* We've found something so probably don't need to */
1533         return true;
1534     }
1535 }
1536
1537 /**
1538  * unqueue_page: gets a page of the queue
1539  *
1540  * Helper for 'get_queued_page' - gets a page off the queue
1541  *
1542  * Returns the block of the page (or NULL if none available)
1543  *
1544  * @rs: current RAM state
1545  * @offset: used to return the offset within the RAMBlock
1546  */
1547 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1548 {
1549     struct RAMSrcPageRequest *entry;
1550     RAMBlock *block = NULL;
1551     size_t page_size;
1552
1553     if (!postcopy_has_request(rs)) {
1554         return NULL;
1555     }
1556
1557     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1558
1559     /*
1560      * This should _never_ change even after we take the lock, because no one
1561      * should be taking anything off the request list other than us.
1562      */
1563     assert(postcopy_has_request(rs));
1564
1565     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1566     block = entry->rb;
1567     *offset = entry->offset;
1568     page_size = qemu_ram_pagesize(block);
1569     /* Each page request should only be multiple page size of the ramblock */
1570     assert((entry->len % page_size) == 0);
1571
1572     if (entry->len > page_size) {
1573         entry->len -= page_size;
1574         entry->offset += page_size;
1575     } else {
1576         memory_region_unref(block->mr);
1577         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1578         g_free(entry);
1579         migration_consume_urgent_request();
1580     }
1581
1582     trace_unqueue_page(block->idstr, *offset,
1583                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1584
1585     return block;
1586 }
1587
1588 #if defined(__linux__)
1589 /**
1590  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1591  *   is found, return RAM block pointer and page offset
1592  *
1593  * Returns pointer to the RAMBlock containing faulting page,
1594  *   NULL if no write faults are pending
1595  *
1596  * @rs: current RAM state
1597  * @offset: page offset from the beginning of the block
1598  */
1599 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1600 {
1601     struct uffd_msg uffd_msg;
1602     void *page_address;
1603     RAMBlock *block;
1604     int res;
1605
1606     if (!migrate_background_snapshot()) {
1607         return NULL;
1608     }
1609
1610     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1611     if (res <= 0) {
1612         return NULL;
1613     }
1614
1615     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1616     block = qemu_ram_block_from_host(page_address, false, offset);
1617     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1618     return block;
1619 }
1620
1621 /**
1622  * ram_save_release_protection: release UFFD write protection after
1623  *   a range of pages has been saved
1624  *
1625  * @rs: current RAM state
1626  * @pss: page-search-status structure
1627  * @start_page: index of the first page in the range relative to pss->block
1628  *
1629  * Returns 0 on success, negative value in case of an error
1630 */
1631 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1632         unsigned long start_page)
1633 {
1634     int res = 0;
1635
1636     /* Check if page is from UFFD-managed region. */
1637     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1638         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1639         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1640
1641         /* Flush async buffers before un-protect. */
1642         qemu_fflush(rs->f);
1643         /* Un-protect memory range. */
1644         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1645                 false, false);
1646     }
1647
1648     return res;
1649 }
1650
1651 /* ram_write_tracking_available: check if kernel supports required UFFD features
1652  *
1653  * Returns true if supports, false otherwise
1654  */
1655 bool ram_write_tracking_available(void)
1656 {
1657     uint64_t uffd_features;
1658     int res;
1659
1660     res = uffd_query_features(&uffd_features);
1661     return (res == 0 &&
1662             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1663 }
1664
1665 /* ram_write_tracking_compatible: check if guest configuration is
1666  *   compatible with 'write-tracking'
1667  *
1668  * Returns true if compatible, false otherwise
1669  */
1670 bool ram_write_tracking_compatible(void)
1671 {
1672     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1673     int uffd_fd;
1674     RAMBlock *block;
1675     bool ret = false;
1676
1677     /* Open UFFD file descriptor */
1678     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1679     if (uffd_fd < 0) {
1680         return false;
1681     }
1682
1683     RCU_READ_LOCK_GUARD();
1684
1685     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1686         uint64_t uffd_ioctls;
1687
1688         /* Nothing to do with read-only and MMIO-writable regions */
1689         if (block->mr->readonly || block->mr->rom_device) {
1690             continue;
1691         }
1692         /* Try to register block memory via UFFD-IO to track writes */
1693         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1694                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1695             goto out;
1696         }
1697         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1698             goto out;
1699         }
1700     }
1701     ret = true;
1702
1703 out:
1704     uffd_close_fd(uffd_fd);
1705     return ret;
1706 }
1707
1708 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1709                                        ram_addr_t size)
1710 {
1711     /*
1712      * We read one byte of each page; this will preallocate page tables if
1713      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1714      * where no page was populated yet. This might require adaption when
1715      * supporting other mappings, like shmem.
1716      */
1717     for (; offset < size; offset += block->page_size) {
1718         char tmp = *((char *)block->host + offset);
1719
1720         /* Don't optimize the read out */
1721         asm volatile("" : "+r" (tmp));
1722     }
1723 }
1724
1725 static inline int populate_read_section(MemoryRegionSection *section,
1726                                         void *opaque)
1727 {
1728     const hwaddr size = int128_get64(section->size);
1729     hwaddr offset = section->offset_within_region;
1730     RAMBlock *block = section->mr->ram_block;
1731
1732     populate_read_range(block, offset, size);
1733     return 0;
1734 }
1735
1736 /*
1737  * ram_block_populate_read: preallocate page tables and populate pages in the
1738  *   RAM block by reading a byte of each page.
1739  *
1740  * Since it's solely used for userfault_fd WP feature, here we just
1741  *   hardcode page size to qemu_real_host_page_size.
1742  *
1743  * @block: RAM block to populate
1744  */
1745 static void ram_block_populate_read(RAMBlock *rb)
1746 {
1747     /*
1748      * Skip populating all pages that fall into a discarded range as managed by
1749      * a RamDiscardManager responsible for the mapped memory region of the
1750      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1751      * must not get populated automatically. We don't have to track
1752      * modifications via userfaultfd WP reliably, because these pages will
1753      * not be part of the migration stream either way -- see
1754      * ramblock_dirty_bitmap_exclude_discarded_pages().
1755      *
1756      * Note: The result is only stable while migrating (precopy/postcopy).
1757      */
1758     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1759         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1760         MemoryRegionSection section = {
1761             .mr = rb->mr,
1762             .offset_within_region = 0,
1763             .size = rb->mr->size,
1764         };
1765
1766         ram_discard_manager_replay_populated(rdm, &section,
1767                                              populate_read_section, NULL);
1768     } else {
1769         populate_read_range(rb, 0, rb->used_length);
1770     }
1771 }
1772
1773 /*
1774  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1775  */
1776 void ram_write_tracking_prepare(void)
1777 {
1778     RAMBlock *block;
1779
1780     RCU_READ_LOCK_GUARD();
1781
1782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1783         /* Nothing to do with read-only and MMIO-writable regions */
1784         if (block->mr->readonly || block->mr->rom_device) {
1785             continue;
1786         }
1787
1788         /*
1789          * Populate pages of the RAM block before enabling userfault_fd
1790          * write protection.
1791          *
1792          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1793          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1794          * pages with pte_none() entries in page table.
1795          */
1796         ram_block_populate_read(block);
1797     }
1798 }
1799
1800 /*
1801  * ram_write_tracking_start: start UFFD-WP memory tracking
1802  *
1803  * Returns 0 for success or negative value in case of error
1804  */
1805 int ram_write_tracking_start(void)
1806 {
1807     int uffd_fd;
1808     RAMState *rs = ram_state;
1809     RAMBlock *block;
1810
1811     /* Open UFFD file descriptor */
1812     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1813     if (uffd_fd < 0) {
1814         return uffd_fd;
1815     }
1816     rs->uffdio_fd = uffd_fd;
1817
1818     RCU_READ_LOCK_GUARD();
1819
1820     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1821         /* Nothing to do with read-only and MMIO-writable regions */
1822         if (block->mr->readonly || block->mr->rom_device) {
1823             continue;
1824         }
1825
1826         /* Register block memory with UFFD to track writes */
1827         if (uffd_register_memory(rs->uffdio_fd, block->host,
1828                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1829             goto fail;
1830         }
1831         /* Apply UFFD write protection to the block memory range */
1832         if (uffd_change_protection(rs->uffdio_fd, block->host,
1833                 block->max_length, true, false)) {
1834             goto fail;
1835         }
1836         block->flags |= RAM_UF_WRITEPROTECT;
1837         memory_region_ref(block->mr);
1838
1839         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1840                 block->host, block->max_length);
1841     }
1842
1843     return 0;
1844
1845 fail:
1846     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1847
1848     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1850             continue;
1851         }
1852         /*
1853          * In case some memory block failed to be write-protected
1854          * remove protection and unregister all succeeded RAM blocks
1855          */
1856         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1857                 false, false);
1858         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1859         /* Cleanup flags and remove reference */
1860         block->flags &= ~RAM_UF_WRITEPROTECT;
1861         memory_region_unref(block->mr);
1862     }
1863
1864     uffd_close_fd(uffd_fd);
1865     rs->uffdio_fd = -1;
1866     return -1;
1867 }
1868
1869 /**
1870  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1871  */
1872 void ram_write_tracking_stop(void)
1873 {
1874     RAMState *rs = ram_state;
1875     RAMBlock *block;
1876
1877     RCU_READ_LOCK_GUARD();
1878
1879     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1880         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1881             continue;
1882         }
1883         /* Remove protection and unregister all affected RAM blocks */
1884         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1885                 false, false);
1886         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1887
1888         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1889                 block->host, block->max_length);
1890
1891         /* Cleanup flags and remove reference */
1892         block->flags &= ~RAM_UF_WRITEPROTECT;
1893         memory_region_unref(block->mr);
1894     }
1895
1896     /* Finally close UFFD file descriptor */
1897     uffd_close_fd(rs->uffdio_fd);
1898     rs->uffdio_fd = -1;
1899 }
1900
1901 #else
1902 /* No target OS support, stubs just fail or ignore */
1903
1904 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1905 {
1906     (void) rs;
1907     (void) offset;
1908
1909     return NULL;
1910 }
1911
1912 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1913         unsigned long start_page)
1914 {
1915     (void) rs;
1916     (void) pss;
1917     (void) start_page;
1918
1919     return 0;
1920 }
1921
/*
 * ram_write_tracking_available: stub, write tracking is never available
 * in this build configuration.
 */
bool ram_write_tracking_available(void)
{
    return false;
}
1926
/*
 * ram_write_tracking_compatible: stub; asserts when called, as
 * ram_write_tracking_available() reports false in this configuration.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);

    /* Only reached when assertions are compiled out */
    return false;
}
1932
/*
 * ram_write_tracking_start: stub; asserts when called, as
 * ram_write_tracking_available() reports false in this configuration.
 */
int ram_write_tracking_start(void)
{
    assert(0);

    /* Only reached when assertions are compiled out */
    return -1;
}
1938
/*
 * ram_write_tracking_stop: stub; asserts when called, as
 * ram_write_tracking_available() reports false in this configuration.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1943 #endif /* defined(__linux__) */
1944
1945 /**
1946  * get_queued_page: unqueue a page from the postcopy requests
1947  *
1948  * Skips pages that are already sent (!dirty)
1949  *
1950  * Returns true if a queued page is found
1951  *
1952  * @rs: current RAM state
1953  * @pss: data about the state of the current dirty page scan
1954  */
1955 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1956 {
1957     RAMBlock  *block;
1958     ram_addr_t offset;
1959
1960     block = unqueue_page(rs, &offset);
1961
1962     if (!block) {
1963         /*
1964          * Poll write faults too if background snapshot is enabled; that's
1965          * when we have vcpus got blocked by the write protected pages.
1966          */
1967         block = poll_fault_page(rs, &offset);
1968     }
1969
1970     if (block) {
1971         /*
1972          * We want the background search to continue from the queued page
1973          * since the guest is likely to want other pages near to the page
1974          * it just requested.
1975          */
1976         pss->block = block;
1977         pss->page = offset >> TARGET_PAGE_BITS;
1978
1979         /*
1980          * This unqueued page would break the "one round" check, even is
1981          * really rare.
1982          */
1983         pss->complete_round = false;
1984     }
1985
1986     return !!block;
1987 }
1988
1989 /**
1990  * migration_page_queue_free: drop any remaining pages in the ram
1991  * request queue
1992  *
1993  * It should be empty at the end anyway, but in error cases there may
1994  * be some left.  in case that there is any page left, we drop it.
1995  *
1996  */
1997 static void migration_page_queue_free(RAMState *rs)
1998 {
1999     struct RAMSrcPageRequest *mspr, *next_mspr;
2000     /* This queue generally should be empty - but in the case of a failed
2001      * migration might have some droppings in.
2002      */
2003     RCU_READ_LOCK_GUARD();
2004     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2005         memory_region_unref(mspr->rb->mr);
2006         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2007         g_free(mspr);
2008     }
2009 }
2010
2011 /**
2012  * ram_save_queue_pages: queue the page for transmission
2013  *
2014  * A request from postcopy destination for example.
2015  *
2016  * Returns zero on success or negative on error
2017  *
2018  * @rbname: Name of the RAMBLock of the request. NULL means the
2019  *          same that last one.
2020  * @start: starting address from the start of the RAMBlock
2021  * @len: length (in bytes) to send
2022  */
2023 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2024 {
2025     RAMBlock *ramblock;
2026     RAMState *rs = ram_state;
2027
2028     ram_counters.postcopy_requests++;
2029     RCU_READ_LOCK_GUARD();
2030
2031     if (!rbname) {
2032         /* Reuse last RAMBlock */
2033         ramblock = rs->last_req_rb;
2034
2035         if (!ramblock) {
2036             /*
2037              * Shouldn't happen, we can't reuse the last RAMBlock if
2038              * it's the 1st request.
2039              */
2040             error_report("ram_save_queue_pages no previous block");
2041             return -1;
2042         }
2043     } else {
2044         ramblock = qemu_ram_block_by_name(rbname);
2045
2046         if (!ramblock) {
2047             /* We shouldn't be asked for a non-existent RAMBlock */
2048             error_report("ram_save_queue_pages no block '%s'", rbname);
2049             return -1;
2050         }
2051         rs->last_req_rb = ramblock;
2052     }
2053     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2054     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2055         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2056                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2057                      __func__, start, len, ramblock->used_length);
2058         return -1;
2059     }
2060
2061     struct RAMSrcPageRequest *new_entry =
2062         g_malloc0(sizeof(struct RAMSrcPageRequest));
2063     new_entry->rb = ramblock;
2064     new_entry->offset = start;
2065     new_entry->len = len;
2066
2067     memory_region_ref(ramblock->mr);
2068     qemu_mutex_lock(&rs->src_page_req_mutex);
2069     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2070     migration_make_urgent_request();
2071     qemu_mutex_unlock(&rs->src_page_req_mutex);
2072
2073     return 0;
2074 }
2075
2076 static bool save_page_use_compression(RAMState *rs)
2077 {
2078     if (!migrate_use_compression()) {
2079         return false;
2080     }
2081
2082     /*
2083      * If xbzrle is enabled (e.g., after first round of migration), stop
2084      * using the data compression. In theory, xbzrle can do better than
2085      * compression.
2086      */
2087     if (rs->xbzrle_enabled) {
2088         return false;
2089     }
2090
2091     return true;
2092 }
2093
2094 /*
2095  * try to compress the page before posting it out, return true if the page
2096  * has been properly handled by compression, otherwise needs other
2097  * paths to handle it
2098  */
2099 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2100 {
2101     if (!save_page_use_compression(rs)) {
2102         return false;
2103     }
2104
2105     /*
2106      * When starting the process of a new block, the first page of
2107      * the block should be sent out before other pages in the same
2108      * block, and all the pages in last block should have been sent
2109      * out, keeping this order is important, because the 'cont' flag
2110      * is used to avoid resending the block name.
2111      *
2112      * We post the fist page as normal page as compression will take
2113      * much CPU resource.
2114      */
2115     if (block != rs->last_sent_block) {
2116         flush_compressed_data(rs);
2117         return false;
2118     }
2119
2120     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2121         return true;
2122     }
2123
2124     compression_counters.busy++;
2125     return false;
2126 }
2127
2128 /**
2129  * ram_save_target_page: save one target page
2130  *
2131  * Returns the number of pages written
2132  *
2133  * @rs: current RAM state
2134  * @pss: data about the page we want to send
2135  */
2136 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2137 {
2138     RAMBlock *block = pss->block;
2139     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2140     int res;
2141
2142     if (control_save_page(rs, block, offset, &res)) {
2143         return res;
2144     }
2145
2146     if (save_compress_page(rs, block, offset)) {
2147         return 1;
2148     }
2149
2150     res = save_zero_page(rs, block, offset);
2151     if (res > 0) {
2152         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2153          * page would be stale
2154          */
2155         if (!save_page_use_compression(rs)) {
2156             XBZRLE_cache_lock();
2157             xbzrle_cache_zero_page(rs, block->offset + offset);
2158             XBZRLE_cache_unlock();
2159         }
2160         return res;
2161     }
2162
2163     /*
2164      * Do not use multifd for:
2165      * 1. Compression as the first page in the new block should be posted out
2166      *    before sending the compressed page
2167      * 2. In postcopy as one whole host page should be placed
2168      */
2169     if (!save_page_use_compression(rs) && migrate_use_multifd()
2170         && !migration_in_postcopy()) {
2171         return ram_save_multifd_page(rs, block, offset);
2172     }
2173
2174     return ram_save_page(rs, pss);
2175 }
2176
2177 /**
2178  * ram_save_host_page: save a whole host page
2179  *
2180  * Starting at *offset send pages up to the end of the current host
2181  * page. It's valid for the initial offset to point into the middle of
2182  * a host page in which case the remainder of the hostpage is sent.
2183  * Only dirty target pages are sent. Note that the host page size may
2184  * be a huge page for this block.
2185  * The saving stops at the boundary of the used_length of the block
2186  * if the RAMBlock isn't a multiple of the host page size.
2187  *
2188  * Returns the number of pages written or negative on error
2189  *
2190  * @rs: current RAM state
2191  * @pss: data about the page we want to send
2192  */
2193 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2194 {
2195     int tmppages, pages = 0;
2196     size_t pagesize_bits =
2197         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2198     unsigned long hostpage_boundary =
2199         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2200     unsigned long start_page = pss->page;
2201     int res;
2202
2203     if (ramblock_is_ignored(pss->block)) {
2204         error_report("block %s should not be migrated !", pss->block->idstr);
2205         return 0;
2206     }
2207
2208     do {
2209         /* Check the pages is dirty and if it is send it */
2210         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2211             tmppages = ram_save_target_page(rs, pss);
2212             if (tmppages < 0) {
2213                 return tmppages;
2214             }
2215
2216             pages += tmppages;
2217             /*
2218              * Allow rate limiting to happen in the middle of huge pages if
2219              * something is sent in the current iteration.
2220              */
2221             if (pagesize_bits > 1 && tmppages > 0) {
2222                 migration_rate_limit();
2223             }
2224         }
2225         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2226     } while ((pss->page < hostpage_boundary) &&
2227              offset_in_ramblock(pss->block,
2228                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2229     /* The offset we leave with is the min boundary of host page and block */
2230     pss->page = MIN(pss->page, hostpage_boundary);
2231
2232     res = ram_save_release_protection(rs, pss, start_page);
2233     return (res < 0 ? res : pages);
2234 }
2235
2236 /**
2237  * ram_find_and_save_block: finds a dirty page and sends it to f
2238  *
2239  * Called within an RCU critical section.
2240  *
2241  * Returns the number of pages written where zero means no dirty pages,
2242  * or negative on error
2243  *
2244  * @rs: current RAM state
2245  *
2246  * On systems where host-page-size > target-page-size it will send all the
2247  * pages in a host page that are dirty.
2248  */
2249 static int ram_find_and_save_block(RAMState *rs)
2250 {
2251     PageSearchStatus pss;
2252     int pages = 0;
2253     bool again, found;
2254
2255     /* No dirty page as there is zero RAM */
2256     if (!ram_bytes_total()) {
2257         return pages;
2258     }
2259
2260     pss.block = rs->last_seen_block;
2261     pss.page = rs->last_page;
2262     pss.complete_round = false;
2263
2264     if (!pss.block) {
2265         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2266     }
2267
2268     do {
2269         again = true;
2270         found = get_queued_page(rs, &pss);
2271
2272         if (!found) {
2273             /* priority queue empty, so just search for something dirty */
2274             found = find_dirty_block(rs, &pss, &again);
2275         }
2276
2277         if (found) {
2278             pages = ram_save_host_page(rs, &pss);
2279         }
2280     } while (!pages && again);
2281
2282     rs->last_seen_block = pss.block;
2283     rs->last_page = pss.page;
2284
2285     return pages;
2286 }
2287
2288 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2289 {
2290     uint64_t pages = size / TARGET_PAGE_SIZE;
2291
2292     if (zero) {
2293         ram_counters.duplicate += pages;
2294     } else {
2295         ram_counters.normal += pages;
2296         ram_transferred_add(size);
2297         qemu_update_position(f, size);
2298     }
2299 }
2300
2301 static uint64_t ram_bytes_total_common(bool count_ignored)
2302 {
2303     RAMBlock *block;
2304     uint64_t total = 0;
2305
2306     RCU_READ_LOCK_GUARD();
2307
2308     if (count_ignored) {
2309         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2310             total += block->used_length;
2311         }
2312     } else {
2313         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2314             total += block->used_length;
2315         }
2316     }
2317     return total;
2318 }
2319
/*
 * ram_bytes_total: total used length, in bytes, of all RAM blocks that are
 * not ignored.
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2324
2325 static void xbzrle_load_setup(void)
2326 {
2327     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2328 }
2329
2330 static void xbzrle_load_cleanup(void)
2331 {
2332     g_free(XBZRLE.decoded_buf);
2333     XBZRLE.decoded_buf = NULL;
2334 }
2335
2336 static void ram_state_cleanup(RAMState **rsp)
2337 {
2338     if (*rsp) {
2339         migration_page_queue_free(*rsp);
2340         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2341         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2342         g_free(*rsp);
2343         *rsp = NULL;
2344     }
2345 }
2346
2347 static void xbzrle_cleanup(void)
2348 {
2349     XBZRLE_cache_lock();
2350     if (XBZRLE.cache) {
2351         cache_fini(XBZRLE.cache);
2352         g_free(XBZRLE.encoded_buf);
2353         g_free(XBZRLE.current_buf);
2354         g_free(XBZRLE.zero_target_page);
2355         XBZRLE.cache = NULL;
2356         XBZRLE.encoded_buf = NULL;
2357         XBZRLE.current_buf = NULL;
2358         XBZRLE.zero_target_page = NULL;
2359     }
2360     XBZRLE_cache_unlock();
2361 }
2362
2363 static void ram_save_cleanup(void *opaque)
2364 {
2365     RAMState **rsp = opaque;
2366     RAMBlock *block;
2367
2368     /* We don't use dirty log with background snapshots */
2369     if (!migrate_background_snapshot()) {
2370         /* caller have hold iothread lock or is in a bh, so there is
2371          * no writing race against the migration bitmap
2372          */
2373         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2374             /*
2375              * do not stop dirty log without starting it, since
2376              * memory_global_dirty_log_stop will assert that
2377              * memory_global_dirty_log_start/stop used in pairs
2378              */
2379             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2380         }
2381     }
2382
2383     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2384         g_free(block->clear_bmap);
2385         block->clear_bmap = NULL;
2386         g_free(block->bmap);
2387         block->bmap = NULL;
2388     }
2389
2390     xbzrle_cleanup();
2391     compress_threads_save_cleanup();
2392     ram_state_cleanup(rsp);
2393 }
2394
2395 static void ram_state_reset(RAMState *rs)
2396 {
2397     rs->last_seen_block = NULL;
2398     rs->last_sent_block = NULL;
2399     rs->last_page = 0;
2400     rs->last_version = ram_list.version;
2401     rs->xbzrle_enabled = false;
2402 }
2403
2404 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2405
2406 /* **** functions for postcopy ***** */
2407
2408 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2409 {
2410     struct RAMBlock *block;
2411
2412     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2413         unsigned long *bitmap = block->bmap;
2414         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2415         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2416
2417         while (run_start < range) {
2418             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2419             ram_discard_range(block->idstr,
2420                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2421                               ((ram_addr_t)(run_end - run_start))
2422                                 << TARGET_PAGE_BITS);
2423             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2424         }
2425     }
2426 }
2427
2428 /**
2429  * postcopy_send_discard_bm_ram: discard a RAMBlock
2430  *
2431  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2432  *
2433  * @ms: current migration state
2434  * @block: RAMBlock to discard
2435  */
2436 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2437 {
2438     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2439     unsigned long current;
2440     unsigned long *bitmap = block->bmap;
2441
2442     for (current = 0; current < end; ) {
2443         unsigned long one = find_next_bit(bitmap, end, current);
2444         unsigned long zero, discard_length;
2445
2446         if (one >= end) {
2447             break;
2448         }
2449
2450         zero = find_next_zero_bit(bitmap, end, one + 1);
2451
2452         if (zero >= end) {
2453             discard_length = end - one;
2454         } else {
2455             discard_length = zero - one;
2456         }
2457         postcopy_discard_send_range(ms, one, discard_length);
2458         current = one + discard_length;
2459     }
2460 }
2461
2462 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2463
2464 /**
2465  * postcopy_each_ram_send_discard: discard all RAMBlocks
2466  *
2467  * Utility for the outgoing postcopy code.
2468  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2469  *   passing it bitmap indexes and name.
2470  * (qemu_ram_foreach_block ends up passing unscaled lengths
2471  *  which would mean postcopy code would have to deal with target page)
2472  *
2473  * @ms: current migration state
2474  */
2475 static void postcopy_each_ram_send_discard(MigrationState *ms)
2476 {
2477     struct RAMBlock *block;
2478
2479     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2480         postcopy_discard_send_init(ms, block->idstr);
2481
2482         /*
2483          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2484          * host-page size chunks, mark any partially dirty host-page size
2485          * chunks as all dirty.  In this case the host-page is the host-page
2486          * for the particular RAMBlock, i.e. it might be a huge page.
2487          */
2488         postcopy_chunk_hostpages_pass(ms, block);
2489
2490         /*
2491          * Postcopy sends chunks of bitmap over the wire, but it
2492          * just needs indexes at this point, avoids it having
2493          * target page specific code.
2494          */
2495         postcopy_send_discard_bm_ram(ms, block);
2496         postcopy_discard_send_finish(ms);
2497     }
2498 }
2499
2500 /**
2501  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2502  *
2503  * Helper for postcopy_chunk_hostpages; it's called twice to
2504  * canonicalize the two bitmaps, that are similar, but one is
2505  * inverted.
2506  *
2507  * Postcopy requires that all target pages in a hostpage are dirty or
2508  * clean, not a mix.  This function canonicalizes the bitmaps.
2509  *
2510  * @ms: current migration state
2511  * @block: block that contains the page we want to canonicalize
2512  */
2513 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2514 {
2515     RAMState *rs = ram_state;
2516     unsigned long *bitmap = block->bmap;
2517     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2518     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2519     unsigned long run_start;
2520
2521     if (block->page_size == TARGET_PAGE_SIZE) {
2522         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2523         return;
2524     }
2525
2526     /* Find a dirty page */
2527     run_start = find_next_bit(bitmap, pages, 0);
2528
2529     while (run_start < pages) {
2530
2531         /*
2532          * If the start of this run of pages is in the middle of a host
2533          * page, then we need to fixup this host page.
2534          */
2535         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2536             /* Find the end of this run */
2537             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2538             /*
2539              * If the end isn't at the start of a host page, then the
2540              * run doesn't finish at the end of a host page
2541              * and we need to discard.
2542              */
2543         }
2544
2545         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2546             unsigned long page;
2547             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2548                                                              host_ratio);
2549             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2550
2551             /* Clean up the bitmap */
2552             for (page = fixup_start_addr;
2553                  page < fixup_start_addr + host_ratio; page++) {
2554                 /*
2555                  * Remark them as dirty, updating the count for any pages
2556                  * that weren't previously dirty.
2557                  */
2558                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2559             }
2560         }
2561
2562         /* Find the next dirty page for the next iteration */
2563         run_start = find_next_bit(bitmap, pages, run_start);
2564     }
2565 }
2566
2567 /**
2568  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2569  *
2570  * Transmit the set of pages to be discarded after precopy to the target
2571  * these are pages that:
2572  *     a) Have been previously transmitted but are now dirty again
2573  *     b) Pages that have never been transmitted, this ensures that
2574  *        any pages on the destination that have been mapped by background
2575  *        tasks get discarded (transparent huge pages is the specific concern)
2576  * Hopefully this is pretty sparse
2577  *
2578  * @ms: current migration state
2579  */
2580 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2581 {
2582     RAMState *rs = ram_state;
2583
2584     RCU_READ_LOCK_GUARD();
2585
2586     /* This should be our last sync, the src is now paused */
2587     migration_bitmap_sync(rs);
2588
2589     /* Easiest way to make sure we don't resume in the middle of a host-page */
2590     rs->last_seen_block = NULL;
2591     rs->last_sent_block = NULL;
2592     rs->last_page = 0;
2593
2594     postcopy_each_ram_send_discard(ms);
2595
2596     trace_ram_postcopy_send_discard_bitmap();
2597 }
2598
2599 /**
2600  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2601  *
2602  * Returns zero on success
2603  *
2604  * @rbname: name of the RAMBlock of the request. NULL means the
2605  *          same that last one.
2606  * @start: RAMBlock starting page
2607  * @length: RAMBlock size
2608  */
2609 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2610 {
2611     trace_ram_discard_range(rbname, start, length);
2612
2613     RCU_READ_LOCK_GUARD();
2614     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2615
2616     if (!rb) {
2617         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2618         return -1;
2619     }
2620
2621     /*
2622      * On source VM, we don't need to update the received bitmap since
2623      * we don't even have one.
2624      */
2625     if (rb->receivedmap) {
2626         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2627                      length >> qemu_target_page_bits());
2628     }
2629
2630     return ram_block_discard_range(rb, start, length);
2631 }
2632
2633 /*
2634  * For every allocation, we will try not to crash the VM if the
2635  * allocation failed.
2636  */
2637 static int xbzrle_init(void)
2638 {
2639     Error *local_err = NULL;
2640
2641     if (!migrate_use_xbzrle()) {
2642         return 0;
2643     }
2644
2645     XBZRLE_cache_lock();
2646
2647     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2648     if (!XBZRLE.zero_target_page) {
2649         error_report("%s: Error allocating zero page", __func__);
2650         goto err_out;
2651     }
2652
2653     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2654                               TARGET_PAGE_SIZE, &local_err);
2655     if (!XBZRLE.cache) {
2656         error_report_err(local_err);
2657         goto free_zero_page;
2658     }
2659
2660     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2661     if (!XBZRLE.encoded_buf) {
2662         error_report("%s: Error allocating encoded_buf", __func__);
2663         goto free_cache;
2664     }
2665
2666     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2667     if (!XBZRLE.current_buf) {
2668         error_report("%s: Error allocating current_buf", __func__);
2669         goto free_encoded_buf;
2670     }
2671
2672     /* We are all good */
2673     XBZRLE_cache_unlock();
2674     return 0;
2675
2676 free_encoded_buf:
2677     g_free(XBZRLE.encoded_buf);
2678     XBZRLE.encoded_buf = NULL;
2679 free_cache:
2680     cache_fini(XBZRLE.cache);
2681     XBZRLE.cache = NULL;
2682 free_zero_page:
2683     g_free(XBZRLE.zero_target_page);
2684     XBZRLE.zero_target_page = NULL;
2685 err_out:
2686     XBZRLE_cache_unlock();
2687     return -ENOMEM;
2688 }
2689
2690 static int ram_state_init(RAMState **rsp)
2691 {
2692     *rsp = g_try_new0(RAMState, 1);
2693
2694     if (!*rsp) {
2695         error_report("%s: Init ramstate fail", __func__);
2696         return -1;
2697     }
2698
2699     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2700     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2701     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2702
2703     /*
2704      * Count the total number of pages used by ram blocks not including any
2705      * gaps due to alignment or unplugs.
2706      * This must match with the initial values of dirty bitmap.
2707      */
2708     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2709     ram_state_reset(*rsp);
2710
2711     return 0;
2712 }
2713
2714 static void ram_list_init_bitmaps(void)
2715 {
2716     MigrationState *ms = migrate_get_current();
2717     RAMBlock *block;
2718     unsigned long pages;
2719     uint8_t shift;
2720
2721     /* Skip setting bitmap if there is no RAM */
2722     if (ram_bytes_total()) {
2723         shift = ms->clear_bitmap_shift;
2724         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2725             error_report("clear_bitmap_shift (%u) too big, using "
2726                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2727             shift = CLEAR_BITMAP_SHIFT_MAX;
2728         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2729             error_report("clear_bitmap_shift (%u) too small, using "
2730                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2731             shift = CLEAR_BITMAP_SHIFT_MIN;
2732         }
2733
2734         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2735             pages = block->max_length >> TARGET_PAGE_BITS;
2736             /*
2737              * The initial dirty bitmap for migration must be set with all
2738              * ones to make sure we'll migrate every guest RAM page to
2739              * destination.
2740              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2741              * new migration after a failed migration, ram_list.
2742              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2743              * guest memory.
2744              */
2745             block->bmap = bitmap_new(pages);
2746             bitmap_set(block->bmap, 0, pages);
2747             block->clear_bmap_shift = shift;
2748             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2749         }
2750     }
2751 }
2752
2753 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2754 {
2755     unsigned long pages;
2756     RAMBlock *rb;
2757
2758     RCU_READ_LOCK_GUARD();
2759
2760     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2761             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2762             rs->migration_dirty_pages -= pages;
2763     }
2764 }
2765
2766 static void ram_init_bitmaps(RAMState *rs)
2767 {
2768     /* For memory_global_dirty_log_start below.  */
2769     qemu_mutex_lock_iothread();
2770     qemu_mutex_lock_ramlist();
2771
2772     WITH_RCU_READ_LOCK_GUARD() {
2773         ram_list_init_bitmaps();
2774         /* We don't use dirty log with background snapshots */
2775         if (!migrate_background_snapshot()) {
2776             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2777             migration_bitmap_sync_precopy(rs);
2778         }
2779     }
2780     qemu_mutex_unlock_ramlist();
2781     qemu_mutex_unlock_iothread();
2782
2783     /*
2784      * After an eventual first bitmap sync, fixup the initial bitmap
2785      * containing all 1s to exclude any discarded pages from migration.
2786      */
2787     migration_bitmap_clear_discarded_pages(rs);
2788 }
2789
2790 static int ram_init_all(RAMState **rsp)
2791 {
2792     if (ram_state_init(rsp)) {
2793         return -1;
2794     }
2795
2796     if (xbzrle_init()) {
2797         ram_state_cleanup(rsp);
2798         return -1;
2799     }
2800
2801     ram_init_bitmaps(*rsp);
2802
2803     return 0;
2804 }
2805
2806 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2807 {
2808     RAMBlock *block;
2809     uint64_t pages = 0;
2810
2811     /*
2812      * Postcopy is not using xbzrle/compression, so no need for that.
2813      * Also, since source are already halted, we don't need to care
2814      * about dirty page logging as well.
2815      */
2816
2817     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2818         pages += bitmap_count_one(block->bmap,
2819                                   block->used_length >> TARGET_PAGE_BITS);
2820     }
2821
2822     /* This may not be aligned with current bitmaps. Recalculate. */
2823     rs->migration_dirty_pages = pages;
2824
2825     ram_state_reset(rs);
2826
2827     /* Update RAMState cache of output QEMUFile */
2828     rs->f = out;
2829
2830     trace_ram_state_resume_prepare(pages);
2831 }
2832
2833 /*
2834  * This function clears bits of the free pages reported by the caller from the
2835  * migration dirty bitmap. @addr is the host address corresponding to the
2836  * start of the continuous guest free pages, and @len is the total bytes of
2837  * those pages.
2838  */
2839 void qemu_guest_free_page_hint(void *addr, size_t len)
2840 {
2841     RAMBlock *block;
2842     ram_addr_t offset;
2843     size_t used_len, start, npages;
2844     MigrationState *s = migrate_get_current();
2845
2846     /* This function is currently expected to be used during live migration */
2847     if (!migration_is_setup_or_active(s->state)) {
2848         return;
2849     }
2850
2851     for (; len > 0; len -= used_len, addr += used_len) {
2852         block = qemu_ram_block_from_host(addr, false, &offset);
2853         if (unlikely(!block || offset >= block->used_length)) {
2854             /*
2855              * The implementation might not support RAMBlock resize during
2856              * live migration, but it could happen in theory with future
2857              * updates. So we add a check here to capture that case.
2858              */
2859             error_report_once("%s unexpected error", __func__);
2860             return;
2861         }
2862
2863         if (len <= block->used_length - offset) {
2864             used_len = len;
2865         } else {
2866             used_len = block->used_length - offset;
2867         }
2868
2869         start = offset >> TARGET_PAGE_BITS;
2870         npages = used_len >> TARGET_PAGE_BITS;
2871
2872         qemu_mutex_lock(&ram_state->bitmap_mutex);
2873         /*
2874          * The skipped free pages are equavalent to be sent from clear_bmap's
2875          * perspective, so clear the bits from the memory region bitmap which
2876          * are initially set. Otherwise those skipped pages will be sent in
2877          * the next round after syncing from the memory region bitmap.
2878          */
2879         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2880         ram_state->migration_dirty_pages -=
2881                       bitmap_count_one_with_offset(block->bmap, start, npages);
2882         bitmap_clear(block->bmap, start, npages);
2883         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2884     }
2885 }
2886
2887 /*
2888  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2889  * long-running RCU critical section.  When rcu-reclaims in the code
2890  * start to become numerous it will be necessary to reduce the
2891  * granularity of these critical sections.
2892  */
2893
2894 /**
2895  * ram_save_setup: Setup RAM for migration
2896  *
2897  * Returns zero to indicate success and negative for error
2898  *
2899  * @f: QEMUFile where to send the data
2900  * @opaque: RAMState pointer
2901  */
2902 static int ram_save_setup(QEMUFile *f, void *opaque)
2903 {
2904     RAMState **rsp = opaque;
2905     RAMBlock *block;
2906
2907     if (compress_threads_save_setup()) {
2908         return -1;
2909     }
2910
2911     /* migration has already setup the bitmap, reuse it. */
2912     if (!migration_in_colo_state()) {
2913         if (ram_init_all(rsp) != 0) {
2914             compress_threads_save_cleanup();
2915             return -1;
2916         }
2917     }
2918     (*rsp)->f = f;
2919
2920     WITH_RCU_READ_LOCK_GUARD() {
2921         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2922
2923         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2924             qemu_put_byte(f, strlen(block->idstr));
2925             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2926             qemu_put_be64(f, block->used_length);
2927             if (migrate_postcopy_ram() && block->page_size !=
2928                                           qemu_host_page_size) {
2929                 qemu_put_be64(f, block->page_size);
2930             }
2931             if (migrate_ignore_shared()) {
2932                 qemu_put_be64(f, block->mr->addr);
2933             }
2934         }
2935     }
2936
2937     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2938     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2939
2940     multifd_send_sync_main(f);
2941     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2942     qemu_fflush(f);
2943
2944     return 0;
2945 }
2946
2947 /**
2948  * ram_save_iterate: iterative stage for migration
2949  *
2950  * Returns zero to indicate success and negative for error
2951  *
2952  * @f: QEMUFile where to send the data
2953  * @opaque: RAMState pointer
2954  */
2955 static int ram_save_iterate(QEMUFile *f, void *opaque)
2956 {
2957     RAMState **temp = opaque;
2958     RAMState *rs = *temp;
2959     int ret = 0;
2960     int i;
2961     int64_t t0;
2962     int done = 0;
2963
2964     if (blk_mig_bulk_active()) {
2965         /* Avoid transferring ram during bulk phase of block migration as
2966          * the bulk phase will usually take a long time and transferring
2967          * ram updates during that time is pointless. */
2968         goto out;
2969     }
2970
2971     /*
2972      * We'll take this lock a little bit long, but it's okay for two reasons.
2973      * Firstly, the only possible other thread to take it is who calls
2974      * qemu_guest_free_page_hint(), which should be rare; secondly, see
2975      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2976      * guarantees that we'll at least released it in a regular basis.
2977      */
2978     qemu_mutex_lock(&rs->bitmap_mutex);
2979     WITH_RCU_READ_LOCK_GUARD() {
2980         if (ram_list.version != rs->last_version) {
2981             ram_state_reset(rs);
2982         }
2983
2984         /* Read version before ram_list.blocks */
2985         smp_rmb();
2986
2987         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2988
2989         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2990         i = 0;
2991         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2992                postcopy_has_request(rs)) {
2993             int pages;
2994
2995             if (qemu_file_get_error(f)) {
2996                 break;
2997             }
2998
2999             pages = ram_find_and_save_block(rs);
3000             /* no more pages to sent */
3001             if (pages == 0) {
3002                 done = 1;
3003                 break;
3004             }
3005
3006             if (pages < 0) {
3007                 qemu_file_set_error(f, pages);
3008                 break;
3009             }
3010
3011             rs->target_page_count += pages;
3012
3013             /*
3014              * During postcopy, it is necessary to make sure one whole host
3015              * page is sent in one chunk.
3016              */
3017             if (migrate_postcopy_ram()) {
3018                 flush_compressed_data(rs);
3019             }
3020
3021             /*
3022              * we want to check in the 1st loop, just in case it was the 1st
3023              * time and we had to sync the dirty bitmap.
3024              * qemu_clock_get_ns() is a bit expensive, so we only check each
3025              * some iterations
3026              */
3027             if ((i & 63) == 0) {
3028                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3029                               1000000;
3030                 if (t1 > MAX_WAIT) {
3031                     trace_ram_save_iterate_big_wait(t1, i);
3032                     break;
3033                 }
3034             }
3035             i++;
3036         }
3037     }
3038     qemu_mutex_unlock(&rs->bitmap_mutex);
3039
3040     /*
3041      * Must occur before EOS (or any QEMUFile operation)
3042      * because of RDMA protocol.
3043      */
3044     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3045
3046 out:
3047     if (ret >= 0
3048         && migration_is_setup_or_active(migrate_get_current()->state)) {
3049         multifd_send_sync_main(rs->f);
3050         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3051         qemu_fflush(f);
3052         ram_transferred_add(8);
3053
3054         ret = qemu_file_get_error(f);
3055     }
3056     if (ret < 0) {
3057         return ret;
3058     }
3059
3060     return done;
3061 }
3062
3063 /**
3064  * ram_save_complete: function called to send the remaining amount of ram
3065  *
3066  * Returns zero to indicate success or negative on error
3067  *
3068  * Called with iothread lock
3069  *
3070  * @f: QEMUFile where to send the data
3071  * @opaque: RAMState pointer
3072  */
3073 static int ram_save_complete(QEMUFile *f, void *opaque)
3074 {
3075     RAMState **temp = opaque;
3076     RAMState *rs = *temp;
3077     int ret = 0;
3078
3079     rs->last_stage = !migration_in_colo_state();
3080
3081     WITH_RCU_READ_LOCK_GUARD() {
3082         if (!migration_in_postcopy()) {
3083             migration_bitmap_sync_precopy(rs);
3084         }
3085
3086         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3087
3088         /* try transferring iterative blocks of memory */
3089
3090         /* flush all remaining blocks regardless of rate limiting */
3091         while (true) {
3092             int pages;
3093
3094             pages = ram_find_and_save_block(rs);
3095             /* no more blocks to sent */
3096             if (pages == 0) {
3097                 break;
3098             }
3099             if (pages < 0) {
3100                 ret = pages;
3101                 break;
3102             }
3103         }
3104
3105         flush_compressed_data(rs);
3106         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3107     }
3108
3109     if (ret >= 0) {
3110         multifd_send_sync_main(rs->f);
3111         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3112         qemu_fflush(f);
3113     }
3114
3115     return ret;
3116 }
3117
3118 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3119                              uint64_t *res_precopy_only,
3120                              uint64_t *res_compatible,
3121                              uint64_t *res_postcopy_only)
3122 {
3123     RAMState **temp = opaque;
3124     RAMState *rs = *temp;
3125     uint64_t remaining_size;
3126
3127     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3128
3129     if (!migration_in_postcopy() &&
3130         remaining_size < max_size) {
3131         qemu_mutex_lock_iothread();
3132         WITH_RCU_READ_LOCK_GUARD() {
3133             migration_bitmap_sync_precopy(rs);
3134         }
3135         qemu_mutex_unlock_iothread();
3136         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3137     }
3138
3139     if (migrate_postcopy_ram()) {
3140         /* We can do postcopy, and all the data is postcopiable */
3141         *res_compatible += remaining_size;
3142     } else {
3143         *res_precopy_only += remaining_size;
3144     }
3145 }
3146
3147 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3148 {
3149     unsigned int xh_len;
3150     int xh_flags;
3151     uint8_t *loaded_data;
3152
3153     /* extract RLE header */
3154     xh_flags = qemu_get_byte(f);
3155     xh_len = qemu_get_be16(f);
3156
3157     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3158         error_report("Failed to load XBZRLE page - wrong compression!");
3159         return -1;
3160     }
3161
3162     if (xh_len > TARGET_PAGE_SIZE) {
3163         error_report("Failed to load XBZRLE page - len overflow!");
3164         return -1;
3165     }
3166     loaded_data = XBZRLE.decoded_buf;
3167     /* load data and decode */
3168     /* it can change loaded_data to point to an internal buffer */
3169     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3170
3171     /* decode RLE */
3172     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3173                              TARGET_PAGE_SIZE) == -1) {
3174         error_report("Failed to load XBZRLE page - decode error!");
3175         return -1;
3176     }
3177
3178     return 0;
3179 }
3180
3181 /**
3182  * ram_block_from_stream: read a RAMBlock id from the migration stream
3183  *
3184  * Must be called from within a rcu critical section.
3185  *
3186  * Returns a pointer from within the RCU-protected ram_list.
3187  *
3188  * @f: QEMUFile where to read the data from
3189  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3190  */
3191 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3192 {
3193     static RAMBlock *block;
3194     char id[256];
3195     uint8_t len;
3196
3197     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3198         if (!block) {
3199             error_report("Ack, bad migration stream!");
3200             return NULL;
3201         }
3202         return block;
3203     }
3204
3205     len = qemu_get_byte(f);
3206     qemu_get_buffer(f, (uint8_t *)id, len);
3207     id[len] = 0;
3208
3209     block = qemu_ram_block_by_name(id);
3210     if (!block) {
3211         error_report("Can't find block %s", id);
3212         return NULL;
3213     }
3214
3215     if (ramblock_is_ignored(block)) {
3216         error_report("block %s should not be migrated !", id);
3217         return NULL;
3218     }
3219
3220     return block;
3221 }
3222
3223 static inline void *host_from_ram_block_offset(RAMBlock *block,
3224                                                ram_addr_t offset)
3225 {
3226     if (!offset_in_ramblock(block, offset)) {
3227         return NULL;
3228     }
3229
3230     return block->host + offset;
3231 }
3232
3233 static void *host_page_from_ram_block_offset(RAMBlock *block,
3234                                              ram_addr_t offset)
3235 {
3236     /* Note: Explicitly no check against offset_in_ramblock(). */
3237     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3238                                    block->page_size);
3239 }
3240
3241 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3242                                                          ram_addr_t offset)
3243 {
3244     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3245 }
3246
3247 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3248                              ram_addr_t offset, bool record_bitmap)
3249 {
3250     if (!offset_in_ramblock(block, offset)) {
3251         return NULL;
3252     }
3253     if (!block->colo_cache) {
3254         error_report("%s: colo_cache is NULL in block :%s",
3255                      __func__, block->idstr);
3256         return NULL;
3257     }
3258
3259     /*
3260     * During colo checkpoint, we need bitmap of these migrated pages.
3261     * It help us to decide which pages in ram cache should be flushed
3262     * into VM's RAM later.
3263     */
3264     if (record_bitmap &&
3265         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3266         ram_state->migration_dirty_pages++;
3267     }
3268     return block->colo_cache + offset;
3269 }
3270
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been determined to be zero, then
 * zap it.  Memory that already reads as zero is not written when @ch is
 * zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && buffer_is_zero(host, size)) {
        /* Already all zero: nothing to write. */
        return;
    }

    memset(host, ch, size);
}
3287
3288 /* return the size after decompression, or negative value on error */
3289 static int
3290 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3291                      const uint8_t *source, size_t source_len)
3292 {
3293     int err;
3294
3295     err = inflateReset(stream);
3296     if (err != Z_OK) {
3297         return -1;
3298     }
3299
3300     stream->avail_in = source_len;
3301     stream->next_in = (uint8_t *)source;
3302     stream->avail_out = dest_len;
3303     stream->next_out = dest;
3304
3305     err = inflate(stream, Z_NO_FLUSH);
3306     if (err != Z_STREAM_END) {
3307         return -1;
3308     }
3309
3310     return stream->total_out;
3311 }
3312
3313 static void *do_data_decompress(void *opaque)
3314 {
3315     DecompressParam *param = opaque;
3316     unsigned long pagesize;
3317     uint8_t *des;
3318     int len, ret;
3319
3320     qemu_mutex_lock(&param->mutex);
3321     while (!param->quit) {
3322         if (param->des) {
3323             des = param->des;
3324             len = param->len;
3325             param->des = 0;
3326             qemu_mutex_unlock(&param->mutex);
3327
3328             pagesize = TARGET_PAGE_SIZE;
3329
3330             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3331                                        param->compbuf, len);
3332             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3333                 error_report("decompress data failed");
3334                 qemu_file_set_error(decomp_file, ret);
3335             }
3336
3337             qemu_mutex_lock(&decomp_done_lock);
3338             param->done = true;
3339             qemu_cond_signal(&decomp_done_cond);
3340             qemu_mutex_unlock(&decomp_done_lock);
3341
3342             qemu_mutex_lock(&param->mutex);
3343         } else {
3344             qemu_cond_wait(&param->cond, &param->mutex);
3345         }
3346     }
3347     qemu_mutex_unlock(&param->mutex);
3348
3349     return NULL;
3350 }
3351
3352 static int wait_for_decompress_done(void)
3353 {
3354     int idx, thread_count;
3355
3356     if (!migrate_use_compression()) {
3357         return 0;
3358     }
3359
3360     thread_count = migrate_decompress_threads();
3361     qemu_mutex_lock(&decomp_done_lock);
3362     for (idx = 0; idx < thread_count; idx++) {
3363         while (!decomp_param[idx].done) {
3364             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3365         }
3366     }
3367     qemu_mutex_unlock(&decomp_done_lock);
3368     return qemu_file_get_error(decomp_file);
3369 }
3370
3371 static void compress_threads_load_cleanup(void)
3372 {
3373     int i, thread_count;
3374
3375     if (!migrate_use_compression()) {
3376         return;
3377     }
3378     thread_count = migrate_decompress_threads();
3379     for (i = 0; i < thread_count; i++) {
3380         /*
3381          * we use it as a indicator which shows if the thread is
3382          * properly init'd or not
3383          */
3384         if (!decomp_param[i].compbuf) {
3385             break;
3386         }
3387
3388         qemu_mutex_lock(&decomp_param[i].mutex);
3389         decomp_param[i].quit = true;
3390         qemu_cond_signal(&decomp_param[i].cond);
3391         qemu_mutex_unlock(&decomp_param[i].mutex);
3392     }
3393     for (i = 0; i < thread_count; i++) {
3394         if (!decomp_param[i].compbuf) {
3395             break;
3396         }
3397
3398         qemu_thread_join(decompress_threads + i);
3399         qemu_mutex_destroy(&decomp_param[i].mutex);
3400         qemu_cond_destroy(&decomp_param[i].cond);
3401         inflateEnd(&decomp_param[i].stream);
3402         g_free(decomp_param[i].compbuf);
3403         decomp_param[i].compbuf = NULL;
3404     }
3405     g_free(decompress_threads);
3406     g_free(decomp_param);
3407     decompress_threads = NULL;
3408     decomp_param = NULL;
3409     decomp_file = NULL;
3410 }
3411
3412 static int compress_threads_load_setup(QEMUFile *f)
3413 {
3414     int i, thread_count;
3415
3416     if (!migrate_use_compression()) {
3417         return 0;
3418     }
3419
3420     thread_count = migrate_decompress_threads();
3421     decompress_threads = g_new0(QemuThread, thread_count);
3422     decomp_param = g_new0(DecompressParam, thread_count);
3423     qemu_mutex_init(&decomp_done_lock);
3424     qemu_cond_init(&decomp_done_cond);
3425     decomp_file = f;
3426     for (i = 0; i < thread_count; i++) {
3427         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3428             goto exit;
3429         }
3430
3431         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3432         qemu_mutex_init(&decomp_param[i].mutex);
3433         qemu_cond_init(&decomp_param[i].cond);
3434         decomp_param[i].done = true;
3435         decomp_param[i].quit = false;
3436         qemu_thread_create(decompress_threads + i, "decompress",
3437                            do_data_decompress, decomp_param + i,
3438                            QEMU_THREAD_JOINABLE);
3439     }
3440     return 0;
3441 exit:
3442     compress_threads_load_cleanup();
3443     return -1;
3444 }
3445
3446 static void decompress_data_with_multi_threads(QEMUFile *f,
3447                                                void *host, int len)
3448 {
3449     int idx, thread_count;
3450
3451     thread_count = migrate_decompress_threads();
3452     QEMU_LOCK_GUARD(&decomp_done_lock);
3453     while (true) {
3454         for (idx = 0; idx < thread_count; idx++) {
3455             if (decomp_param[idx].done) {
3456                 decomp_param[idx].done = false;
3457                 qemu_mutex_lock(&decomp_param[idx].mutex);
3458                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3459                 decomp_param[idx].des = host;
3460                 decomp_param[idx].len = len;
3461                 qemu_cond_signal(&decomp_param[idx].cond);
3462                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3463                 break;
3464             }
3465         }
3466         if (idx < thread_count) {
3467             break;
3468         } else {
3469             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3470         }
3471     }
3472 }
3473
3474 static void colo_init_ram_state(void)
3475 {
3476     ram_state_init(&ram_state);
3477 }
3478
3479 /*
3480  * colo cache: this is for secondary VM, we cache the whole
3481  * memory of the secondary VM, it is need to hold the global lock
3482  * to call this helper.
3483  */
3484 int colo_init_ram_cache(void)
3485 {
3486     RAMBlock *block;
3487
3488     WITH_RCU_READ_LOCK_GUARD() {
3489         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3490             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3491                                                     NULL, false, false);
3492             if (!block->colo_cache) {
3493                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3494                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3495                              block->used_length);
3496                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3497                     if (block->colo_cache) {
3498                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3499                         block->colo_cache = NULL;
3500                     }
3501                 }
3502                 return -errno;
3503             }
3504             if (!machine_dump_guest_core(current_machine)) {
3505                 qemu_madvise(block->colo_cache, block->used_length,
3506                              QEMU_MADV_DONTDUMP);
3507             }
3508         }
3509     }
3510
3511     /*
3512     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3513     * with to decide which page in cache should be flushed into SVM's RAM. Here
3514     * we use the same name 'ram_bitmap' as for migration.
3515     */
3516     if (ram_bytes_total()) {
3517         RAMBlock *block;
3518
3519         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3520             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3521             block->bmap = bitmap_new(pages);
3522         }
3523     }
3524
3525     colo_init_ram_state();
3526     return 0;
3527 }
3528
3529 /* TODO: duplicated with ram_init_bitmaps */
3530 void colo_incoming_start_dirty_log(void)
3531 {
3532     RAMBlock *block = NULL;
3533     /* For memory_global_dirty_log_start below. */
3534     qemu_mutex_lock_iothread();
3535     qemu_mutex_lock_ramlist();
3536
3537     memory_global_dirty_log_sync();
3538     WITH_RCU_READ_LOCK_GUARD() {
3539         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3540             ramblock_sync_dirty_bitmap(ram_state, block);
3541             /* Discard this dirty bitmap record */
3542             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3543         }
3544         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3545     }
3546     ram_state->migration_dirty_pages = 0;
3547     qemu_mutex_unlock_ramlist();
3548     qemu_mutex_unlock_iothread();
3549 }
3550
3551 /* It is need to hold the global lock to call this helper */
3552 void colo_release_ram_cache(void)
3553 {
3554     RAMBlock *block;
3555
3556     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3557     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3558         g_free(block->bmap);
3559         block->bmap = NULL;
3560     }
3561
3562     WITH_RCU_READ_LOCK_GUARD() {
3563         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3564             if (block->colo_cache) {
3565                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3566                 block->colo_cache = NULL;
3567             }
3568         }
3569     }
3570     ram_state_cleanup(&ram_state);
3571 }
3572
3573 /**
3574  * ram_load_setup: Setup RAM for migration incoming side
3575  *
3576  * Returns zero to indicate success and negative for error
3577  *
3578  * @f: QEMUFile where to receive the data
3579  * @opaque: RAMState pointer
3580  */
3581 static int ram_load_setup(QEMUFile *f, void *opaque)
3582 {
3583     if (compress_threads_load_setup(f)) {
3584         return -1;
3585     }
3586
3587     xbzrle_load_setup();
3588     ramblock_recv_map_init();
3589
3590     return 0;
3591 }
3592
3593 static int ram_load_cleanup(void *opaque)
3594 {
3595     RAMBlock *rb;
3596
3597     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3598         qemu_ram_block_writeback(rb);
3599     }
3600
3601     xbzrle_load_cleanup();
3602     compress_threads_load_cleanup();
3603
3604     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3605         g_free(rb->receivedmap);
3606         rb->receivedmap = NULL;
3607     }
3608
3609     return 0;
3610 }
3611
3612 /**
3613  * ram_postcopy_incoming_init: allocate postcopy data structures
3614  *
3615  * Returns 0 for success and negative if there was one error
3616  *
3617  * @mis: current migration incoming state
3618  *
3619  * Allocate data structures etc needed by incoming migration with
3620  * postcopy-ram. postcopy-ram's similarly names
3621  * postcopy_ram_incoming_init does the work.
3622  */
3623 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3624 {
3625     return postcopy_ram_incoming_init(mis);
3626 }
3627
3628 /**
3629  * ram_load_postcopy: load a page in postcopy case
3630  *
3631  * Returns 0 for success or -errno in case of error
3632  *
3633  * Called in postcopy mode by ram_load().
3634  * rcu_read_lock is taken prior to this being called.
3635  *
3636  * @f: QEMUFile where to send the data
3637  */
3638 static int ram_load_postcopy(QEMUFile *f)
3639 {
3640     int flags = 0, ret = 0;
3641     bool place_needed = false;
3642     bool matches_target_page_size = false;
3643     MigrationIncomingState *mis = migration_incoming_get_current();
3644     /* Temporary page that is later 'placed' */
3645     void *postcopy_host_page = mis->postcopy_tmp_page;
3646     void *host_page = NULL;
3647     bool all_zero = true;
3648     int target_pages = 0;
3649
3650     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3651         ram_addr_t addr;
3652         void *page_buffer = NULL;
3653         void *place_source = NULL;
3654         RAMBlock *block = NULL;
3655         uint8_t ch;
3656         int len;
3657
3658         addr = qemu_get_be64(f);
3659
3660         /*
3661          * If qemu file error, we should stop here, and then "addr"
3662          * may be invalid
3663          */
3664         ret = qemu_file_get_error(f);
3665         if (ret) {
3666             break;
3667         }
3668
3669         flags = addr & ~TARGET_PAGE_MASK;
3670         addr &= TARGET_PAGE_MASK;
3671
3672         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3673         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3674                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3675             block = ram_block_from_stream(f, flags);
3676             if (!block) {
3677                 ret = -EINVAL;
3678                 break;
3679             }
3680
3681             /*
3682              * Relying on used_length is racy and can result in false positives.
3683              * We might place pages beyond used_length in case RAM was shrunk
3684              * while in postcopy, which is fine - trying to place via
3685              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3686              */
3687             if (!block->host || addr >= block->postcopy_length) {
3688                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3689                 ret = -EINVAL;
3690                 break;
3691             }
3692             target_pages++;
3693             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3694             /*
3695              * Postcopy requires that we place whole host pages atomically;
3696              * these may be huge pages for RAMBlocks that are backed by
3697              * hugetlbfs.
3698              * To make it atomic, the data is read into a temporary page
3699              * that's moved into place later.
3700              * The migration protocol uses,  possibly smaller, target-pages
3701              * however the source ensures it always sends all the components
3702              * of a host page in one chunk.
3703              */
3704             page_buffer = postcopy_host_page +
3705                           host_page_offset_from_ram_block_offset(block, addr);
3706             /* If all TP are zero then we can optimise the place */
3707             if (target_pages == 1) {
3708                 host_page = host_page_from_ram_block_offset(block, addr);
3709             } else if (host_page != host_page_from_ram_block_offset(block,
3710                                                                     addr)) {
3711                 /* not the 1st TP within the HP */
3712                 error_report("Non-same host page %p/%p", host_page,
3713                              host_page_from_ram_block_offset(block, addr));
3714                 ret = -EINVAL;
3715                 break;
3716             }
3717
3718             /*
3719              * If it's the last part of a host page then we place the host
3720              * page
3721              */
3722             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3723                 place_needed = true;
3724             }
3725             place_source = postcopy_host_page;
3726         }
3727
3728         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3729         case RAM_SAVE_FLAG_ZERO:
3730             ch = qemu_get_byte(f);
3731             /*
3732              * Can skip to set page_buffer when
3733              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3734              */
3735             if (ch || !matches_target_page_size) {
3736                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3737             }
3738             if (ch) {
3739                 all_zero = false;
3740             }
3741             break;
3742
3743         case RAM_SAVE_FLAG_PAGE:
3744             all_zero = false;
3745             if (!matches_target_page_size) {
3746                 /* For huge pages, we always use temporary buffer */
3747                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3748             } else {
3749                 /*
3750                  * For small pages that matches target page size, we
3751                  * avoid the qemu_file copy.  Instead we directly use
3752                  * the buffer of QEMUFile to place the page.  Note: we
3753                  * cannot do any QEMUFile operation before using that
3754                  * buffer to make sure the buffer is valid when
3755                  * placing the page.
3756                  */
3757                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3758                                          TARGET_PAGE_SIZE);
3759             }
3760             break;
3761         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3762             all_zero = false;
3763             len = qemu_get_be32(f);
3764             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3765                 error_report("Invalid compressed data length: %d", len);
3766                 ret = -EINVAL;
3767                 break;
3768             }
3769             decompress_data_with_multi_threads(f, page_buffer, len);
3770             break;
3771
3772         case RAM_SAVE_FLAG_EOS:
3773             /* normal exit */
3774             multifd_recv_sync_main();
3775             break;
3776         default:
3777             error_report("Unknown combination of migration flags: 0x%x"
3778                          " (postcopy mode)", flags);
3779             ret = -EINVAL;
3780             break;
3781         }
3782
3783         /* Got the whole host page, wait for decompress before placing. */
3784         if (place_needed) {
3785             ret |= wait_for_decompress_done();
3786         }
3787
3788         /* Detect for any possible file errors */
3789         if (!ret && qemu_file_get_error(f)) {
3790             ret = qemu_file_get_error(f);
3791         }
3792
3793         if (!ret && place_needed) {
3794             if (all_zero) {
3795                 ret = postcopy_place_page_zero(mis, host_page, block);
3796             } else {
3797                 ret = postcopy_place_page(mis, host_page, place_source,
3798                                           block);
3799             }
3800             place_needed = false;
3801             target_pages = 0;
3802             /* Assume we have a zero page until we detect something different */
3803             all_zero = true;
3804         }
3805     }
3806
3807     return ret;
3808 }
3809
3810 static bool postcopy_is_advised(void)
3811 {
3812     PostcopyState ps = postcopy_state_get();
3813     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3814 }
3815
3816 static bool postcopy_is_running(void)
3817 {
3818     PostcopyState ps = postcopy_state_get();
3819     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3820 }
3821
3822 /*
3823  * Flush content of RAM cache into SVM's memory.
3824  * Only flush the pages that be dirtied by PVM or SVM or both.
3825  */
3826 void colo_flush_ram_cache(void)
3827 {
3828     RAMBlock *block = NULL;
3829     void *dst_host;
3830     void *src_host;
3831     unsigned long offset = 0;
3832
3833     memory_global_dirty_log_sync();
3834     WITH_RCU_READ_LOCK_GUARD() {
3835         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3836             ramblock_sync_dirty_bitmap(ram_state, block);
3837         }
3838     }
3839
3840     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3841     WITH_RCU_READ_LOCK_GUARD() {
3842         block = QLIST_FIRST_RCU(&ram_list.blocks);
3843
3844         while (block) {
3845             unsigned long num = 0;
3846
3847             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3848             if (!offset_in_ramblock(block,
3849                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3850                 offset = 0;
3851                 num = 0;
3852                 block = QLIST_NEXT_RCU(block, next);
3853             } else {
3854                 unsigned long i = 0;
3855
3856                 for (i = 0; i < num; i++) {
3857                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3858                 }
3859                 dst_host = block->host
3860                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3861                 src_host = block->colo_cache
3862                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3863                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3864                 offset += num;
3865             }
3866         }
3867     }
3868     trace_colo_flush_ram_cache_end();
3869 }
3870
3871 /**
3872  * ram_load_precopy: load pages in precopy case
3873  *
3874  * Returns 0 for success or -errno in case of error
3875  *
3876  * Called in precopy mode by ram_load().
3877  * rcu_read_lock is taken prior to this being called.
3878  *
3879  * @f: QEMUFile where to send the data
3880  */
3881 static int ram_load_precopy(QEMUFile *f)
3882 {
3883     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3884     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3885     bool postcopy_advised = postcopy_is_advised();
3886     if (!migrate_use_compression()) {
3887         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3888     }
3889
3890     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3891         ram_addr_t addr, total_ram_bytes;
3892         void *host = NULL, *host_bak = NULL;
3893         uint8_t ch;
3894
3895         /*
3896          * Yield periodically to let main loop run, but an iteration of
3897          * the main loop is expensive, so do it each some iterations
3898          */
3899         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3900             aio_co_schedule(qemu_get_current_aio_context(),
3901                             qemu_coroutine_self());
3902             qemu_coroutine_yield();
3903         }
3904         i++;
3905
3906         addr = qemu_get_be64(f);
3907         flags = addr & ~TARGET_PAGE_MASK;
3908         addr &= TARGET_PAGE_MASK;
3909
3910         if (flags & invalid_flags) {
3911             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3912                 error_report("Received an unexpected compressed page");
3913             }
3914
3915             ret = -EINVAL;
3916             break;
3917         }
3918
3919         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3920                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3921             RAMBlock *block = ram_block_from_stream(f, flags);
3922
3923             host = host_from_ram_block_offset(block, addr);
3924             /*
3925              * After going into COLO stage, we should not load the page
3926              * into SVM's memory directly, we put them into colo_cache firstly.
3927              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3928              * Previously, we copied all these memory in preparing stage of COLO
3929              * while we need to stop VM, which is a time-consuming process.
3930              * Here we optimize it by a trick, back-up every page while in
3931              * migration process while COLO is enabled, though it affects the
3932              * speed of the migration, but it obviously reduce the downtime of
3933              * back-up all SVM'S memory in COLO preparing stage.
3934              */
3935             if (migration_incoming_colo_enabled()) {
3936                 if (migration_incoming_in_colo_state()) {
3937                     /* In COLO stage, put all pages into cache temporarily */
3938                     host = colo_cache_from_block_offset(block, addr, true);
3939                 } else {
3940                    /*
3941                     * In migration stage but before COLO stage,
3942                     * Put all pages into both cache and SVM's memory.
3943                     */
3944                     host_bak = colo_cache_from_block_offset(block, addr, false);
3945                 }
3946             }
3947             if (!host) {
3948                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3949                 ret = -EINVAL;
3950                 break;
3951             }
3952             if (!migration_incoming_in_colo_state()) {
3953                 ramblock_recv_bitmap_set(block, host);
3954             }
3955
3956             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3957         }
3958
3959         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3960         case RAM_SAVE_FLAG_MEM_SIZE:
3961             /* Synchronize RAM block list */
3962             total_ram_bytes = addr;
3963             while (!ret && total_ram_bytes) {
3964                 RAMBlock *block;
3965                 char id[256];
3966                 ram_addr_t length;
3967
3968                 len = qemu_get_byte(f);
3969                 qemu_get_buffer(f, (uint8_t *)id, len);
3970                 id[len] = 0;
3971                 length = qemu_get_be64(f);
3972
3973                 block = qemu_ram_block_by_name(id);
3974                 if (block && !qemu_ram_is_migratable(block)) {
3975                     error_report("block %s should not be migrated !", id);
3976                     ret = -EINVAL;
3977                 } else if (block) {
3978                     if (length != block->used_length) {
3979                         Error *local_err = NULL;
3980
3981                         ret = qemu_ram_resize(block, length,
3982                                               &local_err);
3983                         if (local_err) {
3984                             error_report_err(local_err);
3985                         }
3986                     }
3987                     /* For postcopy we need to check hugepage sizes match */
3988                     if (postcopy_advised && migrate_postcopy_ram() &&
3989                         block->page_size != qemu_host_page_size) {
3990                         uint64_t remote_page_size = qemu_get_be64(f);
3991                         if (remote_page_size != block->page_size) {
3992                             error_report("Mismatched RAM page size %s "
3993                                          "(local) %zd != %" PRId64,
3994                                          id, block->page_size,
3995                                          remote_page_size);
3996                             ret = -EINVAL;
3997                         }
3998                     }
3999                     if (migrate_ignore_shared()) {
4000                         hwaddr addr = qemu_get_be64(f);
4001                         if (ramblock_is_ignored(block) &&
4002                             block->mr->addr != addr) {
4003                             error_report("Mismatched GPAs for block %s "
4004                                          "%" PRId64 "!= %" PRId64,
4005                                          id, (uint64_t)addr,
4006                                          (uint64_t)block->mr->addr);
4007                             ret = -EINVAL;
4008                         }
4009                     }
4010                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4011                                           block->idstr);
4012                 } else {
4013                     error_report("Unknown ramblock \"%s\", cannot "
4014                                  "accept migration", id);
4015                     ret = -EINVAL;
4016                 }
4017
4018                 total_ram_bytes -= length;
4019             }
4020             break;
4021
4022         case RAM_SAVE_FLAG_ZERO:
4023             ch = qemu_get_byte(f);
4024             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4025             break;
4026
4027         case RAM_SAVE_FLAG_PAGE:
4028             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4029             break;
4030
4031         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4032             len = qemu_get_be32(f);
4033             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4034                 error_report("Invalid compressed data length: %d", len);
4035                 ret = -EINVAL;
4036                 break;
4037             }
4038             decompress_data_with_multi_threads(f, host, len);
4039             break;
4040
4041         case RAM_SAVE_FLAG_XBZRLE:
4042             if (load_xbzrle(f, addr, host) < 0) {
4043                 error_report("Failed to decompress XBZRLE page at "
4044                              RAM_ADDR_FMT, addr);
4045                 ret = -EINVAL;
4046                 break;
4047             }
4048             break;
4049         case RAM_SAVE_FLAG_EOS:
4050             /* normal exit */
4051             multifd_recv_sync_main();
4052             break;
4053         default:
4054             if (flags & RAM_SAVE_FLAG_HOOK) {
4055                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4056             } else {
4057                 error_report("Unknown combination of migration flags: 0x%x",
4058                              flags);
4059                 ret = -EINVAL;
4060             }
4061         }
4062         if (!ret) {
4063             ret = qemu_file_get_error(f);
4064         }
4065         if (!ret && host_bak) {
4066             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4067         }
4068     }
4069
4070     ret |= wait_for_decompress_done();
4071     return ret;
4072 }
4073
4074 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4075 {
4076     int ret = 0;
4077     static uint64_t seq_iter;
4078     /*
4079      * If system is running in postcopy mode, page inserts to host memory must
4080      * be atomic
4081      */
4082     bool postcopy_running = postcopy_is_running();
4083
4084     seq_iter++;
4085
4086     if (version_id != 4) {
4087         return -EINVAL;
4088     }
4089
4090     /*
4091      * This RCU critical section can be very long running.
4092      * When RCU reclaims in the code start to become numerous,
4093      * it will be necessary to reduce the granularity of this
4094      * critical section.
4095      */
4096     WITH_RCU_READ_LOCK_GUARD() {
4097         if (postcopy_running) {
4098             ret = ram_load_postcopy(f);
4099         } else {
4100             ret = ram_load_precopy(f);
4101         }
4102     }
4103     trace_ram_load_complete(ret, seq_iter);
4104
4105     return ret;
4106 }
4107
4108 static bool ram_has_postcopy(void *opaque)
4109 {
4110     RAMBlock *rb;
4111     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4112         if (ramblock_is_pmem(rb)) {
4113             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4114                          "is not supported now!", rb->idstr, rb->host);
4115             return false;
4116         }
4117     }
4118
4119     return migrate_postcopy_ram();
4120 }
4121
4122 /* Sync all the dirty bitmap with destination VM.  */
4123 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4124 {
4125     RAMBlock *block;
4126     QEMUFile *file = s->to_dst_file;
4127     int ramblock_count = 0;
4128
4129     trace_ram_dirty_bitmap_sync_start();
4130
4131     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4132         qemu_savevm_send_recv_bitmap(file, block->idstr);
4133         trace_ram_dirty_bitmap_request(block->idstr);
4134         ramblock_count++;
4135     }
4136
4137     trace_ram_dirty_bitmap_sync_wait();
4138
4139     /* Wait until all the ramblocks' dirty bitmap synced */
4140     while (ramblock_count--) {
4141         qemu_sem_wait(&s->rp_state.rp_sem);
4142     }
4143
4144     trace_ram_dirty_bitmap_sync_complete();
4145
4146     return 0;
4147 }
4148
4149 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4150 {
4151     qemu_sem_post(&s->rp_state.rp_sem);
4152 }
4153
4154 /*
4155  * Read the received bitmap, revert it as the initial dirty bitmap.
4156  * This is only used when the postcopy migration is paused but wants
4157  * to resume from a middle point.
4158  */
4159 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4160 {
4161     int ret = -EINVAL;
4162     /* from_dst_file is always valid because we're within rp_thread */
4163     QEMUFile *file = s->rp_state.from_dst_file;
4164     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4165     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4166     uint64_t size, end_mark;
4167
4168     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4169
4170     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4171         error_report("%s: incorrect state %s", __func__,
4172                      MigrationStatus_str(s->state));
4173         return -EINVAL;
4174     }
4175
4176     /*
4177      * Note: see comments in ramblock_recv_bitmap_send() on why we
4178      * need the endianness conversion, and the paddings.
4179      */
4180     local_size = ROUND_UP(local_size, 8);
4181
4182     /* Add paddings */
4183     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4184
4185     size = qemu_get_be64(file);
4186
4187     /* The size of the bitmap should match with our ramblock */
4188     if (size != local_size) {
4189         error_report("%s: ramblock '%s' bitmap size mismatch "
4190                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4191                      block->idstr, size, local_size);
4192         ret = -EINVAL;
4193         goto out;
4194     }
4195
4196     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4197     end_mark = qemu_get_be64(file);
4198
4199     ret = qemu_file_get_error(file);
4200     if (ret || size != local_size) {
4201         error_report("%s: read bitmap failed for ramblock '%s': %d"
4202                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4203                      __func__, block->idstr, ret, local_size, size);
4204         ret = -EIO;
4205         goto out;
4206     }
4207
4208     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4209         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4210                      __func__, block->idstr, end_mark);
4211         ret = -EINVAL;
4212         goto out;
4213     }
4214
4215     /*
4216      * Endianness conversion. We are during postcopy (though paused).
4217      * The dirty bitmap won't change. We can directly modify it.
4218      */
4219     bitmap_from_le(block->bmap, le_bitmap, nbits);
4220
4221     /*
4222      * What we received is "received bitmap". Revert it as the initial
4223      * dirty bitmap for this ramblock.
4224      */
4225     bitmap_complement(block->bmap, block->bmap, nbits);
4226
4227     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4228     ramblock_dirty_bitmap_clear_discarded_pages(block);
4229
4230     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4231     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4232
4233     /*
4234      * We succeeded to sync bitmap for current ramblock. If this is
4235      * the last one to sync, we need to notify the main send thread.
4236      */
4237     ram_dirty_bitmap_reload_notify(s);
4238
4239     ret = 0;
4240 out:
4241     g_free(le_bitmap);
4242     return ret;
4243 }
4244
4245 static int ram_resume_prepare(MigrationState *s, void *opaque)
4246 {
4247     RAMState *rs = *(RAMState **)opaque;
4248     int ret;
4249
4250     ret = ram_dirty_bitmap_sync_all(s, rs);
4251     if (ret) {
4252         return ret;
4253     }
4254
4255     ram_state_resume_prepare(rs, s->to_dst_file);
4256
4257     return 0;
4258 }
4259
4260 static SaveVMHandlers savevm_ram_handlers = {
4261     .save_setup = ram_save_setup,
4262     .save_live_iterate = ram_save_iterate,
4263     .save_live_complete_postcopy = ram_save_complete,
4264     .save_live_complete_precopy = ram_save_complete,
4265     .has_postcopy = ram_has_postcopy,
4266     .save_live_pending = ram_save_pending,
4267     .load_state = ram_load,
4268     .save_cleanup = ram_save_cleanup,
4269     .load_setup = ram_load_setup,
4270     .load_cleanup = ram_load_cleanup,
4271     .resume_prepare = ram_resume_prepare,
4272 };
4273
4274 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4275                                       size_t old_size, size_t new_size)
4276 {
4277     PostcopyState ps = postcopy_state_get();
4278     ram_addr_t offset;
4279     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4280     Error *err = NULL;
4281
4282     if (ramblock_is_ignored(rb)) {
4283         return;
4284     }
4285
4286     if (!migration_is_idle()) {
4287         /*
4288          * Precopy code on the source cannot deal with the size of RAM blocks
4289          * changing at random points in time - especially after sending the
4290          * RAM block sizes in the migration stream, they must no longer change.
4291          * Abort and indicate a proper reason.
4292          */
4293         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4294         migration_cancel(err);
4295         error_free(err);
4296     }
4297
4298     switch (ps) {
4299     case POSTCOPY_INCOMING_ADVISE:
4300         /*
4301          * Update what ram_postcopy_incoming_init()->init_range() does at the
4302          * time postcopy was advised. Syncing RAM blocks with the source will
4303          * result in RAM resizes.
4304          */
4305         if (old_size < new_size) {
4306             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4307                 error_report("RAM block '%s' discard of resized RAM failed",
4308                              rb->idstr);
4309             }
4310         }
4311         rb->postcopy_length = new_size;
4312         break;
4313     case POSTCOPY_INCOMING_NONE:
4314     case POSTCOPY_INCOMING_RUNNING:
4315     case POSTCOPY_INCOMING_END:
4316         /*
4317          * Once our guest is running, postcopy does no longer care about
4318          * resizes. When growing, the new memory was not available on the
4319          * source, no handler needed.
4320          */
4321         break;
4322     default:
4323         error_report("RAM block '%s' resized during postcopy state: %d",
4324                      rb->idstr, ps);
4325         exit(-1);
4326     }
4327 }
4328
4329 static RAMBlockNotifier ram_mig_ram_notifier = {
4330     .ram_block_resized = ram_mig_ram_block_resized,
4331 };
4332
4333 void ram_mig_init(void)
4334 {
4335     qemu_mutex_init(&XBZRLE.lock);
4336     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4337     ram_block_notifier_add(&ram_mig_ram_notifier);
4338 }