migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/cpu-throttle.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58 #include "sysemu/runstate.h"
  59
  60 #include "hw/boards.h" /* for machine_dump_guest_core() */
  61
  62 #if defined(__linux__)
  63 #include "qemu/userfaultfd.h"
  64 #endif /* defined(__linux__) */
  65
  66 /***********************************************************/
  67 /* ram save/restore */
  68
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was simply renamed.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  84
/* XBZRLE statistics (cache misses, encoded pages/bytes, overflows) */
XBZRLECacheStats xbzrle_counters;

/*
 * State shared by the XBZRLE code in this file: the page cache, the
 * scratch buffers used while encoding/decoding, and a page of zeros.
 */
static struct {
    /* buffer used for XBZRLE encoding (receives the encoded output) */
    uint8_t *encoded_buf;
    /* buffer for storing a private copy of the current page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle()) {
 106         qemu_mutex_lock(&XBZRLE.lock);
 107     }
 108 }
 109
 110 static void XBZRLE_cache_unlock(void)
 111 {
 112     if (migrate_use_xbzrle()) {
 113         qemu_mutex_unlock(&XBZRLE.lock);
 114     }
 115 }
 116
 117 /**
 118  * xbzrle_cache_resize: resize the xbzrle cache
 119  *
 120  * This function is called from migrate_params_apply in main
 121  * thread, possibly while a migration is in progress.  A running
 122  * migration may be using the cache and might finish during this call,
 123  * hence changes to the cache are protected by XBZRLE.lock().
 124  *
 125  * Returns 0 for success or -1 for error
 126  *
 127  * @new_size: new cache size
 128  * @errp: set *errp if the check failed, with reason
 129  */
 130 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 131 {
 132     PageCache *new_cache;
 133     int64_t ret = 0;
 134
 135     /* Check for truncation */
 136     if (new_size != (size_t)new_size) {
 137         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 138                    "exceeding address space");
 139         return -1;
 140     }
 141
 142     if (new_size == migrate_xbzrle_cache_size()) {
 143         /* nothing to do */
 144         return 0;
 145     }
 146
 147     XBZRLE_cache_lock();
 148
 149     if (XBZRLE.cache != NULL) {
 150         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 151         if (!new_cache) {
 152             ret = -1;
 153             goto out;
 154         }
 155
 156         cache_fini(XBZRLE.cache);
 157         XBZRLE.cache = new_cache;
 158     }
 159 out:
 160     XBZRLE_cache_unlock();
 161     return ret;
 162 }
 163
 164 bool ramblock_is_ignored(RAMBlock *block)
 165 {
 166     return !qemu_ram_is_migratable(block) ||
 167            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 168 }
 169
/*
 * NOTE(review): RAMBLOCK_FOREACH is undefined from here on; the code
 * below iterates with RAMBLOCK_FOREACH_NOT_IGNORED instead.  Presumably
 * this prevents accidentally walking ignored blocks -- confirm.
 */
#undef RAMBLOCK_FOREACH
 171
 172 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 173 {
 174     RAMBlock *block;
 175     int ret = 0;
 176
 177     RCU_READ_LOCK_GUARD();
 178
 179     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 180         ret = func(block, opaque);
 181         if (ret) {
 182             break;
 183         }
 184     }
 185     return ret;
 186 }
 187
 188 static void ramblock_recv_map_init(void)
 189 {
 190     RAMBlock *rb;
 191
 192     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 193         assert(!rb->receivedmap);
 194         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 195     }
 196 }
 197
 198 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 199 {
 200     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 201                     rb->receivedmap);
 202 }
 203
 204 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 205 {
 206     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 207 }
 208
 209 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 210 {
 211     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 212 }
 213
 214 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 215                                     size_t nr)
 216 {
 217     bitmap_set_atomic(rb->receivedmap,
 218                       ramblock_recv_bitmap_offset(host_addr, rb),
 219                       nr);
 220 }
 221
 222 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 223
 224 /*
 225  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 226  *
 227  * Returns >0 if success with sent bytes, or <0 if error.
 228  */
 229 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 230                                   const char *block_name)
 231 {
 232     RAMBlock *block = qemu_ram_block_by_name(block_name);
 233     unsigned long *le_bitmap, nbits;
 234     uint64_t size;
 235
 236     if (!block) {
 237         error_report("%s: invalid block name: %s", __func__, block_name);
 238         return -1;
 239     }
 240
 241     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 242
 243     /*
 244      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 245      * machines we may need 4 more bytes for padding (see below
 246      * comment). So extend it a bit before hand.
 247      */
 248     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 249
 250     /*
 251      * Always use little endian when sending the bitmap. This is
 252      * required that when source and destination VMs are not using the
 253      * same endianness. (Note: big endian won't work.)
 254      */
 255     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 256
 257     /* Size of the bitmap, in bytes */
 258     size = DIV_ROUND_UP(nbits, 8);
 259
 260     /*
 261      * size is always aligned to 8 bytes for 64bit machines, but it
 262      * may not be true for 32bit machines. We need this padding to
 263      * make sure the migration can survive even between 32bit and
 264      * 64bit machines.
 265      */
 266     size = ROUND_UP(size, 8);
 267
 268     qemu_put_be64(file, size);
 269     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 270     /*
 271      * Mark as an end, in case the middle part is screwed up due to
 272      * some "mysterious" reason.
 273      */
 274     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 275     qemu_fflush(file);
 276
 277     g_free(le_bitmap);
 278
 279     if (qemu_file_get_error(file)) {
 280         return qemu_file_get_error(file);
 281     }
 282
 283     return size + sizeof(size);
 284 }
 285
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked into RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block the requested range lives in */
    RAMBlock *rb;
    /* NOTE(review): presumably start of the range within rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably length of the range in bytes -- confirm */
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 297
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* Last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* These variables are used for bitmap sync */
    /* Last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* Bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* Number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* Compression statistics since the beginning of the period */
    /* Number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* Amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* Amount of compressed pages */
    uint64_t compress_pages_prev;

    /* Total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* Total handled target pages since start */
    uint64_t target_page_count;
    /* Number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 354
/* Current RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 358
 359 /* Whether postcopy has queued requests? */
 360 static bool postcopy_has_request(RAMState *rs)
 361 {
 362     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 363 }
 364
 365 void precopy_infrastructure_init(void)
 366 {
 367     notifier_with_return_list_init(&precopy_notifier_list);
 368 }
 369
 370 void precopy_add_notifier(NotifierWithReturn *n)
 371 {
 372     notifier_with_return_list_add(&precopy_notifier_list, n);
 373 }
 374
 375 void precopy_remove_notifier(NotifierWithReturn *n)
 376 {
 377     notifier_with_return_remove(n);
 378 }
 379
 380 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 381 {
 382     PrecopyNotifyData pnd;
 383     pnd.reason = reason;
 384     pnd.errp = errp;
 385
 386     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 387 }
 388
 389 uint64_t ram_bytes_remaining(void)
 390 {
 391     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 392                        0;
 393 }
 394
 395 MigrationStats ram_counters;
 396
 397 static void ram_transferred_add(uint64_t bytes)
 398 {
 399     if (runstate_is_running()) {
 400         ram_counters.precopy_bytes += bytes;
 401     } else if (migration_in_postcopy()) {
 402         ram_counters.postcopy_bytes += bytes;
 403     } else {
 404         ram_counters.downtime_bytes += bytes;
 405     }
 406     ram_counters.transferred += bytes;
 407 }
 408
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 419
CompressionStats compression_counters;

/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* Set to true, under comp_done_lock, when a compression finishes */
    bool done;
    /* Tells the worker loop to exit */
    bool quit;
    /* Result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* Dummy QEMUFile (empty ops) used as a buffer for the output */
    QEMUFile *file;
    /* mutex/cond: guard block/offset/quit and wake the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* Page to compress; the worker resets block to NULL when it takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    /* TARGET_PAGE_SIZE buffer passed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 437
/*
 * Per-thread state of one decompression worker (see decomp_param).
 * NOTE(review): the code using these fields is not visible here; the
 * done/quit/mutex/cond fields presumably mirror CompressParam -- confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 449
/* Arrays with one entry per compression thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in this file; called by do_data_compress() */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 469
 470 static void *do_data_compress(void *opaque)
 471 {
 472     CompressParam *param = opaque;
 473     RAMBlock *block;
 474     ram_addr_t offset;
 475     bool zero_page;
 476
 477     qemu_mutex_lock(&param->mutex);
 478     while (!param->quit) {
 479         if (param->block) {
 480             block = param->block;
 481             offset = param->offset;
 482             param->block = NULL;
 483             qemu_mutex_unlock(&param->mutex);
 484
 485             zero_page = do_compress_ram_page(param->file, &param->stream,
 486                                              block, offset, param->originbuf);
 487
 488             qemu_mutex_lock(&comp_done_lock);
 489             param->done = true;
 490             param->zero_page = zero_page;
 491             qemu_cond_signal(&comp_done_cond);
 492             qemu_mutex_unlock(&comp_done_lock);
 493
 494             qemu_mutex_lock(&param->mutex);
 495         } else {
 496             qemu_cond_wait(&param->cond, &param->mutex);
 497         }
 498     }
 499     qemu_mutex_unlock(&param->mutex);
 500
 501     return NULL;
 502 }
 503
 504 static void compress_threads_save_cleanup(void)
 505 {
 506     int i, thread_count;
 507
 508     if (!migrate_use_compression() || !comp_param) {
 509         return;
 510     }
 511
 512     thread_count = migrate_compress_threads();
 513     for (i = 0; i < thread_count; i++) {
 514         /*
 515          * we use it as a indicator which shows if the thread is
 516          * properly init'd or not
 517          */
 518         if (!comp_param[i].file) {
 519             break;
 520         }
 521
 522         qemu_mutex_lock(&comp_param[i].mutex);
 523         comp_param[i].quit = true;
 524         qemu_cond_signal(&comp_param[i].cond);
 525         qemu_mutex_unlock(&comp_param[i].mutex);
 526
 527         qemu_thread_join(compress_threads + i);
 528         qemu_mutex_destroy(&comp_param[i].mutex);
 529         qemu_cond_destroy(&comp_param[i].cond);
 530         deflateEnd(&comp_param[i].stream);
 531         g_free(comp_param[i].originbuf);
 532         qemu_fclose(comp_param[i].file);
 533         comp_param[i].file = NULL;
 534     }
 535     qemu_mutex_destroy(&comp_done_lock);
 536     qemu_cond_destroy(&comp_done_cond);
 537     g_free(compress_threads);
 538     g_free(comp_param);
 539     compress_threads = NULL;
 540     comp_param = NULL;
 541 }
 542
 543 static int compress_threads_save_setup(void)
 544 {
 545     int i, thread_count;
 546
 547     if (!migrate_use_compression()) {
 548         return 0;
 549     }
 550     thread_count = migrate_compress_threads();
 551     compress_threads = g_new0(QemuThread, thread_count);
 552     comp_param = g_new0(CompressParam, thread_count);
 553     qemu_cond_init(&comp_done_cond);
 554     qemu_mutex_init(&comp_done_lock);
 555     for (i = 0; i < thread_count; i++) {
 556         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 557         if (!comp_param[i].originbuf) {
 558             goto exit;
 559         }
 560
 561         if (deflateInit(&comp_param[i].stream,
 562                         migrate_compress_level()) != Z_OK) {
 563             g_free(comp_param[i].originbuf);
 564             goto exit;
 565         }
 566
 567         /* comp_param[i].file is just used as a dummy buffer to save data,
 568          * set its ops to empty.
 569          */
 570         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 571         comp_param[i].done = true;
 572         comp_param[i].quit = false;
 573         qemu_mutex_init(&comp_param[i].mutex);
 574         qemu_cond_init(&comp_param[i].cond);
 575         qemu_thread_create(compress_threads + i, "compress",
 576                            do_data_compress, comp_param + i,
 577                            QEMU_THREAD_JOINABLE);
 578     }
 579     return 0;
 580
 581 exit:
 582     compress_threads_save_cleanup();
 583     return -1;
 584 }
 585
 586 /**
 587  * save_page_header: write page header to wire
 588  *
 589  * If this is the 1st block, it also writes the block identification
 590  *
 591  * Returns the number of bytes written
 592  *
 593  * @f: QEMUFile where to send the data
 594  * @block: block that contains the page we want to send
 595  * @offset: offset inside the block for the page
 596  *          in the lower bits, it contains flags
 597  */
 598 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 599                                ram_addr_t offset)
 600 {
 601     size_t size, len;
 602
 603     if (block == rs->last_sent_block) {
 604         offset |= RAM_SAVE_FLAG_CONTINUE;
 605     }
 606     qemu_put_be64(f, offset);
 607     size = 8;
 608
 609     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 610         len = strlen(block->idstr);
 611         qemu_put_byte(f, len);
 612         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 613         size += 1 + len;
 614         rs->last_sent_block = block;
 615     }
 616     return size;
 617 }
 618
 619 /**
 620  * mig_throttle_guest_down: throttle down the guest
 621  *
 622  * Reduce amount of guest cpu execution to hopefully slow down memory
 623  * writes. If guest dirty memory rate is reduced below the rate at
 624  * which we can transfer pages to the destination then we should be
 625  * able to complete migration. Some workloads dirty memory way too
 626  * fast and will not effectively converge, even with auto-converge.
 627  */
 628 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 629                                     uint64_t bytes_dirty_threshold)
 630 {
 631     MigrationState *s = migrate_get_current();
 632     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 633     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 634     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 635     int pct_max = s->parameters.max_cpu_throttle;
 636
 637     uint64_t throttle_now = cpu_throttle_get_percentage();
 638     uint64_t cpu_now, cpu_ideal, throttle_inc;
 639
 640     /* We have not started throttling yet. Let's start it. */
 641     if (!cpu_throttle_active()) {
 642         cpu_throttle_set(pct_initial);
 643     } else {
 644         /* Throttling already on, just increase the rate */
 645         if (!pct_tailslow) {
 646             throttle_inc = pct_increment;
 647         } else {
 648             /* Compute the ideal CPU percentage used by Guest, which may
 649              * make the dirty rate match the dirty rate threshold. */
 650             cpu_now = 100 - throttle_now;
 651             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 652                         bytes_dirty_period);
 653             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 654         }
 655         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 656     }
 657 }
 658
 659 void mig_throttle_counter_reset(void)
 660 {
 661     RAMState *rs = ram_state;
 662
 663     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 664     rs->num_dirty_pages_period = 0;
 665     rs->bytes_xfer_prev = ram_counters.transferred;
 666 }
 667
 668 /**
 669  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 670  *
 671  * @rs: current RAM state
 672  * @current_addr: address for the zero page
 673  *
 674  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 675  * The important thing is that a stale (not-yet-0'd) page be replaced
 676  * by the new data.
 677  * As a bonus, if the page wasn't in the cache it gets added so that
 678  * when a small write is made into the 0'd page it gets XBZRLE sent.
 679  */
 680 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 681 {
 682     if (!rs->xbzrle_enabled) {
 683         return;
 684     }
 685
 686     /* We don't care if this fails to allocate a new cache page
 687      * as long as it updated an old one */
 688     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 689                  ram_counters.dirty_sync_count);
 690 }
 691
 692 #define ENCODING_FLAG_XBZRLE 0x1
 693
 694 /**
 695  * save_xbzrle_page: compress and send current page
 696  *
 697  * Returns: 1 means that we wrote the page
 698  *          0 means that page is identical to the one already sent
 699  *          -1 means that xbzrle would be longer than normal
 700  *
 701  * @rs: current RAM state
 702  * @current_data: pointer to the address of the page contents
 703  * @current_addr: addr of the page
 704  * @block: block that contains the page we want to send
 705  * @offset: offset inside the block for the page
 706  */
 707 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 708                             ram_addr_t current_addr, RAMBlock *block,
 709                             ram_addr_t offset)
 710 {
 711     int encoded_len = 0, bytes_xbzrle;
 712     uint8_t *prev_cached_page;
 713
 714     if (!cache_is_cached(XBZRLE.cache, current_addr,
 715                          ram_counters.dirty_sync_count)) {
 716         xbzrle_counters.cache_miss++;
 717         if (!rs->last_stage) {
 718             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 719                              ram_counters.dirty_sync_count) == -1) {
 720                 return -1;
 721             } else {
 722                 /* update *current_data when the page has been
 723                    inserted into cache */
 724                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 725             }
 726         }
 727         return -1;
 728     }
 729
 730     /*
 731      * Reaching here means the page has hit the xbzrle cache, no matter what
 732      * encoding result it is (normal encoding, overflow or skipping the page),
 733      * count the page as encoded. This is used to calculate the encoding rate.
 734      *
 735      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 736      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 737      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 738      * skipped page included. In this way, the encoding rate can tell if the
 739      * guest page is good for xbzrle encoding.
 740      */
 741     xbzrle_counters.pages++;
 742     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 743
 744     /* save current buffer into memory */
 745     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 746
 747     /* XBZRLE encoding (if there is no overflow) */
 748     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 749                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 750                                        TARGET_PAGE_SIZE);
 751
 752     /*
 753      * Update the cache contents, so that it corresponds to the data
 754      * sent, in all cases except where we skip the page.
 755      */
 756     if (!rs->last_stage && encoded_len != 0) {
 757         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 758         /*
 759          * In the case where we couldn't compress, ensure that the caller
 760          * sends the data from the cache, since the guest might have
 761          * changed the RAM since we copied it.
 762          */
 763         *current_data = prev_cached_page;
 764     }
 765
 766     if (encoded_len == 0) {
 767         trace_save_xbzrle_page_skipping();
 768         return 0;
 769     } else if (encoded_len == -1) {
 770         trace_save_xbzrle_page_overflow();
 771         xbzrle_counters.overflow++;
 772         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 773         return -1;
 774     }
 775
 776     /* Send XBZRLE based compressed page */
 777     bytes_xbzrle = save_page_header(rs, rs->f, block,
 778                                     offset | RAM_SAVE_FLAG_XBZRLE);
 779     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 780     qemu_put_be16(rs->f, encoded_len);
 781     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 782     bytes_xbzrle += encoded_len + 1 + 2;
 783     /*
 784      * Like compressed_size (please see update_compress_thread_counts),
 785      * the xbzrle encoded bytes don't count the 8 byte header with
 786      * RAM_SAVE_FLAG_CONTINUE.
 787      */
 788     xbzrle_counters.bytes += bytes_xbzrle - 8;
 789     ram_transferred_add(bytes_xbzrle);
 790
 791     return 1;
 792 }
 793
 794 /**
 795  * migration_bitmap_find_dirty: find the next dirty page from start
 796  *
 797  * Returns the page offset within memory region of the start of a dirty page
 798  *
 799  * @rs: current RAM state
 800  * @rb: RAMBlock where to search for dirty pages
 801  * @start: page where we start the search
 802  */
 803 static inline
 804 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 805                                           unsigned long start)
 806 {
 807     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 808     unsigned long *bitmap = rb->bmap;
 809
 810     if (ramblock_is_ignored(rb)) {
 811         return size;
 812     }
 813
 814     return find_next_bit(bitmap, size, start);
 815 }
 816
 817 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 818                                                        unsigned long page)
 819 {
 820     uint8_t shift;
 821     hwaddr size, start;
 822
 823     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 824         return;
 825     }
 826
 827     shift = rb->clear_bmap_shift;
 828     /*
 829      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 830      * can make things easier sometimes since then start address
 831      * of the small chunk will always be 64 pages aligned so the
 832      * bitmap will always be aligned to unsigned long. We should
 833      * even be able to remove this restriction but I'm simply
 834      * keeping it.
 835      */
 836     assert(shift >= 6);
 837
 838     size = 1ULL << (TARGET_PAGE_BITS + shift);
 839     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 840     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 841     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 842 }
 843
 844 static void
 845 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 846                                                  unsigned long start,
 847                                                  unsigned long npages)
 848 {
 849     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 850     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 851     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 852
 853     /*
 854      * Clear pages from start to start + npages - 1, so the end boundary is
 855      * exclusive.
 856      */
 857     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 858         migration_clear_memory_region_dirty_bitmap(rb, i);
 859     }
 860 }
 861
 862 /*
 863  * colo_bitmap_find_diry:find contiguous dirty pages from start
 864  *
 865  * Returns the page offset within memory region of the start of the contiguout
 866  * dirty page
 867  *
 868  * @rs: current RAM state
 869  * @rb: RAMBlock where to search for dirty pages
 870  * @start: page where we start the search
 871  * @num: the number of contiguous dirty pages
 872  */
 873 static inline
 874 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 875                                      unsigned long start, unsigned long *num)
 876 {
 877     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 878     unsigned long *bitmap = rb->bmap;
 879     unsigned long first, next;
 880
 881     *num = 0;
 882
 883     if (ramblock_is_ignored(rb)) {
 884         return size;
 885     }
 886
 887     first = find_next_bit(bitmap, size, start);
 888     if (first >= size) {
 889         return first;
 890     }
 891     next = find_next_zero_bit(bitmap, size, first + 1);
 892     assert(next >= first);
 893     *num = next - first;
 894     return first;
 895 }
 896
 897 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 898                                                 RAMBlock *rb,
 899                                                 unsigned long page)
 900 {
 901     bool ret;
 902
 903     /*
 904      * Clear dirty bitmap if needed.  This _must_ be called before we
 905      * send any of the page in the chunk because we need to make sure
 906      * we can capture further page content changes when we sync dirty
 907      * log the next time.  So as long as we are going to send any of
 908      * the page in the chunk we clear the remote dirty bitmap for all.
 909      * Clearing it earlier won't be a problem, but too late will.
 910      */
 911     migration_clear_memory_region_dirty_bitmap(rb, page);
 912
 913     ret = test_and_clear_bit(page, rb->bmap);
 914     if (ret) {
 915         rs->migration_dirty_pages--;
 916     }
 917
 918     return ret;
 919 }
 920
 921 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 922                                        void *opaque)
 923 {
 924     const hwaddr offset = section->offset_within_region;
 925     const hwaddr size = int128_get64(section->size);
 926     const unsigned long start = offset >> TARGET_PAGE_BITS;
 927     const unsigned long npages = size >> TARGET_PAGE_BITS;
 928     RAMBlock *rb = section->mr->ram_block;
 929     uint64_t *cleared_bits = opaque;
 930
 931     /*
 932      * We don't grab ram_state->bitmap_mutex because we expect to run
 933      * only when starting migration or during postcopy recovery where
 934      * we don't have concurrent access.
 935      */
 936     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 937         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 938     }
 939     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 940     bitmap_clear(rb->bmap, start, npages);
 941 }
 942
 943 /*
 944  * Exclude all dirty pages from migration that fall into a discarded range as
 945  * managed by a RamDiscardManager responsible for the mapped memory region of
 946  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 947  *
 948  * Discarded pages ("logically unplugged") have undefined content and must
 949  * not get migrated, because even reading these pages for migration might
 950  * result in undesired behavior.
 951  *
 952  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 953  *
 954  * Note: The result is only stable while migrating (precopy/postcopy).
 955  */
 956 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 957 {
 958     uint64_t cleared_bits = 0;
 959
 960     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 961         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 962         MemoryRegionSection section = {
 963             .mr = rb->mr,
 964             .offset_within_region = 0,
 965             .size = int128_make64(qemu_ram_get_used_length(rb)),
 966         };
 967
 968         ram_discard_manager_replay_discarded(rdm, &section,
 969                                              dirty_bitmap_clear_section,
 970                                              &cleared_bits);
 971     }
 972     return cleared_bits;
 973 }
 974
 975 /*
 976  * Check if a host-page aligned page falls into a discarded range as managed by
 977  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 978  *
 979  * Note: The result is only stable while migrating (precopy/postcopy).
 980  */
 981 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 982 {
 983     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 984         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 985         MemoryRegionSection section = {
 986             .mr = rb->mr,
 987             .offset_within_region = start,
 988             .size = int128_make64(qemu_ram_pagesize(rb)),
 989         };
 990
 991         return !ram_discard_manager_is_populated(rdm, &section);
 992     }
 993     return false;
 994 }
 995
 996 /* Called with RCU critical section */
 997 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 998 {
 999     uint64_t new_dirty_pages =
1000         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1001
1002     rs->migration_dirty_pages += new_dirty_pages;
1003     rs->num_dirty_pages_period += new_dirty_pages;
1004 }
1005
1006 /**
1007  * ram_pagesize_summary: calculate all the pagesizes of a VM
1008  *
1009  * Returns a summary bitmap of the page sizes of all RAMBlocks
1010  *
1011  * For VMs with just normal pages this is equivalent to the host page
1012  * size. If it's got some huge pages then it's the OR of all the
1013  * different page sizes.
1014  */
1015 uint64_t ram_pagesize_summary(void)
1016 {
1017     RAMBlock *block;
1018     uint64_t summary = 0;
1019
1020     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1021         summary |= block->page_size;
1022     }
1023
1024     return summary;
1025 }
1026
1027 uint64_t ram_get_total_transferred_pages(void)
1028 {
1029     return  ram_counters.normal + ram_counters.duplicate +
1030                 compression_counters.pages + xbzrle_counters.pages;
1031 }
1032
1033 static void migration_update_rates(RAMState *rs, int64_t end_time)
1034 {
1035     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1036     double compressed_size;
1037
1038     /* calculate period counters */
1039     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1040                 / (end_time - rs->time_last_bitmap_sync);
1041
1042     if (!page_count) {
1043         return;
1044     }
1045
1046     if (migrate_use_xbzrle()) {
1047         double encoded_size, unencoded_size;
1048
1049         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1050             rs->xbzrle_cache_miss_prev) / page_count;
1051         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1052         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1053                          TARGET_PAGE_SIZE;
1054         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1055         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1056             xbzrle_counters.encoding_rate = 0;
1057         } else {
1058             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1059         }
1060         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1061         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1062     }
1063
1064     if (migrate_use_compression()) {
1065         compression_counters.busy_rate = (double)(compression_counters.busy -
1066             rs->compress_thread_busy_prev) / page_count;
1067         rs->compress_thread_busy_prev = compression_counters.busy;
1068
1069         compressed_size = compression_counters.compressed_size -
1070                           rs->compressed_size_prev;
1071         if (compressed_size) {
1072             double uncompressed_size = (compression_counters.pages -
1073                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1074
1075             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1076             compression_counters.compression_rate =
1077                                         uncompressed_size / compressed_size;
1078
1079             rs->compress_pages_prev = compression_counters.pages;
1080             rs->compressed_size_prev = compression_counters.compressed_size;
1081         }
1082     }
1083 }
1084
1085 static void migration_trigger_throttle(RAMState *rs)
1086 {
1087     MigrationState *s = migrate_get_current();
1088     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1089
1090     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1091     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1092     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1093
1094     /* During block migration the auto-converge logic incorrectly detects
1095      * that ram migration makes no progress. Avoid this by disabling the
1096      * throttling logic during the bulk phase of block migration. */
1097     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1098         /* The following detection logic can be refined later. For now:
1099            Check to see if the ratio between dirtied bytes and the approx.
1100            amount of bytes that just got transferred since the last time
1101            we were in this routine reaches the threshold. If that happens
1102            twice, start or increase throttling. */
1103
1104         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1105             (++rs->dirty_rate_high_cnt >= 2)) {
1106             trace_migration_throttle();
1107             rs->dirty_rate_high_cnt = 0;
1108             mig_throttle_guest_down(bytes_dirty_period,
1109                                     bytes_dirty_threshold);
1110         }
1111     }
1112 }
1113
1114 static void migration_bitmap_sync(RAMState *rs)
1115 {
1116     RAMBlock *block;
1117     int64_t end_time;
1118
1119     ram_counters.dirty_sync_count++;
1120
1121     if (!rs->time_last_bitmap_sync) {
1122         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123     }
1124
1125     trace_migration_bitmap_sync_start();
1126     memory_global_dirty_log_sync();
1127
1128     qemu_mutex_lock(&rs->bitmap_mutex);
1129     WITH_RCU_READ_LOCK_GUARD() {
1130         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1131             ramblock_sync_dirty_bitmap(rs, block);
1132         }
1133         ram_counters.remaining = ram_bytes_remaining();
1134     }
1135     qemu_mutex_unlock(&rs->bitmap_mutex);
1136
1137     memory_global_after_dirty_log_sync();
1138     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1139
1140     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1141
1142     /* more than 1 second = 1000 millisecons */
1143     if (end_time > rs->time_last_bitmap_sync + 1000) {
1144         migration_trigger_throttle(rs);
1145
1146         migration_update_rates(rs, end_time);
1147
1148         rs->target_page_count_prev = rs->target_page_count;
1149
1150         /* reset period counters */
1151         rs->time_last_bitmap_sync = end_time;
1152         rs->num_dirty_pages_period = 0;
1153         rs->bytes_xfer_prev = ram_counters.transferred;
1154     }
1155     if (migrate_use_events()) {
1156         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1157     }
1158 }
1159
1160 static void migration_bitmap_sync_precopy(RAMState *rs)
1161 {
1162     Error *local_err = NULL;
1163
1164     /*
1165      * The current notifier usage is just an optimization to migration, so we
1166      * don't stop the normal migration process in the error case.
1167      */
1168     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1169         error_report_err(local_err);
1170         local_err = NULL;
1171     }
1172
1173     migration_bitmap_sync(rs);
1174
1175     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1176         error_report_err(local_err);
1177     }
1178 }
1179
1180 static void ram_release_page(const char *rbname, uint64_t offset)
1181 {
1182     if (!migrate_release_ram() || !migration_in_postcopy()) {
1183         return;
1184     }
1185
1186     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1187 }
1188
1189 /**
1190  * save_zero_page_to_file: send the zero page to the file
1191  *
1192  * Returns the size of data written to the file, 0 means the page is not
1193  * a zero page
1194  *
1195  * @rs: current RAM state
1196  * @file: the file where the data is saved
1197  * @block: block that contains the page we want to send
1198  * @offset: offset inside the block for the page
1199  */
1200 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1201                                   RAMBlock *block, ram_addr_t offset)
1202 {
1203     uint8_t *p = block->host + offset;
1204     int len = 0;
1205
1206     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1207         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1208         qemu_put_byte(file, 0);
1209         len += 1;
1210         ram_release_page(block->idstr, offset);
1211     }
1212     return len;
1213 }
1214
1215 /**
1216  * save_zero_page: send the zero page to the stream
1217  *
1218  * Returns the number of pages written.
1219  *
1220  * @rs: current RAM state
1221  * @block: block that contains the page we want to send
1222  * @offset: offset inside the block for the page
1223  */
1224 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1225 {
1226     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1227
1228     if (len) {
1229         ram_counters.duplicate++;
1230         ram_transferred_add(len);
1231         return 1;
1232     }
1233     return -1;
1234 }
1235
1236 /*
1237  * @pages: the number of pages written by the control path,
1238  *        < 0 - error
1239  *        > 0 - number of pages written
1240  *
1241  * Return true if the pages has been saved, otherwise false is returned.
1242  */
1243 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1244                               int *pages)
1245 {
1246     uint64_t bytes_xmit = 0;
1247     int ret;
1248
1249     *pages = -1;
1250     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1251                                 &bytes_xmit);
1252     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1253         return false;
1254     }
1255
1256     if (bytes_xmit) {
1257         ram_transferred_add(bytes_xmit);
1258         *pages = 1;
1259     }
1260
1261     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1262         return true;
1263     }
1264
1265     if (bytes_xmit > 0) {
1266         ram_counters.normal++;
1267     } else if (bytes_xmit == 0) {
1268         ram_counters.duplicate++;
1269     }
1270
1271     return true;
1272 }
1273
1274 /*
1275  * directly send the page to the stream
1276  *
1277  * Returns the number of pages written.
1278  *
1279  * @rs: current RAM state
1280  * @block: block that contains the page we want to send
1281  * @offset: offset inside the block for the page
1282  * @buf: the page to be sent
1283  * @async: send to page asyncly
1284  */
1285 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1286                             uint8_t *buf, bool async)
1287 {
1288     ram_transferred_add(save_page_header(rs, rs->f, block,
1289                                          offset | RAM_SAVE_FLAG_PAGE));
1290     if (async) {
1291         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1292                               migrate_release_ram() &
1293                               migration_in_postcopy());
1294     } else {
1295         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1296     }
1297     ram_transferred_add(TARGET_PAGE_SIZE);
1298     ram_counters.normal++;
1299     return 1;
1300 }
1301
1302 /**
1303  * ram_save_page: send the given page to the stream
1304  *
1305  * Returns the number of pages written.
1306  *          < 0 - error
1307  *          >=0 - Number of pages written - this might legally be 0
1308  *                if xbzrle noticed the page was the same.
1309  *
1310  * @rs: current RAM state
1311  * @block: block that contains the page we want to send
1312  * @offset: offset inside the block for the page
1313  */
1314 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1315 {
1316     int pages = -1;
1317     uint8_t *p;
1318     bool send_async = true;
1319     RAMBlock *block = pss->block;
1320     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1321     ram_addr_t current_addr = block->offset + offset;
1322
1323     p = block->host + offset;
1324     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1325
1326     XBZRLE_cache_lock();
1327     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1328         pages = save_xbzrle_page(rs, &p, current_addr, block,
1329                                  offset);
1330         if (!rs->last_stage) {
1331             /* Can't send this cached data async, since the cache page
1332              * might get updated before it gets to the wire
1333              */
1334             send_async = false;
1335         }
1336     }
1337
1338     /* XBZRLE overflow or normal page */
1339     if (pages == -1) {
1340         pages = save_normal_page(rs, block, offset, p, send_async);
1341     }
1342
1343     XBZRLE_cache_unlock();
1344
1345     return pages;
1346 }
1347
1348 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1349                                  ram_addr_t offset)
1350 {
1351     if (multifd_queue_page(rs->f, block, offset) < 0) {
1352         return -1;
1353     }
1354     ram_counters.normal++;
1355
1356     return 1;
1357 }
1358
1359 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1360                                  ram_addr_t offset, uint8_t *source_buf)
1361 {
1362     RAMState *rs = ram_state;
1363     uint8_t *p = block->host + offset;
1364     int ret;
1365
1366     if (save_zero_page_to_file(rs, f, block, offset)) {
1367         return true;
1368     }
1369
1370     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1371
1372     /*
1373      * copy it to a internal buffer to avoid it being modified by VM
1374      * so that we can catch up the error during compression and
1375      * decompression
1376      */
1377     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1378     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1379     if (ret < 0) {
1380         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1381         error_report("compressed data failed!");
1382     }
1383     return false;
1384 }
1385
1386 static void
1387 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1388 {
1389     ram_transferred_add(bytes_xmit);
1390
1391     if (param->zero_page) {
1392         ram_counters.duplicate++;
1393         return;
1394     }
1395
1396     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1397     compression_counters.compressed_size += bytes_xmit - 8;
1398     compression_counters.pages++;
1399 }
1400
1401 static bool save_page_use_compression(RAMState *rs);
1402
1403 static void flush_compressed_data(RAMState *rs)
1404 {
1405     int idx, len, thread_count;
1406
1407     if (!save_page_use_compression(rs)) {
1408         return;
1409     }
1410     thread_count = migrate_compress_threads();
1411
1412     qemu_mutex_lock(&comp_done_lock);
1413     for (idx = 0; idx < thread_count; idx++) {
1414         while (!comp_param[idx].done) {
1415             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1416         }
1417     }
1418     qemu_mutex_unlock(&comp_done_lock);
1419
1420     for (idx = 0; idx < thread_count; idx++) {
1421         qemu_mutex_lock(&comp_param[idx].mutex);
1422         if (!comp_param[idx].quit) {
1423             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1424             /*
1425              * it's safe to fetch zero_page without holding comp_done_lock
1426              * as there is no further request submitted to the thread,
1427              * i.e, the thread should be waiting for a request at this point.
1428              */
1429             update_compress_thread_counts(&comp_param[idx], len);
1430         }
1431         qemu_mutex_unlock(&comp_param[idx].mutex);
1432     }
1433 }
1434
1435 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1436                                        ram_addr_t offset)
1437 {
1438     param->block = block;
1439     param->offset = offset;
1440 }
1441
1442 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1443                                            ram_addr_t offset)
1444 {
1445     int idx, thread_count, bytes_xmit = -1, pages = -1;
1446     bool wait = migrate_compress_wait_thread();
1447
1448     thread_count = migrate_compress_threads();
1449     qemu_mutex_lock(&comp_done_lock);
1450 retry:
1451     for (idx = 0; idx < thread_count; idx++) {
1452         if (comp_param[idx].done) {
1453             comp_param[idx].done = false;
1454             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1455             qemu_mutex_lock(&comp_param[idx].mutex);
1456             set_compress_params(&comp_param[idx], block, offset);
1457             qemu_cond_signal(&comp_param[idx].cond);
1458             qemu_mutex_unlock(&comp_param[idx].mutex);
1459             pages = 1;
1460             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1461             break;
1462         }
1463     }
1464
1465     /*
1466      * wait for the free thread if the user specifies 'compress-wait-thread',
1467      * otherwise we will post the page out in the main thread as normal page.
1468      */
1469     if (pages < 0 && wait) {
1470         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1471         goto retry;
1472     }
1473     qemu_mutex_unlock(&comp_done_lock);
1474
1475     return pages;
1476 }
1477
1478 /**
1479  * find_dirty_block: find the next dirty page and update any state
1480  * associated with the search process.
1481  *
1482  * Returns true if a page is found
1483  *
1484  * @rs: current RAM state
1485  * @pss: data about the state of the current dirty page scan
1486  * @again: set to false if the search has scanned the whole of RAM
1487  */
1488 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1489 {
1490     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1491     if (pss->complete_round && pss->block == rs->last_seen_block &&
1492         pss->page >= rs->last_page) {
1493         /*
1494          * We've been once around the RAM and haven't found anything.
1495          * Give up.
1496          */
1497         *again = false;
1498         return false;
1499     }
1500     if (!offset_in_ramblock(pss->block,
1501                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1502         /* Didn't find anything in this RAM Block */
1503         pss->page = 0;
1504         pss->block = QLIST_NEXT_RCU(pss->block, next);
1505         if (!pss->block) {
1506             /*
1507              * If memory migration starts over, we will meet a dirtied page
1508              * which may still exists in compression threads's ring, so we
1509              * should flush the compressed data to make sure the new page
1510              * is not overwritten by the old one in the destination.
1511              *
1512              * Also If xbzrle is on, stop using the data compression at this
1513              * point. In theory, xbzrle can do better than compression.
1514              */
1515             flush_compressed_data(rs);
1516
1517             /* Hit the end of the list */
1518             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1519             /* Flag that we've looped */
1520             pss->complete_round = true;
1521             /* After the first round, enable XBZRLE. */
1522             if (migrate_use_xbzrle()) {
1523                 rs->xbzrle_enabled = true;
1524             }
1525         }
1526         /* Didn't find anything this time, but try again on the new block */
1527         *again = true;
1528         return false;
1529     } else {
1530         /* Can go around again, but... */
1531         *again = true;
1532         /* We've found something so probably don't need to */
1533         return true;
1534     }
1535 }
1536
1537 /**
1538  * unqueue_page: gets a page of the queue
1539  *
1540  * Helper for 'get_queued_page' - gets a page off the queue
1541  *
1542  * Returns the block of the page (or NULL if none available)
1543  *
1544  * @rs: current RAM state
1545  * @offset: used to return the offset within the RAMBlock
1546  */
1547 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1548 {
1549     struct RAMSrcPageRequest *entry;
1550     RAMBlock *block = NULL;
1551     size_t page_size;
1552
1553     if (!postcopy_has_request(rs)) {
1554         return NULL;
1555     }
1556
1557     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1558
1559     /*
1560      * This should _never_ change even after we take the lock, because no one
1561      * should be taking anything off the request list other than us.
1562      */
1563     assert(postcopy_has_request(rs));
1564
1565     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1566     block = entry->rb;
1567     *offset = entry->offset;
1568     page_size = qemu_ram_pagesize(block);
1569     /* Each page request should only be multiple page size of the ramblock */
1570     assert((entry->len % page_size) == 0);
1571
1572     if (entry->len > page_size) {
1573         entry->len -= page_size;
1574         entry->offset += page_size;
1575     } else {
1576         memory_region_unref(block->mr);
1577         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1578         g_free(entry);
1579         migration_consume_urgent_request();
1580     }
1581
1582     trace_unqueue_page(block->idstr, *offset,
1583                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1584
1585     return block;
1586 }
1587
1588 #if defined(__linux__)
1589 /**
1590  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1591  *   is found, return RAM block pointer and page offset
1592  *
1593  * Returns pointer to the RAMBlock containing faulting page,
1594  *   NULL if no write faults are pending
1595  *
1596  * @rs: current RAM state
1597  * @offset: page offset from the beginning of the block
1598  */
1599 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1600 {
1601     struct uffd_msg uffd_msg;
1602     void *page_address;
1603     RAMBlock *block;
1604     int res;
1605
1606     if (!migrate_background_snapshot()) {
1607         return NULL;
1608     }
1609
1610     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1611     if (res <= 0) {
1612         return NULL;
1613     }
1614
1615     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1616     block = qemu_ram_block_from_host(page_address, false, offset);
1617     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1618     return block;
1619 }
1620
1621 /**
1622  * ram_save_release_protection: release UFFD write protection after
1623  *   a range of pages has been saved
1624  *
1625  * @rs: current RAM state
1626  * @pss: page-search-status structure
1627  * @start_page: index of the first page in the range relative to pss->block
1628  *
1629  * Returns 0 on success, negative value in case of an error
1630 */
1631 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1632         unsigned long start_page)
1633 {
1634     int res = 0;
1635
1636     /* Check if page is from UFFD-managed region. */
1637     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1638         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1639         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1640
1641         /* Flush async buffers before un-protect. */
1642         qemu_fflush(rs->f);
1643         /* Un-protect memory range. */
1644         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1645                 false, false);
1646     }
1647
1648     return res;
1649 }
1650
1651 /* ram_write_tracking_available: check if kernel supports required UFFD features
1652  *
1653  * Returns true if supports, false otherwise
1654  */
1655 bool ram_write_tracking_available(void)
1656 {
1657     uint64_t uffd_features;
1658     int res;
1659
1660     res = uffd_query_features(&uffd_features);
1661     return (res == 0 &&
1662             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1663 }
1664
1665 /* ram_write_tracking_compatible: check if guest configuration is
1666  *   compatible with 'write-tracking'
1667  *
1668  * Returns true if compatible, false otherwise
1669  */
1670 bool ram_write_tracking_compatible(void)
1671 {
1672     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1673     int uffd_fd;
1674     RAMBlock *block;
1675     bool ret = false;
1676
1677     /* Open UFFD file descriptor */
1678     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1679     if (uffd_fd < 0) {
1680         return false;
1681     }
1682
1683     RCU_READ_LOCK_GUARD();
1684
1685     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1686         uint64_t uffd_ioctls;
1687
1688         /* Nothing to do with read-only and MMIO-writable regions */
1689         if (block->mr->readonly || block->mr->rom_device) {
1690             continue;
1691         }
1692         /* Try to register block memory via UFFD-IO to track writes */
1693         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1694                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1695             goto out;
1696         }
1697         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1698             goto out;
1699         }
1700     }
1701     ret = true;
1702
1703 out:
1704     uffd_close_fd(uffd_fd);
1705     return ret;
1706 }
1707
1708 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1709                                        ram_addr_t size)
1710 {
1711     /*
1712      * We read one byte of each page; this will preallocate page tables if
1713      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1714      * where no page was populated yet. This might require adaption when
1715      * supporting other mappings, like shmem.
1716      */
1717     for (; offset < size; offset += block->page_size) {
1718         char tmp = *((char *)block->host + offset);
1719
1720         /* Don't optimize the read out */
1721         asm volatile("" : "+r" (tmp));
1722     }
1723 }
1724
1725 static inline int populate_read_section(MemoryRegionSection *section,
1726                                         void *opaque)
1727 {
1728     const hwaddr size = int128_get64(section->size);
1729     hwaddr offset = section->offset_within_region;
1730     RAMBlock *block = section->mr->ram_block;
1731
1732     populate_read_range(block, offset, size);
1733     return 0;
1734 }
1735
1736 /*
1737  * ram_block_populate_read: preallocate page tables and populate pages in the
1738  *   RAM block by reading a byte of each page.
1739  *
1740  * Since it's solely used for userfault_fd WP feature, here we just
1741  *   hardcode page size to qemu_real_host_page_size.
1742  *
1743  * @block: RAM block to populate
1744  */
1745 static void ram_block_populate_read(RAMBlock *rb)
1746 {
1747     /*
1748      * Skip populating all pages that fall into a discarded range as managed by
1749      * a RamDiscardManager responsible for the mapped memory region of the
1750      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1751      * must not get populated automatically. We don't have to track
1752      * modifications via userfaultfd WP reliably, because these pages will
1753      * not be part of the migration stream either way -- see
1754      * ramblock_dirty_bitmap_exclude_discarded_pages().
1755      *
1756      * Note: The result is only stable while migrating (precopy/postcopy).
1757      */
1758     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1759         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1760         MemoryRegionSection section = {
1761             .mr = rb->mr,
1762             .offset_within_region = 0,
1763             .size = rb->mr->size,
1764         };
1765
1766         ram_discard_manager_replay_populated(rdm, &section,
1767                                              populate_read_section, NULL);
1768     } else {
1769         populate_read_range(rb, 0, rb->used_length);
1770     }
1771 }
1772
1773 /*
1774  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1775  */
1776 void ram_write_tracking_prepare(void)
1777 {
1778     RAMBlock *block;
1779
1780     RCU_READ_LOCK_GUARD();
1781
1782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1783         /* Nothing to do with read-only and MMIO-writable regions */
1784         if (block->mr->readonly || block->mr->rom_device) {
1785             continue;
1786         }
1787
1788         /*
1789          * Populate pages of the RAM block before enabling userfault_fd
1790          * write protection.
1791          *
1792          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1793          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1794          * pages with pte_none() entries in page table.
1795          */
1796         ram_block_populate_read(block);
1797     }
1798 }
1799
1800 /*
1801  * ram_write_tracking_start: start UFFD-WP memory tracking
1802  *
1803  * Returns 0 for success or negative value in case of error
1804  */
1805 int ram_write_tracking_start(void)
1806 {
1807     int uffd_fd;
1808     RAMState *rs = ram_state;
1809     RAMBlock *block;
1810
1811     /* Open UFFD file descriptor */
1812     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1813     if (uffd_fd < 0) {
1814         return uffd_fd;
1815     }
1816     rs->uffdio_fd = uffd_fd;
1817
1818     RCU_READ_LOCK_GUARD();
1819
1820     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1821         /* Nothing to do with read-only and MMIO-writable regions */
1822         if (block->mr->readonly || block->mr->rom_device) {
1823             continue;
1824         }
1825
1826         /* Register block memory with UFFD to track writes */
1827         if (uffd_register_memory(rs->uffdio_fd, block->host,
1828                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1829             goto fail;
1830         }
1831         /* Apply UFFD write protection to the block memory range */
1832         if (uffd_change_protection(rs->uffdio_fd, block->host,
1833                 block->max_length, true, false)) {
1834             goto fail;
1835         }
1836         block->flags |= RAM_UF_WRITEPROTECT;
1837         memory_region_ref(block->mr);
1838
1839         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1840                 block->host, block->max_length);
1841     }
1842
1843     return 0;
1844
1845 fail:
1846     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1847
1848     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1850             continue;
1851         }
1852         /*
1853          * In case some memory block failed to be write-protected
1854          * remove protection and unregister all succeeded RAM blocks
1855          */
1856         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1857                 false, false);
1858         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1859         /* Cleanup flags and remove reference */
1860         block->flags &= ~RAM_UF_WRITEPROTECT;
1861         memory_region_unref(block->mr);
1862     }
1863
1864     uffd_close_fd(uffd_fd);
1865     rs->uffdio_fd = -1;
1866     return -1;
1867 }
1868
1869 /**
1870  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1871  */
1872 void ram_write_tracking_stop(void)
1873 {
1874     RAMState *rs = ram_state;
1875     RAMBlock *block;
1876
1877     RCU_READ_LOCK_GUARD();
1878
1879     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1880         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1881             continue;
1882         }
1883         /* Remove protection and unregister all affected RAM blocks */
1884         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1885                 false, false);
1886         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1887
1888         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1889                 block->host, block->max_length);
1890
1891         /* Cleanup flags and remove reference */
1892         block->flags &= ~RAM_UF_WRITEPROTECT;
1893         memory_region_unref(block->mr);
1894     }
1895
1896     /* Finally close UFFD file descriptor */
1897     uffd_close_fd(rs->uffdio_fd);
1898     rs->uffdio_fd = -1;
1899 }
1900
1901 #else
1902 /* No target OS support, stubs just fail or ignore */
1903
/*
 * Stub for builds without write tracking support: there is never a
 * faulting page to report, so this always returns NULL and leaves
 * *offset untouched.
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}
1911
/*
 * Stub for builds without write tracking support: there is no protection
 * to release, so this ignores its arguments and reports success (0).
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}
1921
/* Stub: write tracking is never available in this build configuration. */
bool ram_write_tracking_available(void)
{
    return false;
}
1926
/*
 * Stub: must not be called in this build configuration, hence the
 * assert(0). The return value only matters if assertions are compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1932
/*
 * Stub: must not be called in this build configuration, hence the
 * assert(0). Reports failure (-1) if assertions are compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1938
/* Stub: must not be called in this build configuration, hence the assert(0). */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1943 #endif /* defined(__linux__) */
1944
1945 /**
1946  * get_queued_page: unqueue a page from the postcopy requests
1947  *
1948  * Skips pages that are already sent (!dirty)
1949  *
1950  * Returns true if a queued page is found
1951  *
1952  * @rs: current RAM state
1953  * @pss: data about the state of the current dirty page scan
1954  */
1955 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1956 {
1957     RAMBlock  *block;
1958     ram_addr_t offset;
1959
1960     block = unqueue_page(rs, &offset);
1961
1962     if (!block) {
1963         /*
1964          * Poll write faults too if background snapshot is enabled; that's
1965          * when we have vcpus got blocked by the write protected pages.
1966          */
1967         block = poll_fault_page(rs, &offset);
1968     }
1969
1970     if (block) {
1971         /*
1972          * We want the background search to continue from the queued page
1973          * since the guest is likely to want other pages near to the page
1974          * it just requested.
1975          */
1976         pss->block = block;
1977         pss->page = offset >> TARGET_PAGE_BITS;
1978
1979         /*
1980          * This unqueued page would break the "one round" check, even is
1981          * really rare.
1982          */
1983         pss->complete_round = false;
1984     }
1985
1986     return !!block;
1987 }
1988
1989 /**
1990  * migration_page_queue_free: drop any remaining pages in the ram
1991  * request queue
1992  *
1993  * It should be empty at the end anyway, but in error cases there may
1994  * be some left.  in case that there is any page left, we drop it.
1995  *
1996  */
1997 static void migration_page_queue_free(RAMState *rs)
1998 {
1999     struct RAMSrcPageRequest *mspr, *next_mspr;
2000     /* This queue generally should be empty - but in the case of a failed
2001      * migration might have some droppings in.
2002      */
2003     RCU_READ_LOCK_GUARD();
2004     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2005         memory_region_unref(mspr->rb->mr);
2006         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2007         g_free(mspr);
2008     }
2009 }
2010
2011 /**
2012  * ram_save_queue_pages: queue the page for transmission
2013  *
2014  * A request from postcopy destination for example.
2015  *
2016  * Returns zero on success or negative on error
2017  *
2018  * @rbname: Name of the RAMBLock of the request. NULL means the
2019  *          same that last one.
2020  * @start: starting address from the start of the RAMBlock
2021  * @len: length (in bytes) to send
2022  */
2023 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2024 {
2025     RAMBlock *ramblock;
2026     RAMState *rs = ram_state;
2027
2028     ram_counters.postcopy_requests++;
2029     RCU_READ_LOCK_GUARD();
2030
2031     if (!rbname) {
2032         /* Reuse last RAMBlock */
2033         ramblock = rs->last_req_rb;
2034
2035         if (!ramblock) {
2036             /*
2037              * Shouldn't happen, we can't reuse the last RAMBlock if
2038              * it's the 1st request.
2039              */
2040             error_report("ram_save_queue_pages no previous block");
2041             return -1;
2042         }
2043     } else {
2044         ramblock = qemu_ram_block_by_name(rbname);
2045
2046         if (!ramblock) {
2047             /* We shouldn't be asked for a non-existent RAMBlock */
2048             error_report("ram_save_queue_pages no block '%s'", rbname);
2049             return -1;
2050         }
2051         rs->last_req_rb = ramblock;
2052     }
2053     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2054     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2055         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2056                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2057                      __func__, start, len, ramblock->used_length);
2058         return -1;
2059     }
2060
2061     struct RAMSrcPageRequest *new_entry =
2062         g_malloc0(sizeof(struct RAMSrcPageRequest));
2063     new_entry->rb = ramblock;
2064     new_entry->offset = start;
2065     new_entry->len = len;
2066
2067     memory_region_ref(ramblock->mr);
2068     qemu_mutex_lock(&rs->src_page_req_mutex);
2069     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2070     migration_make_urgent_request();
2071     qemu_mutex_unlock(&rs->src_page_req_mutex);
2072
2073     return 0;
2074 }
2075
2076 static bool save_page_use_compression(RAMState *rs)
2077 {
2078     if (!migrate_use_compression()) {
2079         return false;
2080     }
2081
2082     /*
2083      * If xbzrle is enabled (e.g., after first round of migration), stop
2084      * using the data compression. In theory, xbzrle can do better than
2085      * compression.
2086      */
2087     if (rs->xbzrle_enabled) {
2088         return false;
2089     }
2090
2091     return true;
2092 }
2093
2094 /*
2095  * try to compress the page before posting it out, return true if the page
2096  * has been properly handled by compression, otherwise needs other
2097  * paths to handle it
2098  */
2099 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2100 {
2101     if (!save_page_use_compression(rs)) {
2102         return false;
2103     }
2104
2105     /*
2106      * When starting the process of a new block, the first page of
2107      * the block should be sent out before other pages in the same
2108      * block, and all the pages in last block should have been sent
2109      * out, keeping this order is important, because the 'cont' flag
2110      * is used to avoid resending the block name.
2111      *
2112      * We post the fist page as normal page as compression will take
2113      * much CPU resource.
2114      */
2115     if (block != rs->last_sent_block) {
2116         flush_compressed_data(rs);
2117         return false;
2118     }
2119
2120     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2121         return true;
2122     }
2123
2124     compression_counters.busy++;
2125     return false;
2126 }
2127
2128 /**
2129  * ram_save_target_page: save one target page
2130  *
2131  * Returns the number of pages written
2132  *
2133  * @rs: current RAM state
2134  * @pss: data about the page we want to send
2135  */
2136 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2137 {
2138     RAMBlock *block = pss->block;
2139     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2140     int res;
2141
2142     if (control_save_page(rs, block, offset, &res)) {
2143         return res;
2144     }
2145
2146     if (save_compress_page(rs, block, offset)) {
2147         return 1;
2148     }
2149
2150     res = save_zero_page(rs, block, offset);
2151     if (res > 0) {
2152         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2153          * page would be stale
2154          */
2155         if (!save_page_use_compression(rs)) {
2156             XBZRLE_cache_lock();
2157             xbzrle_cache_zero_page(rs, block->offset + offset);
2158             XBZRLE_cache_unlock();
2159         }
2160         return res;
2161     }
2162
2163     /*
2164      * Do not use multifd for:
2165      * 1. Compression as the first page in the new block should be posted out
2166      *    before sending the compressed page
2167      * 2. In postcopy as one whole host page should be placed
2168      */
2169     if (!save_page_use_compression(rs) && migrate_use_multifd()
2170         && !migration_in_postcopy()) {
2171         return ram_save_multifd_page(rs, block, offset);
2172     }
2173
2174     return ram_save_page(rs, pss);
2175 }
2176
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at pss->page send pages up to the end of the current host
 * page. It's valid for the initial page to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * On return pss->page has been advanced; it is never left beyond the
 * boundary of the host page that was being sent.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
{
    int tmppages, pages = 0;
    /* Despite the name: the block's page size counted in target pages */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    /* Index of the first target page after the current host page */
    unsigned long hostpage_boundary =
        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
    /* Remember where we started for ram_save_release_protection() */
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check whether the page is dirty and, if it is, send it */
        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            tmppages = ram_save_target_page(rs, pss);
            if (tmppages < 0) {
                /* Note: this path returns before the release call below */
                return tmppages;
            }

            pages += tmppages;
            /*
             * Allow rate limiting to happen in the middle of huge pages if
             * something is sent in the current iteration.
             */
            if (pagesize_bits > 1 && tmppages > 0) {
                migration_rate_limit();
            }
        }
        /* Move on to the next dirty page, which may be past the boundary */
        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    } while ((pss->page < hostpage_boundary) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the min boundary of host page and block */
    pss->page = MIN(pss->page, hostpage_boundary);

    /* A failure here takes precedence over the page count */
    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
2235
2236 /**
2237  * ram_find_and_save_block: finds a dirty page and sends it to f
2238  *
2239  * Called within an RCU critical section.
2240  *
2241  * Returns the number of pages written where zero means no dirty pages,
2242  * or negative on error
2243  *
2244  * @rs: current RAM state
2245  *
2246  * On systems where host-page-size > target-page-size it will send all the
2247  * pages in a host page that are dirty.
2248  */
2249 static int ram_find_and_save_block(RAMState *rs)
2250 {
2251     PageSearchStatus pss;
2252     int pages = 0;
2253     bool again, found;
2254
2255     /* No dirty page as there is zero RAM */
2256     if (!ram_bytes_total()) {
2257         return pages;
2258     }
2259
2260     pss.block = rs->last_seen_block;
2261     pss.page = rs->last_page;
2262     pss.complete_round = false;
2263
2264     if (!pss.block) {
2265         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2266     }
2267
2268     do {
2269         again = true;
2270         found = get_queued_page(rs, &pss);
2271
2272         if (!found) {
2273             /* priority queue empty, so just search for something dirty */
2274             found = find_dirty_block(rs, &pss, &again);
2275         }
2276
2277         if (found) {
2278             pages = ram_save_host_page(rs, &pss);
2279         }
2280     } while (!pages && again);
2281
2282     rs->last_seen_block = pss.block;
2283     rs->last_page = pss.page;
2284
2285     return pages;
2286 }
2287
2288 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2289 {
2290     uint64_t pages = size / TARGET_PAGE_SIZE;
2291
2292     if (zero) {
2293         ram_counters.duplicate += pages;
2294     } else {
2295         ram_counters.normal += pages;
2296         ram_transferred_add(size);
2297         qemu_update_position(f, size);
2298     }
2299 }
2300
2301 static uint64_t ram_bytes_total_common(bool count_ignored)
2302 {
2303     RAMBlock *block;
2304     uint64_t total = 0;
2305
2306     RCU_READ_LOCK_GUARD();
2307
2308     if (count_ignored) {
2309         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2310             total += block->used_length;
2311         }
2312     } else {
2313         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2314             total += block->used_length;
2315         }
2316     }
2317     return total;
2318 }
2319
/* Total used_length, in bytes, of all RAM blocks that are not ignored. */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
2324
/* Allocate the one-target-page XBZRLE decode buffer (not zero-filled). */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2329
/* Free the XBZRLE decode buffer and clear the pointer to it. */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2335
2336 static void ram_state_cleanup(RAMState **rsp)
2337 {
2338     if (*rsp) {
2339         migration_page_queue_free(*rsp);
2340         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2341         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2342         g_free(*rsp);
2343         *rsp = NULL;
2344     }
2345 }
2346
2347 static void xbzrle_cleanup(void)
2348 {
2349     XBZRLE_cache_lock();
2350     if (XBZRLE.cache) {
2351         cache_fini(XBZRLE.cache);
2352         g_free(XBZRLE.encoded_buf);
2353         g_free(XBZRLE.current_buf);
2354         g_free(XBZRLE.zero_target_page);
2355         XBZRLE.cache = NULL;
2356         XBZRLE.encoded_buf = NULL;
2357         XBZRLE.current_buf = NULL;
2358         XBZRLE.zero_target_page = NULL;
2359     }
2360     XBZRLE_cache_unlock();
2361 }
2362
2363 static void ram_save_cleanup(void *opaque)
2364 {
2365     RAMState **rsp = opaque;
2366     RAMBlock *block;
2367
2368     /* We don't use dirty log with background snapshots */
2369     if (!migrate_background_snapshot()) {
2370         /* caller have hold iothread lock or is in a bh, so there is
2371          * no writing race against the migration bitmap
2372          */
2373         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2374             /*
2375              * do not stop dirty log without starting it, since
2376              * memory_global_dirty_log_stop will assert that
2377              * memory_global_dirty_log_start/stop used in pairs
2378              */
2379             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2380         }
2381     }
2382
2383     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2384         g_free(block->clear_bmap);
2385         block->clear_bmap = NULL;
2386         g_free(block->bmap);
2387         block->bmap = NULL;
2388     }
2389
2390     xbzrle_cleanup();
2391     compress_threads_save_cleanup();
2392     ram_state_cleanup(rsp);
2393 }
2394
/*
 * ram_state_reset: put the scan state of @rs back to its starting point
 *
 * Forgets the last seen/sent blocks and the last page, records the current
 * version of the RAM list and marks xbzrle as not enabled.
 */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->xbzrle_enabled = false;
}
2403
2404 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2405
2406 /* **** functions for postcopy ***** */
2407
2408 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2409 {
2410     struct RAMBlock *block;
2411
2412     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2413         unsigned long *bitmap = block->bmap;
2414         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2415         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2416
2417         while (run_start < range) {
2418             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2419             ram_discard_range(block->idstr,
2420                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2421                               ((ram_addr_t)(run_end - run_start))
2422                                 << TARGET_PAGE_BITS);
2423             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2424         }
2425     }
2426 }
2427
2428 /**
2429  * postcopy_send_discard_bm_ram: discard a RAMBlock
2430  *
2431  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2432  *
2433  * @ms: current migration state
2434  * @block: RAMBlock to discard
2435  */
2436 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2437 {
2438     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2439     unsigned long current;
2440     unsigned long *bitmap = block->bmap;
2441
2442     for (current = 0; current < end; ) {
2443         unsigned long one = find_next_bit(bitmap, end, current);
2444         unsigned long zero, discard_length;
2445
2446         if (one >= end) {
2447             break;
2448         }
2449
2450         zero = find_next_zero_bit(bitmap, end, one + 1);
2451
2452         if (zero >= end) {
2453             discard_length = end - one;
2454         } else {
2455             discard_length = zero - one;
2456         }
2457         postcopy_discard_send_range(ms, one, discard_length);
2458         current = one + discard_length;
2459     }
2460 }
2461
2462 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2463
2464 /**
2465  * postcopy_each_ram_send_discard: discard all RAMBlocks
2466  *
2467  * Utility for the outgoing postcopy code.
2468  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2469  *   passing it bitmap indexes and name.
2470  * (qemu_ram_foreach_block ends up passing unscaled lengths
2471  *  which would mean postcopy code would have to deal with target page)
2472  *
2473  * @ms: current migration state
2474  */
2475 static void postcopy_each_ram_send_discard(MigrationState *ms)
2476 {
2477     struct RAMBlock *block;
2478
2479     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2480         postcopy_discard_send_init(ms, block->idstr);
2481
2482         /*
2483          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2484          * host-page size chunks, mark any partially dirty host-page size
2485          * chunks as all dirty.  In this case the host-page is the host-page
2486          * for the particular RAMBlock, i.e. it might be a huge page.
2487          */
2488         postcopy_chunk_hostpages_pass(ms, block);
2489
2490         /*
2491          * Postcopy sends chunks of bitmap over the wire, but it
2492          * just needs indexes at this point, avoids it having
2493          * target page specific code.
2494          */
2495         postcopy_send_discard_bm_ram(ms, block);
2496         postcopy_discard_send_finish(ms);
2497     }
2498 }
2499
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the block's dirty bitmap
 * accordingly: whenever a run of dirty pages starts or ends in the middle
 * of a host page, every target page of that host page is marked dirty, and
 * the dirty page count in the RAM state grows by the number of pages that
 * were not dirty before.
 *
 * Nothing is done for blocks whose page size equals the target page size.
 *
 * @ms: current migration state (unused here)
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * A run that starts on a host page boundary is fine at its start,
         * so move on to its end. A run that starts in the middle of a host
         * page skips this step and is fixed up right below.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        /* run_start is now either the unaligned start or end of a run */
        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            /* First target page of the host page that needs the fixup */
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Continue the search behind the host page being fixed up */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
2566
/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Transmit the set of pages to be discarded after precopy to the target.
 * These are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * The dirty bitmap is synced one more time beforehand and the scan position
 * kept in the RAM state is reset.
 *
 * @ms: current migration state
 */
void ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    /* Walk all RAM blocks and send their discard ranges */
    postcopy_each_ram_send_discard(ms);

    trace_ram_postcopy_send_discard_bitmap();
}
2598
2599 /**
2600  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2601  *
2602  * Returns zero on success
2603  *
2604  * @rbname: name of the RAMBlock of the request. NULL means the
2605  *          same that last one.
2606  * @start: RAMBlock starting page
2607  * @length: RAMBlock size
2608  */
2609 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2610 {
2611     trace_ram_discard_range(rbname, start, length);
2612
2613     RCU_READ_LOCK_GUARD();
2614     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2615
2616     if (!rb) {
2617         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2618         return -1;
2619     }
2620
2621     /*
2622      * On source VM, we don't need to update the received bitmap since
2623      * we don't even have one.
2624      */
2625     if (rb->receivedmap) {
2626         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2627                      length >> qemu_target_page_bits());
2628     }
2629
2630     return ram_block_discard_range(rb, start, length);
2631 }
2632
/*
 * xbzrle_init: allocate the XBZRLE cache and its helper buffers
 *
 * For every allocation, we will try not to crash the VM if the
 * allocation failed: the g_try_* allocators are used and a failure
 * unwinds whatever has been set up so far.
 *
 * All allocations happen with the XBZRLE cache lock held.
 *
 * Returns 0 on success, and also when xbzrle is not in use at all;
 * -ENOMEM if any of the allocations or the cache creation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    /* Nothing to set up when xbzrle is not in use */
    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        /* cache_init filled in local_err; report it */
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

    /*
     * Error unwinding: each label releases one resource and falls through
     * to the next, undoing the allocations in reverse order.
     */
free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
2689
2690 static int ram_state_init(RAMState **rsp)
2691 {
2692     *rsp = g_try_new0(RAMState, 1);
2693
2694     if (!*rsp) {
2695         error_report("%s: Init ramstate fail", __func__);
2696         return -1;
2697     }
2698
2699     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2700     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2701     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2702
2703     /*
2704      * Count the total number of pages used by ram blocks not including any
2705      * gaps due to alignment or unplugs.
2706      * This must match with the initial values of dirty bitmap.
2707      */
2708     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2709     ram_state_reset(*rsp);
2710
2711     return 0;
2712 }
2713
2714 static void ram_list_init_bitmaps(void)
2715 {
2716     MigrationState *ms = migrate_get_current();
2717     RAMBlock *block;
2718     unsigned long pages;
2719     uint8_t shift;
2720
2721     /* Skip setting bitmap if there is no RAM */
2722     if (ram_bytes_total()) {
2723         shift = ms->clear_bitmap_shift;
2724         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2725             error_report("clear_bitmap_shift (%u) too big, using "
2726                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2727             shift = CLEAR_BITMAP_SHIFT_MAX;
2728         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2729             error_report("clear_bitmap_shift (%u) too small, using "
2730                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2731             shift = CLEAR_BITMAP_SHIFT_MIN;
2732         }
2733
2734         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2735             pages = block->max_length >> TARGET_PAGE_BITS;
2736             /*
2737              * The initial dirty bitmap for migration must be set with all
2738              * ones to make sure we'll migrate every guest RAM page to
2739              * destination.
2740              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2741              * new migration after a failed migration, ram_list.
2742              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2743              * guest memory.
2744              */
2745             block->bmap = bitmap_new(pages);
2746             bitmap_set(block->bmap, 0, pages);
2747             block->clear_bmap_shift = shift;
2748             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2749         }
2750     }
2751 }
2752
2753 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2754 {
2755     unsigned long pages;
2756     RAMBlock *rb;
2757
2758     RCU_READ_LOCK_GUARD();
2759
2760     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2761             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2762             rs->migration_dirty_pages -= pages;
2763     }
2764 }
2765
2766 static void ram_init_bitmaps(RAMState *rs)
2767 {
2768     /* For memory_global_dirty_log_start below.  */
2769     qemu_mutex_lock_iothread();
2770     qemu_mutex_lock_ramlist();
2771
2772     WITH_RCU_READ_LOCK_GUARD() {
2773         ram_list_init_bitmaps();
2774         /* We don't use dirty log with background snapshots */
2775         if (!migrate_background_snapshot()) {
2776             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2777             migration_bitmap_sync_precopy(rs);
2778         }
2779     }
2780     qemu_mutex_unlock_ramlist();
2781     qemu_mutex_unlock_iothread();
2782
2783     /*
2784      * After an eventual first bitmap sync, fixup the initial bitmap
2785      * containing all 1s to exclude any discarded pages from migration.
2786      */
2787     migration_bitmap_clear_discarded_pages(rs);
2788 }
2789
2790 static int ram_init_all(RAMState **rsp)
2791 {
2792     if (ram_state_init(rsp)) {
2793         return -1;
2794     }
2795
2796     if (xbzrle_init()) {
2797         ram_state_cleanup(rsp);
2798         return -1;
2799     }
2800
2801     ram_init_bitmaps(*rsp);
2802
2803     return 0;
2804 }
2805
2806 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2807 {
2808     RAMBlock *block;
2809     uint64_t pages = 0;
2810
2811     /*
2812      * Postcopy is not using xbzrle/compression, so no need for that.
2813      * Also, since source are already halted, we don't need to care
2814      * about dirty page logging as well.
2815      */
2816
2817     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2818         pages += bitmap_count_one(block->bmap,
2819                                   block->used_length >> TARGET_PAGE_BITS);
2820     }
2821
2822     /* This may not be aligned with current bitmaps. Recalculate. */
2823     rs->migration_dirty_pages = pages;
2824
2825     ram_state_reset(rs);
2826
2827     /* Update RAMState cache of output QEMUFile */
2828     rs->f = out;
2829
2830     trace_ram_state_resume_prepare(pages);
2831 }
2832
2833 /*
2834  * This function clears bits of the free pages reported by the caller from the
2835  * migration dirty bitmap. @addr is the host address corresponding to the
2836  * start of the continuous guest free pages, and @len is the total bytes of
2837  * those pages.
2838  */
2839 void qemu_guest_free_page_hint(void *addr, size_t len)
2840 {
2841     RAMBlock *block;
2842     ram_addr_t offset;
2843     size_t used_len, start, npages;
2844     MigrationState *s = migrate_get_current();
2845
2846     /* This function is currently expected to be used during live migration */
2847     if (!migration_is_setup_or_active(s->state)) {
2848         return;
2849     }
2850
2851     for (; len > 0; len -= used_len, addr += used_len) {
2852         block = qemu_ram_block_from_host(addr, false, &offset);
2853         if (unlikely(!block || offset >= block->used_length)) {
2854             /*
2855              * The implementation might not support RAMBlock resize during
2856              * live migration, but it could happen in theory with future
2857              * updates. So we add a check here to capture that case.
2858              */
2859             error_report_once("%s unexpected error", __func__);
2860             return;
2861         }
2862
2863         if (len <= block->used_length - offset) {
2864             used_len = len;
2865         } else {
2866             used_len = block->used_length - offset;
2867         }
2868
2869         start = offset >> TARGET_PAGE_BITS;
2870         npages = used_len >> TARGET_PAGE_BITS;
2871
2872         qemu_mutex_lock(&ram_state->bitmap_mutex);
2873         /*
2874          * The skipped free pages are equavalent to be sent from clear_bmap's
2875          * perspective, so clear the bits from the memory region bitmap which
2876          * are initially set. Otherwise those skipped pages will be sent in
2877          * the next round after syncing from the memory region bitmap.
2878          */
2879         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2880         ram_state->migration_dirty_pages -=
2881                       bitmap_count_one_with_offset(block->bmap, start, npages);
2882         bitmap_clear(block->bmap, start, npages);
2883         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2884     }
2885 }
2886
2887 /*
2888  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2889  * long-running RCU critical section.  When rcu-reclaims in the code
2890  * start to become numerous it will be necessary to reduce the
2891  * granularity of these critical sections.
2892  */
2893
2894 /**
2895  * ram_save_setup: Setup RAM for migration
2896  *
2897  * Returns zero to indicate success and negative for error
2898  *
2899  * @f: QEMUFile where to send the data
2900  * @opaque: RAMState pointer
2901  */
2902 static int ram_save_setup(QEMUFile *f, void *opaque)
2903 {
2904     RAMState **rsp = opaque;
2905     RAMBlock *block;
2906
2907     if (compress_threads_save_setup()) {
2908         return -1;
2909     }
2910
2911     /* migration has already setup the bitmap, reuse it. */
2912     if (!migration_in_colo_state()) {
2913         if (ram_init_all(rsp) != 0) {
2914             compress_threads_save_cleanup();
2915             return -1;
2916         }
2917     }
2918     (*rsp)->f = f;
2919
2920     WITH_RCU_READ_LOCK_GUARD() {
2921         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2922
2923         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2924             qemu_put_byte(f, strlen(block->idstr));
2925             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2926             qemu_put_be64(f, block->used_length);
2927             if (migrate_postcopy_ram() && block->page_size !=
2928                                           qemu_host_page_size) {
2929                 qemu_put_be64(f, block->page_size);
2930             }
2931             if (migrate_ignore_shared()) {
2932                 qemu_put_be64(f, block->mr->addr);
2933             }
2934         }
2935     }
2936
2937     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2938     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2939
2940     multifd_send_sync_main(f);
2941     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2942     qemu_fflush(f);
2943
2944     return 0;
2945 }
2946
2947 /**
2948  * ram_save_iterate: iterative stage for migration
2949  *
2950  * Returns zero to indicate success and negative for error
2951  *
2952  * @f: QEMUFile where to send the data
2953  * @opaque: RAMState pointer
2954  */
2955 static int ram_save_iterate(QEMUFile *f, void *opaque)
2956 {
2957     RAMState **temp = opaque;
2958     RAMState *rs = *temp;
2959     int ret = 0;
2960     int i;
2961     int64_t t0;
2962     int done = 0;
2963
2964     if (blk_mig_bulk_active()) {
2965         /* Avoid transferring ram during bulk phase of block migration as
2966          * the bulk phase will usually take a long time and transferring
2967          * ram updates during that time is pointless. */
2968         goto out;
2969     }
2970
2971     /*
2972      * We'll take this lock a little bit long, but it's okay for two reasons.
2973      * Firstly, the only possible other thread to take it is who calls
2974      * qemu_guest_free_page_hint(), which should be rare; secondly, see
2975      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2976      * guarantees that we'll at least released it in a regular basis.
2977      */
2978     qemu_mutex_lock(&rs->bitmap_mutex);
2979     WITH_RCU_READ_LOCK_GUARD() {
2980         if (ram_list.version != rs->last_version) {
2981             ram_state_reset(rs);
2982         }
2983
2984         /* Read version before ram_list.blocks */
2985         smp_rmb();
2986
2987         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2988
2989         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2990         i = 0;
2991         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2992                postcopy_has_request(rs)) {
2993             int pages;
2994
2995             if (qemu_file_get_error(f)) {
2996                 break;
2997             }
2998
2999             pages = ram_find_and_save_block(rs);
3000             /* no more pages to sent */
3001             if (pages == 0) {
3002                 done = 1;
3003                 break;
3004             }
3005
3006             if (pages < 0) {
3007                 qemu_file_set_error(f, pages);
3008                 break;
3009             }
3010
3011             rs->target_page_count += pages;
3012
3013             /*
3014              * During postcopy, it is necessary to make sure one whole host
3015              * page is sent in one chunk.
3016              */
3017             if (migrate_postcopy_ram()) {
3018                 flush_compressed_data(rs);
3019             }
3020
3021             /*
3022              * we want to check in the 1st loop, just in case it was the 1st
3023              * time and we had to sync the dirty bitmap.
3024              * qemu_clock_get_ns() is a bit expensive, so we only check each
3025              * some iterations
3026              */
3027             if ((i & 63) == 0) {
3028                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3029                               1000000;
3030                 if (t1 > MAX_WAIT) {
3031                     trace_ram_save_iterate_big_wait(t1, i);
3032                     break;
3033                 }
3034             }
3035             i++;
3036         }
3037     }
3038     qemu_mutex_unlock(&rs->bitmap_mutex);
3039
3040     /*
3041      * Must occur before EOS (or any QEMUFile operation)
3042      * because of RDMA protocol.
3043      */
3044     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3045
3046 out:
3047     if (ret >= 0
3048         && migration_is_setup_or_active(migrate_get_current()->state)) {
3049         multifd_send_sync_main(rs->f);
3050         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3051         qemu_fflush(f);
3052         ram_transferred_add(8);
3053
3054         ret = qemu_file_get_error(f);
3055     }
3056     if (ret < 0) {
3057         return ret;
3058     }
3059
3060     return done;
3061 }
3062
3063 /**
3064  * ram_save_complete: function called to send the remaining amount of ram
3065  *
3066  * Returns zero to indicate success or negative on error
3067  *
3068  * Called with iothread lock
3069  *
3070  * @f: QEMUFile where to send the data
3071  * @opaque: RAMState pointer
3072  */
3073 static int ram_save_complete(QEMUFile *f, void *opaque)
3074 {
3075     RAMState **temp = opaque;
3076     RAMState *rs = *temp;
3077     int ret = 0;
3078
3079     rs->last_stage = !migration_in_colo_state();
3080
3081     WITH_RCU_READ_LOCK_GUARD() {
3082         if (!migration_in_postcopy()) {
3083             migration_bitmap_sync_precopy(rs);
3084         }
3085
3086         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3087
3088         /* try transferring iterative blocks of memory */
3089
3090         /* flush all remaining blocks regardless of rate limiting */
3091         while (true) {
3092             int pages;
3093
3094             pages = ram_find_and_save_block(rs);
3095             /* no more blocks to sent */
3096             if (pages == 0) {
3097                 break;
3098             }
3099             if (pages < 0) {
3100                 ret = pages;
3101                 break;
3102             }
3103         }
3104
3105         flush_compressed_data(rs);
3106         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3107     }
3108
3109     if (ret >= 0) {
3110         multifd_send_sync_main(rs->f);
3111         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3112         qemu_fflush(f);
3113     }
3114
3115     return ret;
3116 }
3117
3118 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3119                              uint64_t *res_precopy_only,
3120                              uint64_t *res_compatible,
3121                              uint64_t *res_postcopy_only)
3122 {
3123     RAMState **temp = opaque;
3124     RAMState *rs = *temp;
3125     uint64_t remaining_size;
3126
3127     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3128
3129     if (!migration_in_postcopy() &&
3130         remaining_size < max_size) {
3131         qemu_mutex_lock_iothread();
3132         WITH_RCU_READ_LOCK_GUARD() {
3133             migration_bitmap_sync_precopy(rs);
3134         }
3135         qemu_mutex_unlock_iothread();
3136         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3137     }
3138
3139     if (migrate_postcopy_ram()) {
3140         /* We can do postcopy, and all the data is postcopiable */
3141         *res_compatible += remaining_size;
3142     } else {
3143         *res_precopy_only += remaining_size;
3144     }
3145 }
3146
3147 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3148 {
3149     unsigned int xh_len;
3150     int xh_flags;
3151     uint8_t *loaded_data;
3152
3153     /* extract RLE header */
3154     xh_flags = qemu_get_byte(f);
3155     xh_len = qemu_get_be16(f);
3156
3157     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3158         error_report("Failed to load XBZRLE page - wrong compression!");
3159         return -1;
3160     }
3161
3162     if (xh_len > TARGET_PAGE_SIZE) {
3163         error_report("Failed to load XBZRLE page - len overflow!");
3164         return -1;
3165     }
3166     loaded_data = XBZRLE.decoded_buf;
3167     /* load data and decode */
3168     /* it can change loaded_data to point to an internal buffer */
3169     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3170
3171     /* decode RLE */
3172     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3173                              TARGET_PAGE_SIZE) == -1) {
3174         error_report("Failed to load XBZRLE page - decode error!");
3175         return -1;
3176     }
3177
3178     return 0;
3179 }
3180
3181 /**
3182  * ram_block_from_stream: read a RAMBlock id from the migration stream
3183  *
3184  * Must be called from within a rcu critical section.
3185  *
3186  * Returns a pointer from within the RCU-protected ram_list.
3187  *
3188  * @mis: the migration incoming state pointer
3189  * @f: QEMUFile where to read the data from
3190  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3191  */
3192 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3193                                               QEMUFile *f, int flags)
3194 {
3195     RAMBlock *block = mis->last_recv_block;
3196     char id[256];
3197     uint8_t len;
3198
3199     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3200         if (!block) {
3201             error_report("Ack, bad migration stream!");
3202             return NULL;
3203         }
3204         return block;
3205     }
3206
3207     len = qemu_get_byte(f);
3208     qemu_get_buffer(f, (uint8_t *)id, len);
3209     id[len] = 0;
3210
3211     block = qemu_ram_block_by_name(id);
3212     if (!block) {
3213         error_report("Can't find block %s", id);
3214         return NULL;
3215     }
3216
3217     if (ramblock_is_ignored(block)) {
3218         error_report("block %s should not be migrated !", id);
3219         return NULL;
3220     }
3221
3222     mis->last_recv_block = block;
3223
3224     return block;
3225 }
3226
3227 static inline void *host_from_ram_block_offset(RAMBlock *block,
3228                                                ram_addr_t offset)
3229 {
3230     if (!offset_in_ramblock(block, offset)) {
3231         return NULL;
3232     }
3233
3234     return block->host + offset;
3235 }
3236
3237 static void *host_page_from_ram_block_offset(RAMBlock *block,
3238                                              ram_addr_t offset)
3239 {
3240     /* Note: Explicitly no check against offset_in_ramblock(). */
3241     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3242                                    block->page_size);
3243 }
3244
3245 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3246                                                          ram_addr_t offset)
3247 {
3248     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3249 }
3250
3251 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3252                              ram_addr_t offset, bool record_bitmap)
3253 {
3254     if (!offset_in_ramblock(block, offset)) {
3255         return NULL;
3256     }
3257     if (!block->colo_cache) {
3258         error_report("%s: colo_cache is NULL in block :%s",
3259                      __func__, block->idstr);
3260         return NULL;
3261     }
3262
3263     /*
3264     * During colo checkpoint, we need bitmap of these migrated pages.
3265     * It help us to decide which pages in ram cache should be flushed
3266     * into VM's RAM later.
3267     */
3268     if (record_bitmap &&
3269         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3270         ram_state->migration_dirty_pages++;
3271     }
3272     return block->colo_cache + offset;
3273 }
3274
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  Memory that already reads as all
 * zeroes is left alone when @ch is zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write when a zero fill hits an already zero buffer */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3291
3292 /* return the size after decompression, or negative value on error */
3293 static int
3294 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3295                      const uint8_t *source, size_t source_len)
3296 {
3297     int err;
3298
3299     err = inflateReset(stream);
3300     if (err != Z_OK) {
3301         return -1;
3302     }
3303
3304     stream->avail_in = source_len;
3305     stream->next_in = (uint8_t *)source;
3306     stream->avail_out = dest_len;
3307     stream->next_out = dest;
3308
3309     err = inflate(stream, Z_NO_FLUSH);
3310     if (err != Z_STREAM_END) {
3311         return -1;
3312     }
3313
3314     return stream->total_out;
3315 }
3316
3317 static void *do_data_decompress(void *opaque)
3318 {
3319     DecompressParam *param = opaque;
3320     unsigned long pagesize;
3321     uint8_t *des;
3322     int len, ret;
3323
3324     qemu_mutex_lock(&param->mutex);
3325     while (!param->quit) {
3326         if (param->des) {
3327             des = param->des;
3328             len = param->len;
3329             param->des = 0;
3330             qemu_mutex_unlock(&param->mutex);
3331
3332             pagesize = TARGET_PAGE_SIZE;
3333
3334             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3335                                        param->compbuf, len);
3336             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3337                 error_report("decompress data failed");
3338                 qemu_file_set_error(decomp_file, ret);
3339             }
3340
3341             qemu_mutex_lock(&decomp_done_lock);
3342             param->done = true;
3343             qemu_cond_signal(&decomp_done_cond);
3344             qemu_mutex_unlock(&decomp_done_lock);
3345
3346             qemu_mutex_lock(&param->mutex);
3347         } else {
3348             qemu_cond_wait(&param->cond, &param->mutex);
3349         }
3350     }
3351     qemu_mutex_unlock(&param->mutex);
3352
3353     return NULL;
3354 }
3355
3356 static int wait_for_decompress_done(void)
3357 {
3358     int idx, thread_count;
3359
3360     if (!migrate_use_compression()) {
3361         return 0;
3362     }
3363
3364     thread_count = migrate_decompress_threads();
3365     qemu_mutex_lock(&decomp_done_lock);
3366     for (idx = 0; idx < thread_count; idx++) {
3367         while (!decomp_param[idx].done) {
3368             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3369         }
3370     }
3371     qemu_mutex_unlock(&decomp_done_lock);
3372     return qemu_file_get_error(decomp_file);
3373 }
3374
3375 static void compress_threads_load_cleanup(void)
3376 {
3377     int i, thread_count;
3378
3379     if (!migrate_use_compression()) {
3380         return;
3381     }
3382     thread_count = migrate_decompress_threads();
3383     for (i = 0; i < thread_count; i++) {
3384         /*
3385          * we use it as a indicator which shows if the thread is
3386          * properly init'd or not
3387          */
3388         if (!decomp_param[i].compbuf) {
3389             break;
3390         }
3391
3392         qemu_mutex_lock(&decomp_param[i].mutex);
3393         decomp_param[i].quit = true;
3394         qemu_cond_signal(&decomp_param[i].cond);
3395         qemu_mutex_unlock(&decomp_param[i].mutex);
3396     }
3397     for (i = 0; i < thread_count; i++) {
3398         if (!decomp_param[i].compbuf) {
3399             break;
3400         }
3401
3402         qemu_thread_join(decompress_threads + i);
3403         qemu_mutex_destroy(&decomp_param[i].mutex);
3404         qemu_cond_destroy(&decomp_param[i].cond);
3405         inflateEnd(&decomp_param[i].stream);
3406         g_free(decomp_param[i].compbuf);
3407         decomp_param[i].compbuf = NULL;
3408     }
3409     g_free(decompress_threads);
3410     g_free(decomp_param);
3411     decompress_threads = NULL;
3412     decomp_param = NULL;
3413     decomp_file = NULL;
3414 }
3415
3416 static int compress_threads_load_setup(QEMUFile *f)
3417 {
3418     int i, thread_count;
3419
3420     if (!migrate_use_compression()) {
3421         return 0;
3422     }
3423
3424     thread_count = migrate_decompress_threads();
3425     decompress_threads = g_new0(QemuThread, thread_count);
3426     decomp_param = g_new0(DecompressParam, thread_count);
3427     qemu_mutex_init(&decomp_done_lock);
3428     qemu_cond_init(&decomp_done_cond);
3429     decomp_file = f;
3430     for (i = 0; i < thread_count; i++) {
3431         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3432             goto exit;
3433         }
3434
3435         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3436         qemu_mutex_init(&decomp_param[i].mutex);
3437         qemu_cond_init(&decomp_param[i].cond);
3438         decomp_param[i].done = true;
3439         decomp_param[i].quit = false;
3440         qemu_thread_create(decompress_threads + i, "decompress",
3441                            do_data_decompress, decomp_param + i,
3442                            QEMU_THREAD_JOINABLE);
3443     }
3444     return 0;
3445 exit:
3446     compress_threads_load_cleanup();
3447     return -1;
3448 }
3449
3450 static void decompress_data_with_multi_threads(QEMUFile *f,
3451                                                void *host, int len)
3452 {
3453     int idx, thread_count;
3454
3455     thread_count = migrate_decompress_threads();
3456     QEMU_LOCK_GUARD(&decomp_done_lock);
3457     while (true) {
3458         for (idx = 0; idx < thread_count; idx++) {
3459             if (decomp_param[idx].done) {
3460                 decomp_param[idx].done = false;
3461                 qemu_mutex_lock(&decomp_param[idx].mutex);
3462                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3463                 decomp_param[idx].des = host;
3464                 decomp_param[idx].len = len;
3465                 qemu_cond_signal(&decomp_param[idx].cond);
3466                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3467                 break;
3468             }
3469         }
3470         if (idx < thread_count) {
3471             break;
3472         } else {
3473             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3474         }
3475     }
3476 }
3477
3478 static void colo_init_ram_state(void)
3479 {
3480     ram_state_init(&ram_state);
3481 }
3482
3483 /*
3484  * colo cache: this is for secondary VM, we cache the whole
3485  * memory of the secondary VM, it is need to hold the global lock
3486  * to call this helper.
3487  */
3488 int colo_init_ram_cache(void)
3489 {
3490     RAMBlock *block;
3491
3492     WITH_RCU_READ_LOCK_GUARD() {
3493         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3494             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3495                                                     NULL, false, false);
3496             if (!block->colo_cache) {
3497                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3498                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3499                              block->used_length);
3500                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3501                     if (block->colo_cache) {
3502                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3503                         block->colo_cache = NULL;
3504                     }
3505                 }
3506                 return -errno;
3507             }
3508             if (!machine_dump_guest_core(current_machine)) {
3509                 qemu_madvise(block->colo_cache, block->used_length,
3510                              QEMU_MADV_DONTDUMP);
3511             }
3512         }
3513     }
3514
3515     /*
3516     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3517     * with to decide which page in cache should be flushed into SVM's RAM. Here
3518     * we use the same name 'ram_bitmap' as for migration.
3519     */
3520     if (ram_bytes_total()) {
3521         RAMBlock *block;
3522
3523         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3524             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3525             block->bmap = bitmap_new(pages);
3526         }
3527     }
3528
3529     colo_init_ram_state();
3530     return 0;
3531 }
3532
3533 /* TODO: duplicated with ram_init_bitmaps */
3534 void colo_incoming_start_dirty_log(void)
3535 {
3536     RAMBlock *block = NULL;
3537     /* For memory_global_dirty_log_start below. */
3538     qemu_mutex_lock_iothread();
3539     qemu_mutex_lock_ramlist();
3540
3541     memory_global_dirty_log_sync();
3542     WITH_RCU_READ_LOCK_GUARD() {
3543         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3544             ramblock_sync_dirty_bitmap(ram_state, block);
3545             /* Discard this dirty bitmap record */
3546             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3547         }
3548         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3549     }
3550     ram_state->migration_dirty_pages = 0;
3551     qemu_mutex_unlock_ramlist();
3552     qemu_mutex_unlock_iothread();
3553 }
3554
3555 /* It is need to hold the global lock to call this helper */
3556 void colo_release_ram_cache(void)
3557 {
3558     RAMBlock *block;
3559
3560     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3561     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3562         g_free(block->bmap);
3563         block->bmap = NULL;
3564     }
3565
3566     WITH_RCU_READ_LOCK_GUARD() {
3567         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3568             if (block->colo_cache) {
3569                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3570                 block->colo_cache = NULL;
3571             }
3572         }
3573     }
3574     ram_state_cleanup(&ram_state);
3575 }
3576
3577 /**
3578  * ram_load_setup: Setup RAM for migration incoming side
3579  *
3580  * Returns zero to indicate success and negative for error
3581  *
3582  * @f: QEMUFile where to receive the data
3583  * @opaque: RAMState pointer
3584  */
3585 static int ram_load_setup(QEMUFile *f, void *opaque)
3586 {
3587     if (compress_threads_load_setup(f)) {
3588         return -1;
3589     }
3590
3591     xbzrle_load_setup();
3592     ramblock_recv_map_init();
3593
3594     return 0;
3595 }
3596
3597 static int ram_load_cleanup(void *opaque)
3598 {
3599     RAMBlock *rb;
3600
3601     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3602         qemu_ram_block_writeback(rb);
3603     }
3604
3605     xbzrle_load_cleanup();
3606     compress_threads_load_cleanup();
3607
3608     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3609         g_free(rb->receivedmap);
3610         rb->receivedmap = NULL;
3611     }
3612
3613     return 0;
3614 }
3615
3616 /**
3617  * ram_postcopy_incoming_init: allocate postcopy data structures
3618  *
3619  * Returns 0 for success and negative if there was one error
3620  *
3621  * @mis: current migration incoming state
3622  *
3623  * Allocate data structures etc needed by incoming migration with
3624  * postcopy-ram. postcopy-ram's similarly names
3625  * postcopy_ram_incoming_init does the work.
3626  */
3627 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3628 {
3629     return postcopy_ram_incoming_init(mis);
3630 }
3631
3632 /**
3633  * ram_load_postcopy: load a page in postcopy case
3634  *
3635  * Returns 0 for success or -errno in case of error
3636  *
3637  * Called in postcopy mode by ram_load().
3638  * rcu_read_lock is taken prior to this being called.
3639  *
3640  * @f: QEMUFile where to send the data
3641  */
3642 static int ram_load_postcopy(QEMUFile *f)
3643 {
3644     int flags = 0, ret = 0;
3645     bool place_needed = false;
3646     bool matches_target_page_size = false;
3647     MigrationIncomingState *mis = migration_incoming_get_current();
3648     /* Currently we only use channel 0.  TODO: use all the channels */
3649     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0];
3650
3651     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3652         ram_addr_t addr;
3653         void *page_buffer = NULL;
3654         void *place_source = NULL;
3655         RAMBlock *block = NULL;
3656         uint8_t ch;
3657         int len;
3658
3659         addr = qemu_get_be64(f);
3660
3661         /*
3662          * If qemu file error, we should stop here, and then "addr"
3663          * may be invalid
3664          */
3665         ret = qemu_file_get_error(f);
3666         if (ret) {
3667             break;
3668         }
3669
3670         flags = addr & ~TARGET_PAGE_MASK;
3671         addr &= TARGET_PAGE_MASK;
3672
3673         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3674         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3675                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3676             block = ram_block_from_stream(mis, f, flags);
3677             if (!block) {
3678                 ret = -EINVAL;
3679                 break;
3680             }
3681
3682             /*
3683              * Relying on used_length is racy and can result in false positives.
3684              * We might place pages beyond used_length in case RAM was shrunk
3685              * while in postcopy, which is fine - trying to place via
3686              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3687              */
3688             if (!block->host || addr >= block->postcopy_length) {
3689                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3690                 ret = -EINVAL;
3691                 break;
3692             }
3693             tmp_page->target_pages++;
3694             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3695             /*
3696              * Postcopy requires that we place whole host pages atomically;
3697              * these may be huge pages for RAMBlocks that are backed by
3698              * hugetlbfs.
3699              * To make it atomic, the data is read into a temporary page
3700              * that's moved into place later.
3701              * The migration protocol uses,  possibly smaller, target-pages
3702              * however the source ensures it always sends all the components
3703              * of a host page in one chunk.
3704              */
3705             page_buffer = tmp_page->tmp_huge_page +
3706                           host_page_offset_from_ram_block_offset(block, addr);
3707             /* If all TP are zero then we can optimise the place */
3708             if (tmp_page->target_pages == 1) {
3709                 tmp_page->host_addr =
3710                     host_page_from_ram_block_offset(block, addr);
3711             } else if (tmp_page->host_addr !=
3712                        host_page_from_ram_block_offset(block, addr)) {
3713                 /* not the 1st TP within the HP */
3714                 error_report("Non-same host page detected.  "
3715                              "Target host page %p, received host page %p "
3716                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3717                              tmp_page->host_addr,
3718                              host_page_from_ram_block_offset(block, addr),
3719                              block->idstr, addr, tmp_page->target_pages);
3720                 ret = -EINVAL;
3721                 break;
3722             }
3723
3724             /*
3725              * If it's the last part of a host page then we place the host
3726              * page
3727              */
3728             if (tmp_page->target_pages ==
3729                 (block->page_size / TARGET_PAGE_SIZE)) {
3730                 place_needed = true;
3731             }
3732             place_source = tmp_page->tmp_huge_page;
3733         }
3734
3735         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3736         case RAM_SAVE_FLAG_ZERO:
3737             ch = qemu_get_byte(f);
3738             /*
3739              * Can skip to set page_buffer when
3740              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3741              */
3742             if (ch || !matches_target_page_size) {
3743                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3744             }
3745             if (ch) {
3746                 tmp_page->all_zero = false;
3747             }
3748             break;
3749
3750         case RAM_SAVE_FLAG_PAGE:
3751             tmp_page->all_zero = false;
3752             if (!matches_target_page_size) {
3753                 /* For huge pages, we always use temporary buffer */
3754                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3755             } else {
3756                 /*
3757                  * For small pages that matches target page size, we
3758                  * avoid the qemu_file copy.  Instead we directly use
3759                  * the buffer of QEMUFile to place the page.  Note: we
3760                  * cannot do any QEMUFile operation before using that
3761                  * buffer to make sure the buffer is valid when
3762                  * placing the page.
3763                  */
3764                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3765                                          TARGET_PAGE_SIZE);
3766             }
3767             break;
3768         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3769             tmp_page->all_zero = false;
3770             len = qemu_get_be32(f);
3771             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3772                 error_report("Invalid compressed data length: %d", len);
3773                 ret = -EINVAL;
3774                 break;
3775             }
3776             decompress_data_with_multi_threads(f, page_buffer, len);
3777             break;
3778
3779         case RAM_SAVE_FLAG_EOS:
3780             /* normal exit */
3781             multifd_recv_sync_main();
3782             break;
3783         default:
3784             error_report("Unknown combination of migration flags: 0x%x"
3785                          " (postcopy mode)", flags);
3786             ret = -EINVAL;
3787             break;
3788         }
3789
3790         /* Got the whole host page, wait for decompress before placing. */
3791         if (place_needed) {
3792             ret |= wait_for_decompress_done();
3793         }
3794
3795         /* Detect for any possible file errors */
3796         if (!ret && qemu_file_get_error(f)) {
3797             ret = qemu_file_get_error(f);
3798         }
3799
3800         if (!ret && place_needed) {
3801             if (tmp_page->all_zero) {
3802                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3803             } else {
3804                 ret = postcopy_place_page(mis, tmp_page->host_addr,
3805                                           place_source, block);
3806             }
3807             place_needed = false;
3808             postcopy_temp_page_reset(tmp_page);
3809         }
3810     }
3811
3812     return ret;
3813 }
3814
3815 static bool postcopy_is_advised(void)
3816 {
3817     PostcopyState ps = postcopy_state_get();
3818     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3819 }
3820
3821 static bool postcopy_is_running(void)
3822 {
3823     PostcopyState ps = postcopy_state_get();
3824     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3825 }
3826
3827 /*
3828  * Flush content of RAM cache into SVM's memory.
3829  * Only flush the pages that be dirtied by PVM or SVM or both.
3830  */
3831 void colo_flush_ram_cache(void)
3832 {
3833     RAMBlock *block = NULL;
3834     void *dst_host;
3835     void *src_host;
3836     unsigned long offset = 0;
3837
3838     memory_global_dirty_log_sync();
3839     WITH_RCU_READ_LOCK_GUARD() {
3840         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3841             ramblock_sync_dirty_bitmap(ram_state, block);
3842         }
3843     }
3844
3845     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3846     WITH_RCU_READ_LOCK_GUARD() {
3847         block = QLIST_FIRST_RCU(&ram_list.blocks);
3848
3849         while (block) {
3850             unsigned long num = 0;
3851
3852             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3853             if (!offset_in_ramblock(block,
3854                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3855                 offset = 0;
3856                 num = 0;
3857                 block = QLIST_NEXT_RCU(block, next);
3858             } else {
3859                 unsigned long i = 0;
3860
3861                 for (i = 0; i < num; i++) {
3862                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3863                 }
3864                 dst_host = block->host
3865                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3866                 src_host = block->colo_cache
3867                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3868                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3869                 offset += num;
3870             }
3871         }
3872     }
3873     trace_colo_flush_ram_cache_end();
3874 }
3875
3876 /**
3877  * ram_load_precopy: load pages in precopy case
3878  *
3879  * Returns 0 for success or -errno in case of error
3880  *
3881  * Called in precopy mode by ram_load().
3882  * rcu_read_lock is taken prior to this being called.
3883  *
3884  * @f: QEMUFile where to send the data
3885  */
3886 static int ram_load_precopy(QEMUFile *f)
3887 {
3888     MigrationIncomingState *mis = migration_incoming_get_current();
3889     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3890     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3891     bool postcopy_advised = postcopy_is_advised();
3892     if (!migrate_use_compression()) {
3893         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3894     }
3895
3896     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3897         ram_addr_t addr, total_ram_bytes;
3898         void *host = NULL, *host_bak = NULL;
3899         uint8_t ch;
3900
3901         /*
3902          * Yield periodically to let main loop run, but an iteration of
3903          * the main loop is expensive, so do it each some iterations
3904          */
3905         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3906             aio_co_schedule(qemu_get_current_aio_context(),
3907                             qemu_coroutine_self());
3908             qemu_coroutine_yield();
3909         }
3910         i++;
3911
3912         addr = qemu_get_be64(f);
3913         flags = addr & ~TARGET_PAGE_MASK;
3914         addr &= TARGET_PAGE_MASK;
3915
3916         if (flags & invalid_flags) {
3917             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3918                 error_report("Received an unexpected compressed page");
3919             }
3920
3921             ret = -EINVAL;
3922             break;
3923         }
3924
3925         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3926                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3927             RAMBlock *block = ram_block_from_stream(mis, f, flags);
3928
3929             host = host_from_ram_block_offset(block, addr);
3930             /*
3931              * After going into COLO stage, we should not load the page
3932              * into SVM's memory directly, we put them into colo_cache firstly.
3933              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3934              * Previously, we copied all these memory in preparing stage of COLO
3935              * while we need to stop VM, which is a time-consuming process.
3936              * Here we optimize it by a trick, back-up every page while in
3937              * migration process while COLO is enabled, though it affects the
3938              * speed of the migration, but it obviously reduce the downtime of
3939              * back-up all SVM'S memory in COLO preparing stage.
3940              */
3941             if (migration_incoming_colo_enabled()) {
3942                 if (migration_incoming_in_colo_state()) {
3943                     /* In COLO stage, put all pages into cache temporarily */
3944                     host = colo_cache_from_block_offset(block, addr, true);
3945                 } else {
3946                    /*
3947                     * In migration stage but before COLO stage,
3948                     * Put all pages into both cache and SVM's memory.
3949                     */
3950                     host_bak = colo_cache_from_block_offset(block, addr, false);
3951                 }
3952             }
3953             if (!host) {
3954                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3955                 ret = -EINVAL;
3956                 break;
3957             }
3958             if (!migration_incoming_in_colo_state()) {
3959                 ramblock_recv_bitmap_set(block, host);
3960             }
3961
3962             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3963         }
3964
3965         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3966         case RAM_SAVE_FLAG_MEM_SIZE:
3967             /* Synchronize RAM block list */
3968             total_ram_bytes = addr;
3969             while (!ret && total_ram_bytes) {
3970                 RAMBlock *block;
3971                 char id[256];
3972                 ram_addr_t length;
3973
3974                 len = qemu_get_byte(f);
3975                 qemu_get_buffer(f, (uint8_t *)id, len);
3976                 id[len] = 0;
3977                 length = qemu_get_be64(f);
3978
3979                 block = qemu_ram_block_by_name(id);
3980                 if (block && !qemu_ram_is_migratable(block)) {
3981                     error_report("block %s should not be migrated !", id);
3982                     ret = -EINVAL;
3983                 } else if (block) {
3984                     if (length != block->used_length) {
3985                         Error *local_err = NULL;
3986
3987                         ret = qemu_ram_resize(block, length,
3988                                               &local_err);
3989                         if (local_err) {
3990                             error_report_err(local_err);
3991                         }
3992                     }
3993                     /* For postcopy we need to check hugepage sizes match */
3994                     if (postcopy_advised && migrate_postcopy_ram() &&
3995                         block->page_size != qemu_host_page_size) {
3996                         uint64_t remote_page_size = qemu_get_be64(f);
3997                         if (remote_page_size != block->page_size) {
3998                             error_report("Mismatched RAM page size %s "
3999                                          "(local) %zd != %" PRId64,
4000                                          id, block->page_size,
4001                                          remote_page_size);
4002                             ret = -EINVAL;
4003                         }
4004                     }
4005                     if (migrate_ignore_shared()) {
4006                         hwaddr addr = qemu_get_be64(f);
4007                         if (ramblock_is_ignored(block) &&
4008                             block->mr->addr != addr) {
4009                             error_report("Mismatched GPAs for block %s "
4010                                          "%" PRId64 "!= %" PRId64,
4011                                          id, (uint64_t)addr,
4012                                          (uint64_t)block->mr->addr);
4013                             ret = -EINVAL;
4014                         }
4015                     }
4016                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4017                                           block->idstr);
4018                 } else {
4019                     error_report("Unknown ramblock \"%s\", cannot "
4020                                  "accept migration", id);
4021                     ret = -EINVAL;
4022                 }
4023
4024                 total_ram_bytes -= length;
4025             }
4026             break;
4027
4028         case RAM_SAVE_FLAG_ZERO:
4029             ch = qemu_get_byte(f);
4030             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4031             break;
4032
4033         case RAM_SAVE_FLAG_PAGE:
4034             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4035             break;
4036
4037         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4038             len = qemu_get_be32(f);
4039             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4040                 error_report("Invalid compressed data length: %d", len);
4041                 ret = -EINVAL;
4042                 break;
4043             }
4044             decompress_data_with_multi_threads(f, host, len);
4045             break;
4046
4047         case RAM_SAVE_FLAG_XBZRLE:
4048             if (load_xbzrle(f, addr, host) < 0) {
4049                 error_report("Failed to decompress XBZRLE page at "
4050                              RAM_ADDR_FMT, addr);
4051                 ret = -EINVAL;
4052                 break;
4053             }
4054             break;
4055         case RAM_SAVE_FLAG_EOS:
4056             /* normal exit */
4057             multifd_recv_sync_main();
4058             break;
4059         default:
4060             if (flags & RAM_SAVE_FLAG_HOOK) {
4061                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4062             } else {
4063                 error_report("Unknown combination of migration flags: 0x%x",
4064                              flags);
4065                 ret = -EINVAL;
4066             }
4067         }
4068         if (!ret) {
4069             ret = qemu_file_get_error(f);
4070         }
4071         if (!ret && host_bak) {
4072             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4073         }
4074     }
4075
4076     ret |= wait_for_decompress_done();
4077     return ret;
4078 }
4079
4080 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4081 {
4082     int ret = 0;
4083     static uint64_t seq_iter;
4084     /*
4085      * If system is running in postcopy mode, page inserts to host memory must
4086      * be atomic
4087      */
4088     bool postcopy_running = postcopy_is_running();
4089
4090     seq_iter++;
4091
4092     if (version_id != 4) {
4093         return -EINVAL;
4094     }
4095
4096     /*
4097      * This RCU critical section can be very long running.
4098      * When RCU reclaims in the code start to become numerous,
4099      * it will be necessary to reduce the granularity of this
4100      * critical section.
4101      */
4102     WITH_RCU_READ_LOCK_GUARD() {
4103         if (postcopy_running) {
4104             ret = ram_load_postcopy(f);
4105         } else {
4106             ret = ram_load_precopy(f);
4107         }
4108     }
4109     trace_ram_load_complete(ret, seq_iter);
4110
4111     return ret;
4112 }
4113
4114 static bool ram_has_postcopy(void *opaque)
4115 {
4116     RAMBlock *rb;
4117     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4118         if (ramblock_is_pmem(rb)) {
4119             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4120                          "is not supported now!", rb->idstr, rb->host);
4121             return false;
4122         }
4123     }
4124
4125     return migrate_postcopy_ram();
4126 }
4127
4128 /* Sync all the dirty bitmap with destination VM.  */
4129 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4130 {
4131     RAMBlock *block;
4132     QEMUFile *file = s->to_dst_file;
4133     int ramblock_count = 0;
4134
4135     trace_ram_dirty_bitmap_sync_start();
4136
4137     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4138         qemu_savevm_send_recv_bitmap(file, block->idstr);
4139         trace_ram_dirty_bitmap_request(block->idstr);
4140         ramblock_count++;
4141     }
4142
4143     trace_ram_dirty_bitmap_sync_wait();
4144
4145     /* Wait until all the ramblocks' dirty bitmap synced */
4146     while (ramblock_count--) {
4147         qemu_sem_wait(&s->rp_state.rp_sem);
4148     }
4149
4150     trace_ram_dirty_bitmap_sync_complete();
4151
4152     return 0;
4153 }
4154
4155 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4156 {
4157     qemu_sem_post(&s->rp_state.rp_sem);
4158 }
4159
4160 /*
4161  * Read the received bitmap, revert it as the initial dirty bitmap.
4162  * This is only used when the postcopy migration is paused but wants
4163  * to resume from a middle point.
4164  */
4165 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4166 {
4167     int ret = -EINVAL;
4168     /* from_dst_file is always valid because we're within rp_thread */
4169     QEMUFile *file = s->rp_state.from_dst_file;
4170     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4171     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4172     uint64_t size, end_mark;
4173
4174     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4175
4176     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4177         error_report("%s: incorrect state %s", __func__,
4178                      MigrationStatus_str(s->state));
4179         return -EINVAL;
4180     }
4181
4182     /*
4183      * Note: see comments in ramblock_recv_bitmap_send() on why we
4184      * need the endianness conversion, and the paddings.
4185      */
4186     local_size = ROUND_UP(local_size, 8);
4187
4188     /* Add paddings */
4189     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4190
4191     size = qemu_get_be64(file);
4192
4193     /* The size of the bitmap should match with our ramblock */
4194     if (size != local_size) {
4195         error_report("%s: ramblock '%s' bitmap size mismatch "
4196                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4197                      block->idstr, size, local_size);
4198         ret = -EINVAL;
4199         goto out;
4200     }
4201
4202     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4203     end_mark = qemu_get_be64(file);
4204
4205     ret = qemu_file_get_error(file);
4206     if (ret || size != local_size) {
4207         error_report("%s: read bitmap failed for ramblock '%s': %d"
4208                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4209                      __func__, block->idstr, ret, local_size, size);
4210         ret = -EIO;
4211         goto out;
4212     }
4213
4214     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4215         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4216                      __func__, block->idstr, end_mark);
4217         ret = -EINVAL;
4218         goto out;
4219     }
4220
4221     /*
4222      * Endianness conversion. We are during postcopy (though paused).
4223      * The dirty bitmap won't change. We can directly modify it.
4224      */
4225     bitmap_from_le(block->bmap, le_bitmap, nbits);
4226
4227     /*
4228      * What we received is "received bitmap". Revert it as the initial
4229      * dirty bitmap for this ramblock.
4230      */
4231     bitmap_complement(block->bmap, block->bmap, nbits);
4232
4233     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4234     ramblock_dirty_bitmap_clear_discarded_pages(block);
4235
4236     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4237     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4238
4239     /*
4240      * We succeeded to sync bitmap for current ramblock. If this is
4241      * the last one to sync, we need to notify the main send thread.
4242      */
4243     ram_dirty_bitmap_reload_notify(s);
4244
4245     ret = 0;
4246 out:
4247     g_free(le_bitmap);
4248     return ret;
4249 }
4250
4251 static int ram_resume_prepare(MigrationState *s, void *opaque)
4252 {
4253     RAMState *rs = *(RAMState **)opaque;
4254     int ret;
4255
4256     ret = ram_dirty_bitmap_sync_all(s, rs);
4257     if (ret) {
4258         return ret;
4259     }
4260
4261     ram_state_resume_prepare(rs, s->to_dst_file);
4262
4263     return 0;
4264 }
4265
4266 static SaveVMHandlers savevm_ram_handlers = {
4267     .save_setup = ram_save_setup,
4268     .save_live_iterate = ram_save_iterate,
4269     .save_live_complete_postcopy = ram_save_complete,
4270     .save_live_complete_precopy = ram_save_complete,
4271     .has_postcopy = ram_has_postcopy,
4272     .save_live_pending = ram_save_pending,
4273     .load_state = ram_load,
4274     .save_cleanup = ram_save_cleanup,
4275     .load_setup = ram_load_setup,
4276     .load_cleanup = ram_load_cleanup,
4277     .resume_prepare = ram_resume_prepare,
4278 };
4279
4280 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4281                                       size_t old_size, size_t new_size)
4282 {
4283     PostcopyState ps = postcopy_state_get();
4284     ram_addr_t offset;
4285     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4286     Error *err = NULL;
4287
4288     if (ramblock_is_ignored(rb)) {
4289         return;
4290     }
4291
4292     if (!migration_is_idle()) {
4293         /*
4294          * Precopy code on the source cannot deal with the size of RAM blocks
4295          * changing at random points in time - especially after sending the
4296          * RAM block sizes in the migration stream, they must no longer change.
4297          * Abort and indicate a proper reason.
4298          */
4299         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4300         migration_cancel(err);
4301         error_free(err);
4302     }
4303
4304     switch (ps) {
4305     case POSTCOPY_INCOMING_ADVISE:
4306         /*
4307          * Update what ram_postcopy_incoming_init()->init_range() does at the
4308          * time postcopy was advised. Syncing RAM blocks with the source will
4309          * result in RAM resizes.
4310          */
4311         if (old_size < new_size) {
4312             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4313                 error_report("RAM block '%s' discard of resized RAM failed",
4314                              rb->idstr);
4315             }
4316         }
4317         rb->postcopy_length = new_size;
4318         break;
4319     case POSTCOPY_INCOMING_NONE:
4320     case POSTCOPY_INCOMING_RUNNING:
4321     case POSTCOPY_INCOMING_END:
4322         /*
4323          * Once our guest is running, postcopy does no longer care about
4324          * resizes. When growing, the new memory was not available on the
4325          * source, no handler needed.
4326          */
4327         break;
4328     default:
4329         error_report("RAM block '%s' resized during postcopy state: %d",
4330                      rb->idstr, ps);
4331         exit(-1);
4332     }
4333 }
4334
4335 static RAMBlockNotifier ram_mig_ram_notifier = {
4336     .ram_block_resized = ram_mig_ram_block_resized,
4337 };
4338
4339 void ram_mig_init(void)
4340 {
4341     qemu_mutex_init(&XBZRLE.lock);
4342     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4343     ram_block_notifier_add(&ram_mig_ram_notifier);
4344 }