migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/cpu-throttle.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58 #include "sysemu/runstate.h"
  59
  60 #include "hw/boards.h" /* for machine_dump_guest_core() */
  61
  62 #if defined(__linux__)
  63 #include "qemu/userfaultfd.h"
  64 #endif /* defined(__linux__) */
  65
  66 /***********************************************************/
  67 /* ram save/restore */
  68
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  74
  75 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  76 #define RAM_SAVE_FLAG_ZERO     0x02
  77 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  78 #define RAM_SAVE_FLAG_PAGE     0x08
  79 #define RAM_SAVE_FLAG_EOS      0x10
  80 #define RAM_SAVE_FLAG_CONTINUE 0x20
  81 #define RAM_SAVE_FLAG_XBZRLE   0x40
  82 /* 0x80 is reserved in migration.h start with 0x100 next */
  83 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  84
  85 XBZRLECacheStats xbzrle_counters;
  86
/*
 * Global XBZRLE state: the page cache plus the scratch buffers used
 * while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Mutex taken by XBZRLE_cache_lock() / released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle()) {
 106         qemu_mutex_lock(&XBZRLE.lock);
 107     }
 108 }
 109
 110 static void XBZRLE_cache_unlock(void)
 111 {
 112     if (migrate_use_xbzrle()) {
 113         qemu_mutex_unlock(&XBZRLE.lock);
 114     }
 115 }
 116
 117 /**
 118  * xbzrle_cache_resize: resize the xbzrle cache
 119  *
 120  * This function is called from migrate_params_apply in main
 121  * thread, possibly while a migration is in progress.  A running
 122  * migration may be using the cache and might finish during this call,
 123  * hence changes to the cache are protected by XBZRLE.lock().
 124  *
 125  * Returns 0 for success or -1 for error
 126  *
 127  * @new_size: new cache size
 128  * @errp: set *errp if the check failed, with reason
 129  */
 130 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 131 {
 132     PageCache *new_cache;
 133     int64_t ret = 0;
 134
 135     /* Check for truncation */
 136     if (new_size != (size_t)new_size) {
 137         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 138                    "exceeding address space");
 139         return -1;
 140     }
 141
 142     if (new_size == migrate_xbzrle_cache_size()) {
 143         /* nothing to do */
 144         return 0;
 145     }
 146
 147     XBZRLE_cache_lock();
 148
 149     if (XBZRLE.cache != NULL) {
 150         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 151         if (!new_cache) {
 152             ret = -1;
 153             goto out;
 154         }
 155
 156         cache_fini(XBZRLE.cache);
 157         XBZRLE.cache = new_cache;
 158     }
 159 out:
 160     XBZRLE_cache_unlock();
 161     return ret;
 162 }
 163
 164 bool ramblock_is_ignored(RAMBlock *block)
 165 {
 166     return !qemu_ram_is_migratable(block) ||
 167            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 168 }
 169
 170 #undef RAMBLOCK_FOREACH
 171
 172 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 173 {
 174     RAMBlock *block;
 175     int ret = 0;
 176
 177     RCU_READ_LOCK_GUARD();
 178
 179     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 180         ret = func(block, opaque);
 181         if (ret) {
 182             break;
 183         }
 184     }
 185     return ret;
 186 }
 187
 188 static void ramblock_recv_map_init(void)
 189 {
 190     RAMBlock *rb;
 191
 192     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 193         assert(!rb->receivedmap);
 194         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 195     }
 196 }
 197
 198 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 199 {
 200     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 201                     rb->receivedmap);
 202 }
 203
 204 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 205 {
 206     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 207 }
 208
 209 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 210 {
 211     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 212 }
 213
 214 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 215                                     size_t nr)
 216 {
 217     bitmap_set_atomic(rb->receivedmap,
 218                       ramblock_recv_bitmap_offset(host_addr, rb),
 219                       nr);
 220 }
 221
 222 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 223
 224 /*
 225  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 226  *
 227  * Returns >0 if success with sent bytes, or <0 if error.
 228  */
 229 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 230                                   const char *block_name)
 231 {
 232     RAMBlock *block = qemu_ram_block_by_name(block_name);
 233     unsigned long *le_bitmap, nbits;
 234     uint64_t size;
 235
 236     if (!block) {
 237         error_report("%s: invalid block name: %s", __func__, block_name);
 238         return -1;
 239     }
 240
 241     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 242
 243     /*
 244      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 245      * machines we may need 4 more bytes for padding (see below
 246      * comment). So extend it a bit before hand.
 247      */
 248     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 249
 250     /*
 251      * Always use little endian when sending the bitmap. This is
 252      * required that when source and destination VMs are not using the
 253      * same endianness. (Note: big endian won't work.)
 254      */
 255     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 256
 257     /* Size of the bitmap, in bytes */
 258     size = DIV_ROUND_UP(nbits, 8);
 259
 260     /*
 261      * size is always aligned to 8 bytes for 64bit machines, but it
 262      * may not be true for 32bit machines. We need this padding to
 263      * make sure the migration can survive even between 32bit and
 264      * 64bit machines.
 265      */
 266     size = ROUND_UP(size, 8);
 267
 268     qemu_put_be64(file, size);
 269     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 270     /*
 271      * Mark as an end, in case the middle part is screwed up due to
 272      * some "mysterious" reason.
 273      */
 274     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 275     qemu_fflush(file);
 276
 277     g_free(le_bitmap);
 278
 279     if (qemu_file_get_error(file)) {
 280         return qemu_file_get_error(file);
 281     }
 282
 283     return size + sizeof(size);
 284 }
 285
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* presumably the start offset of the request inside rb -- confirm */
    hwaddr    offset;
    /* presumably the length of the request in bytes -- confirm */
    hwaddr    len;

    /* Linkage for the RAMState.src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 297
 298 /* State of RAM for migration */
 299 struct RAMState {
 300     /* QEMUFile used for this migration */
 301     QEMUFile *f;
 302     /* UFFD file descriptor, used in 'write-tracking' migration */
 303     int uffdio_fd;
 304     /* Last block that we have visited searching for dirty pages */
 305     RAMBlock *last_seen_block;
 306     /* Last block from where we have sent data */
 307     RAMBlock *last_sent_block;
 308     /* Last dirty target page we have sent */
 309     ram_addr_t last_page;
 310     /* last ram version we have seen */
 311     uint32_t last_version;
 312     /* How many times we have dirty too many pages */
 313     int dirty_rate_high_cnt;
 314     /* these variables are used for bitmap sync */
 315     /* last time we did a full bitmap_sync */
 316     int64_t time_last_bitmap_sync;
 317     /* bytes transferred at start_time */
 318     uint64_t bytes_xfer_prev;
 319     /* number of dirty pages since start_time */
 320     uint64_t num_dirty_pages_period;
 321     /* xbzrle misses since the beginning of the period */
 322     uint64_t xbzrle_cache_miss_prev;
 323     /* Amount of xbzrle pages since the beginning of the period */
 324     uint64_t xbzrle_pages_prev;
 325     /* Amount of xbzrle encoded bytes since the beginning of the period */
 326     uint64_t xbzrle_bytes_prev;
 327     /* Start using XBZRLE (e.g., after the first round). */
 328     bool xbzrle_enabled;
 329     /* Are we on the last stage of migration */
 330     bool last_stage;
 331     /* compression statistics since the beginning of the period */
 332     /* amount of count that no free thread to compress data */
 333     uint64_t compress_thread_busy_prev;
 334     /* amount bytes after compression */
 335     uint64_t compressed_size_prev;
 336     /* amount of compressed pages */
 337     uint64_t compress_pages_prev;
 338
 339     /* total handled target pages at the beginning of period */
 340     uint64_t target_page_count_prev;
 341     /* total handled target pages since start */
 342     uint64_t target_page_count;
 343     /* number of dirty bits in the bitmap */
 344     uint64_t migration_dirty_pages;
 345     /* Protects modification of the bitmap and migration dirty pages */
 346     QemuMutex bitmap_mutex;
 347     /* The RAMBlock used in the last src_page_requests */
 348     RAMBlock *last_req_rb;
 349     /* Queue of outstanding page requests from the destination */
 350     QemuMutex src_page_req_mutex;
 351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 352 };
 353 typedef struct RAMState RAMState;
 354
 355 static RAMState *ram_state;
 356
 357 static NotifierWithReturnList precopy_notifier_list;
 358
 359 /* Whether postcopy has queued requests? */
 360 static bool postcopy_has_request(RAMState *rs)
 361 {
 362     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 363 }
 364
 365 void precopy_infrastructure_init(void)
 366 {
 367     notifier_with_return_list_init(&precopy_notifier_list);
 368 }
 369
 370 void precopy_add_notifier(NotifierWithReturn *n)
 371 {
 372     notifier_with_return_list_add(&precopy_notifier_list, n);
 373 }
 374
 375 void precopy_remove_notifier(NotifierWithReturn *n)
 376 {
 377     notifier_with_return_remove(n);
 378 }
 379
 380 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 381 {
 382     PrecopyNotifyData pnd;
 383     pnd.reason = reason;
 384     pnd.errp = errp;
 385
 386     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 387 }
 388
 389 uint64_t ram_bytes_remaining(void)
 390 {
 391     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 392                        0;
 393 }
 394
 395 MigrationStats ram_counters;
 396
 397 static void ram_transferred_add(uint64_t bytes)
 398 {
 399     if (runstate_is_running()) {
 400         ram_counters.precopy_bytes += bytes;
 401     } else if (migration_in_postcopy()) {
 402         ram_counters.postcopy_bytes += bytes;
 403     } else {
 404         ram_counters.downtime_bytes += bytes;
 405     }
 406     ram_counters.transferred += bytes;
 407 }
 408
 409 /* used by the search for pages to send */
 410 struct PageSearchStatus {
 411     /* Current block being searched */
 412     RAMBlock    *block;
 413     /* Current page to search from */
 414     unsigned long page;
 415     /* Set once we wrap around */
 416     bool         complete_round;
 417     /* Whether current page is explicitly requested by postcopy */
 418     bool         postcopy_requested;
 419 };
 420 typedef struct PageSearchStatus PageSearchStatus;
 421
 422 CompressionStats compression_counters;
 423
 424 struct CompressParam {
 425     bool done;
 426     bool quit;
 427     bool zero_page;
 428     QEMUFile *file;
 429     QemuMutex mutex;
 430     QemuCond cond;
 431     RAMBlock *block;
 432     ram_addr_t offset;
 433
 434     /* internally used fields */
 435     z_stream stream;
 436     uint8_t *originbuf;
 437 };
 438 typedef struct CompressParam CompressParam;
 439
 440 struct DecompressParam {
 441     bool done;
 442     bool quit;
 443     QemuMutex mutex;
 444     QemuCond cond;
 445     void *des;
 446     uint8_t *compbuf;
 447     int len;
 448     z_stream stream;
 449 };
 450 typedef struct DecompressParam DecompressParam;
 451
 452 static CompressParam *comp_param;
 453 static QemuThread *compress_threads;
 454 /* comp_done_cond is used to wake up the migration thread when
 455  * one of the compression threads has finished the compression.
 456  * comp_done_lock is used to co-work with comp_done_cond.
 457  */
 458 static QemuMutex comp_done_lock;
 459 static QemuCond comp_done_cond;
 460 /* The empty QEMUFileOps will be used by file in CompressParam */
 461 static const QEMUFileOps empty_ops = { };
 462
 463 static QEMUFile *decomp_file;
 464 static DecompressParam *decomp_param;
 465 static QemuThread *decompress_threads;
 466 static QemuMutex decomp_done_lock;
 467 static QemuCond decomp_done_cond;
 468
 469 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 470                                  ram_addr_t offset, uint8_t *source_buf);
 471
 472 static void *do_data_compress(void *opaque)
 473 {
 474     CompressParam *param = opaque;
 475     RAMBlock *block;
 476     ram_addr_t offset;
 477     bool zero_page;
 478
 479     qemu_mutex_lock(&param->mutex);
 480     while (!param->quit) {
 481         if (param->block) {
 482             block = param->block;
 483             offset = param->offset;
 484             param->block = NULL;
 485             qemu_mutex_unlock(&param->mutex);
 486
 487             zero_page = do_compress_ram_page(param->file, &param->stream,
 488                                              block, offset, param->originbuf);
 489
 490             qemu_mutex_lock(&comp_done_lock);
 491             param->done = true;
 492             param->zero_page = zero_page;
 493             qemu_cond_signal(&comp_done_cond);
 494             qemu_mutex_unlock(&comp_done_lock);
 495
 496             qemu_mutex_lock(&param->mutex);
 497         } else {
 498             qemu_cond_wait(&param->cond, &param->mutex);
 499         }
 500     }
 501     qemu_mutex_unlock(&param->mutex);
 502
 503     return NULL;
 504 }
 505
 506 static void compress_threads_save_cleanup(void)
 507 {
 508     int i, thread_count;
 509
 510     if (!migrate_use_compression() || !comp_param) {
 511         return;
 512     }
 513
 514     thread_count = migrate_compress_threads();
 515     for (i = 0; i < thread_count; i++) {
 516         /*
 517          * we use it as a indicator which shows if the thread is
 518          * properly init'd or not
 519          */
 520         if (!comp_param[i].file) {
 521             break;
 522         }
 523
 524         qemu_mutex_lock(&comp_param[i].mutex);
 525         comp_param[i].quit = true;
 526         qemu_cond_signal(&comp_param[i].cond);
 527         qemu_mutex_unlock(&comp_param[i].mutex);
 528
 529         qemu_thread_join(compress_threads + i);
 530         qemu_mutex_destroy(&comp_param[i].mutex);
 531         qemu_cond_destroy(&comp_param[i].cond);
 532         deflateEnd(&comp_param[i].stream);
 533         g_free(comp_param[i].originbuf);
 534         qemu_fclose(comp_param[i].file);
 535         comp_param[i].file = NULL;
 536     }
 537     qemu_mutex_destroy(&comp_done_lock);
 538     qemu_cond_destroy(&comp_done_cond);
 539     g_free(compress_threads);
 540     g_free(comp_param);
 541     compress_threads = NULL;
 542     comp_param = NULL;
 543 }
 544
 545 static int compress_threads_save_setup(void)
 546 {
 547     int i, thread_count;
 548
 549     if (!migrate_use_compression()) {
 550         return 0;
 551     }
 552     thread_count = migrate_compress_threads();
 553     compress_threads = g_new0(QemuThread, thread_count);
 554     comp_param = g_new0(CompressParam, thread_count);
 555     qemu_cond_init(&comp_done_cond);
 556     qemu_mutex_init(&comp_done_lock);
 557     for (i = 0; i < thread_count; i++) {
 558         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 559         if (!comp_param[i].originbuf) {
 560             goto exit;
 561         }
 562
 563         if (deflateInit(&comp_param[i].stream,
 564                         migrate_compress_level()) != Z_OK) {
 565             g_free(comp_param[i].originbuf);
 566             goto exit;
 567         }
 568
 569         /* comp_param[i].file is just used as a dummy buffer to save data,
 570          * set its ops to empty.
 571          */
 572         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 573         comp_param[i].done = true;
 574         comp_param[i].quit = false;
 575         qemu_mutex_init(&comp_param[i].mutex);
 576         qemu_cond_init(&comp_param[i].cond);
 577         qemu_thread_create(compress_threads + i, "compress",
 578                            do_data_compress, comp_param + i,
 579                            QEMU_THREAD_JOINABLE);
 580     }
 581     return 0;
 582
 583 exit:
 584     compress_threads_save_cleanup();
 585     return -1;
 586 }
 587
 588 /**
 589  * save_page_header: write page header to wire
 590  *
 591  * If this is the 1st block, it also writes the block identification
 592  *
 593  * Returns the number of bytes written
 594  *
 595  * @f: QEMUFile where to send the data
 596  * @block: block that contains the page we want to send
 597  * @offset: offset inside the block for the page
 598  *          in the lower bits, it contains flags
 599  */
 600 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 601                                ram_addr_t offset)
 602 {
 603     size_t size, len;
 604
 605     if (block == rs->last_sent_block) {
 606         offset |= RAM_SAVE_FLAG_CONTINUE;
 607     }
 608     qemu_put_be64(f, offset);
 609     size = 8;
 610
 611     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 612         len = strlen(block->idstr);
 613         qemu_put_byte(f, len);
 614         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 615         size += 1 + len;
 616         rs->last_sent_block = block;
 617     }
 618     return size;
 619 }
 620
 621 /**
 622  * mig_throttle_guest_down: throttle down the guest
 623  *
 624  * Reduce amount of guest cpu execution to hopefully slow down memory
 625  * writes. If guest dirty memory rate is reduced below the rate at
 626  * which we can transfer pages to the destination then we should be
 627  * able to complete migration. Some workloads dirty memory way too
 628  * fast and will not effectively converge, even with auto-converge.
 629  */
 630 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 631                                     uint64_t bytes_dirty_threshold)
 632 {
 633     MigrationState *s = migrate_get_current();
 634     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 635     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 636     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 637     int pct_max = s->parameters.max_cpu_throttle;
 638
 639     uint64_t throttle_now = cpu_throttle_get_percentage();
 640     uint64_t cpu_now, cpu_ideal, throttle_inc;
 641
 642     /* We have not started throttling yet. Let's start it. */
 643     if (!cpu_throttle_active()) {
 644         cpu_throttle_set(pct_initial);
 645     } else {
 646         /* Throttling already on, just increase the rate */
 647         if (!pct_tailslow) {
 648             throttle_inc = pct_increment;
 649         } else {
 650             /* Compute the ideal CPU percentage used by Guest, which may
 651              * make the dirty rate match the dirty rate threshold. */
 652             cpu_now = 100 - throttle_now;
 653             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 654                         bytes_dirty_period);
 655             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 656         }
 657         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 658     }
 659 }
 660
 661 void mig_throttle_counter_reset(void)
 662 {
 663     RAMState *rs = ram_state;
 664
 665     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 666     rs->num_dirty_pages_period = 0;
 667     rs->bytes_xfer_prev = ram_counters.transferred;
 668 }
 669
 670 /**
 671  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 672  *
 673  * @rs: current RAM state
 674  * @current_addr: address for the zero page
 675  *
 676  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 677  * The important thing is that a stale (not-yet-0'd) page be replaced
 678  * by the new data.
 679  * As a bonus, if the page wasn't in the cache it gets added so that
 680  * when a small write is made into the 0'd page it gets XBZRLE sent.
 681  */
 682 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 683 {
 684     if (!rs->xbzrle_enabled) {
 685         return;
 686     }
 687
 688     /* We don't care if this fails to allocate a new cache page
 689      * as long as it updated an old one */
 690     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 691                  ram_counters.dirty_sync_count);
 692 }
 693
 694 #define ENCODING_FLAG_XBZRLE 0x1
 695
 696 /**
 697  * save_xbzrle_page: compress and send current page
 698  *
 699  * Returns: 1 means that we wrote the page
 700  *          0 means that page is identical to the one already sent
 701  *          -1 means that xbzrle would be longer than normal
 702  *
 703  * @rs: current RAM state
 704  * @current_data: pointer to the address of the page contents
 705  * @current_addr: addr of the page
 706  * @block: block that contains the page we want to send
 707  * @offset: offset inside the block for the page
 708  */
 709 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 710                             ram_addr_t current_addr, RAMBlock *block,
 711                             ram_addr_t offset)
 712 {
 713     int encoded_len = 0, bytes_xbzrle;
 714     uint8_t *prev_cached_page;
 715
 716     if (!cache_is_cached(XBZRLE.cache, current_addr,
 717                          ram_counters.dirty_sync_count)) {
 718         xbzrle_counters.cache_miss++;
 719         if (!rs->last_stage) {
 720             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 721                              ram_counters.dirty_sync_count) == -1) {
 722                 return -1;
 723             } else {
 724                 /* update *current_data when the page has been
 725                    inserted into cache */
 726                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 727             }
 728         }
 729         return -1;
 730     }
 731
 732     /*
 733      * Reaching here means the page has hit the xbzrle cache, no matter what
 734      * encoding result it is (normal encoding, overflow or skipping the page),
 735      * count the page as encoded. This is used to calculate the encoding rate.
 736      *
 737      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 738      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 739      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 740      * skipped page included. In this way, the encoding rate can tell if the
 741      * guest page is good for xbzrle encoding.
 742      */
 743     xbzrle_counters.pages++;
 744     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 745
 746     /* save current buffer into memory */
 747     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 748
 749     /* XBZRLE encoding (if there is no overflow) */
 750     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 751                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 752                                        TARGET_PAGE_SIZE);
 753
 754     /*
 755      * Update the cache contents, so that it corresponds to the data
 756      * sent, in all cases except where we skip the page.
 757      */
 758     if (!rs->last_stage && encoded_len != 0) {
 759         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 760         /*
 761          * In the case where we couldn't compress, ensure that the caller
 762          * sends the data from the cache, since the guest might have
 763          * changed the RAM since we copied it.
 764          */
 765         *current_data = prev_cached_page;
 766     }
 767
 768     if (encoded_len == 0) {
 769         trace_save_xbzrle_page_skipping();
 770         return 0;
 771     } else if (encoded_len == -1) {
 772         trace_save_xbzrle_page_overflow();
 773         xbzrle_counters.overflow++;
 774         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 775         return -1;
 776     }
 777
 778     /* Send XBZRLE based compressed page */
 779     bytes_xbzrle = save_page_header(rs, rs->f, block,
 780                                     offset | RAM_SAVE_FLAG_XBZRLE);
 781     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 782     qemu_put_be16(rs->f, encoded_len);
 783     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 784     bytes_xbzrle += encoded_len + 1 + 2;
 785     /*
 786      * Like compressed_size (please see update_compress_thread_counts),
 787      * the xbzrle encoded bytes don't count the 8 byte header with
 788      * RAM_SAVE_FLAG_CONTINUE.
 789      */
 790     xbzrle_counters.bytes += bytes_xbzrle - 8;
 791     ram_transferred_add(bytes_xbzrle);
 792
 793     return 1;
 794 }
 795
 796 /**
 797  * migration_bitmap_find_dirty: find the next dirty page from start
 798  *
 799  * Returns the page offset within memory region of the start of a dirty page
 800  *
 801  * @rs: current RAM state
 802  * @rb: RAMBlock where to search for dirty pages
 803  * @start: page where we start the search
 804  */
 805 static inline
 806 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 807                                           unsigned long start)
 808 {
 809     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 810     unsigned long *bitmap = rb->bmap;
 811
 812     if (ramblock_is_ignored(rb)) {
 813         return size;
 814     }
 815
 816     return find_next_bit(bitmap, size, start);
 817 }
 818
 819 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 820                                                        unsigned long page)
 821 {
 822     uint8_t shift;
 823     hwaddr size, start;
 824
 825     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 826         return;
 827     }
 828
 829     shift = rb->clear_bmap_shift;
 830     /*
 831      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 832      * can make things easier sometimes since then start address
 833      * of the small chunk will always be 64 pages aligned so the
 834      * bitmap will always be aligned to unsigned long. We should
 835      * even be able to remove this restriction but I'm simply
 836      * keeping it.
 837      */
 838     assert(shift >= 6);
 839
 840     size = 1ULL << (TARGET_PAGE_BITS + shift);
 841     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 842     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 843     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 844 }
 845
 846 static void
 847 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 848                                                  unsigned long start,
 849                                                  unsigned long npages)
 850 {
 851     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 852     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 853     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 854
 855     /*
 856      * Clear pages from start to start + npages - 1, so the end boundary is
 857      * exclusive.
 858      */
 859     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 860         migration_clear_memory_region_dirty_bitmap(rb, i);
 861     }
 862 }
 863
 864 /*
 865  * colo_bitmap_find_diry:find contiguous dirty pages from start
 866  *
 867  * Returns the page offset within memory region of the start of the contiguout
 868  * dirty page
 869  *
 870  * @rs: current RAM state
 871  * @rb: RAMBlock where to search for dirty pages
 872  * @start: page where we start the search
 873  * @num: the number of contiguous dirty pages
 874  */
 875 static inline
 876 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 877                                      unsigned long start, unsigned long *num)
 878 {
 879     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 880     unsigned long *bitmap = rb->bmap;
 881     unsigned long first, next;
 882
 883     *num = 0;
 884
 885     if (ramblock_is_ignored(rb)) {
 886         return size;
 887     }
 888
 889     first = find_next_bit(bitmap, size, start);
 890     if (first >= size) {
 891         return first;
 892     }
 893     next = find_next_zero_bit(bitmap, size, first + 1);
 894     assert(next >= first);
 895     *num = next - first;
 896     return first;
 897 }
 898
 899 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 900                                                 RAMBlock *rb,
 901                                                 unsigned long page)
 902 {
 903     bool ret;
 904
 905     /*
 906      * Clear dirty bitmap if needed.  This _must_ be called before we
 907      * send any of the page in the chunk because we need to make sure
 908      * we can capture further page content changes when we sync dirty
 909      * log the next time.  So as long as we are going to send any of
 910      * the page in the chunk we clear the remote dirty bitmap for all.
 911      * Clearing it earlier won't be a problem, but too late will.
 912      */
 913     migration_clear_memory_region_dirty_bitmap(rb, page);
 914
 915     ret = test_and_clear_bit(page, rb->bmap);
 916     if (ret) {
 917         rs->migration_dirty_pages--;
 918     }
 919
 920     return ret;
 921 }
 922
 923 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 924                                        void *opaque)
 925 {
 926     const hwaddr offset = section->offset_within_region;
 927     const hwaddr size = int128_get64(section->size);
 928     const unsigned long start = offset >> TARGET_PAGE_BITS;
 929     const unsigned long npages = size >> TARGET_PAGE_BITS;
 930     RAMBlock *rb = section->mr->ram_block;
 931     uint64_t *cleared_bits = opaque;
 932
 933     /*
 934      * We don't grab ram_state->bitmap_mutex because we expect to run
 935      * only when starting migration or during postcopy recovery where
 936      * we don't have concurrent access.
 937      */
 938     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 939         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 940     }
 941     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 942     bitmap_clear(rb->bmap, start, npages);
 943 }
 944
 945 /*
 946  * Exclude all dirty pages from migration that fall into a discarded range as
 947  * managed by a RamDiscardManager responsible for the mapped memory region of
 948  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 949  *
 950  * Discarded pages ("logically unplugged") have undefined content and must
 951  * not get migrated, because even reading these pages for migration might
 952  * result in undesired behavior.
 953  *
 954  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 955  *
 956  * Note: The result is only stable while migrating (precopy/postcopy).
 957  */
 958 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 959 {
 960     uint64_t cleared_bits = 0;
 961
 962     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 963         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 964         MemoryRegionSection section = {
 965             .mr = rb->mr,
 966             .offset_within_region = 0,
 967             .size = int128_make64(qemu_ram_get_used_length(rb)),
 968         };
 969
 970         ram_discard_manager_replay_discarded(rdm, &section,
 971                                              dirty_bitmap_clear_section,
 972                                              &cleared_bits);
 973     }
 974     return cleared_bits;
 975 }
 976
 977 /*
 978  * Check if a host-page aligned page falls into a discarded range as managed by
 979  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 980  *
 981  * Note: The result is only stable while migrating (precopy/postcopy).
 982  */
 983 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 984 {
 985     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 986         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 987         MemoryRegionSection section = {
 988             .mr = rb->mr,
 989             .offset_within_region = start,
 990             .size = int128_make64(qemu_ram_pagesize(rb)),
 991         };
 992
 993         return !ram_discard_manager_is_populated(rdm, &section);
 994     }
 995     return false;
 996 }
 997
 998 /* Called with RCU critical section */
 999 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1000 {
1001     uint64_t new_dirty_pages =
1002         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1003
1004     rs->migration_dirty_pages += new_dirty_pages;
1005     rs->num_dirty_pages_period += new_dirty_pages;
1006 }
1007
1008 /**
1009  * ram_pagesize_summary: calculate all the pagesizes of a VM
1010  *
1011  * Returns a summary bitmap of the page sizes of all RAMBlocks
1012  *
1013  * For VMs with just normal pages this is equivalent to the host page
1014  * size. If it's got some huge pages then it's the OR of all the
1015  * different page sizes.
1016  */
1017 uint64_t ram_pagesize_summary(void)
1018 {
1019     RAMBlock *block;
1020     uint64_t summary = 0;
1021
1022     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1023         summary |= block->page_size;
1024     }
1025
1026     return summary;
1027 }
1028
1029 uint64_t ram_get_total_transferred_pages(void)
1030 {
1031     return  ram_counters.normal + ram_counters.duplicate +
1032                 compression_counters.pages + xbzrle_counters.pages;
1033 }
1034
1035 static void migration_update_rates(RAMState *rs, int64_t end_time)
1036 {
1037     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1038     double compressed_size;
1039
1040     /* calculate period counters */
1041     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1042                 / (end_time - rs->time_last_bitmap_sync);
1043
1044     if (!page_count) {
1045         return;
1046     }
1047
1048     if (migrate_use_xbzrle()) {
1049         double encoded_size, unencoded_size;
1050
1051         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1052             rs->xbzrle_cache_miss_prev) / page_count;
1053         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1054         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1055                          TARGET_PAGE_SIZE;
1056         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1057         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1058             xbzrle_counters.encoding_rate = 0;
1059         } else {
1060             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1061         }
1062         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1063         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1064     }
1065
1066     if (migrate_use_compression()) {
1067         compression_counters.busy_rate = (double)(compression_counters.busy -
1068             rs->compress_thread_busy_prev) / page_count;
1069         rs->compress_thread_busy_prev = compression_counters.busy;
1070
1071         compressed_size = compression_counters.compressed_size -
1072                           rs->compressed_size_prev;
1073         if (compressed_size) {
1074             double uncompressed_size = (compression_counters.pages -
1075                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1076
1077             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1078             compression_counters.compression_rate =
1079                                         uncompressed_size / compressed_size;
1080
1081             rs->compress_pages_prev = compression_counters.pages;
1082             rs->compressed_size_prev = compression_counters.compressed_size;
1083         }
1084     }
1085 }
1086
1087 static void migration_trigger_throttle(RAMState *rs)
1088 {
1089     MigrationState *s = migrate_get_current();
1090     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1091
1092     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1093     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1094     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1095
1096     /* During block migration the auto-converge logic incorrectly detects
1097      * that ram migration makes no progress. Avoid this by disabling the
1098      * throttling logic during the bulk phase of block migration. */
1099     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1100         /* The following detection logic can be refined later. For now:
1101            Check to see if the ratio between dirtied bytes and the approx.
1102            amount of bytes that just got transferred since the last time
1103            we were in this routine reaches the threshold. If that happens
1104            twice, start or increase throttling. */
1105
1106         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1107             (++rs->dirty_rate_high_cnt >= 2)) {
1108             trace_migration_throttle();
1109             rs->dirty_rate_high_cnt = 0;
1110             mig_throttle_guest_down(bytes_dirty_period,
1111                                     bytes_dirty_threshold);
1112         }
1113     }
1114 }
1115
1116 static void migration_bitmap_sync(RAMState *rs)
1117 {
1118     RAMBlock *block;
1119     int64_t end_time;
1120
1121     ram_counters.dirty_sync_count++;
1122
1123     if (!rs->time_last_bitmap_sync) {
1124         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1125     }
1126
1127     trace_migration_bitmap_sync_start();
1128     memory_global_dirty_log_sync();
1129
1130     qemu_mutex_lock(&rs->bitmap_mutex);
1131     WITH_RCU_READ_LOCK_GUARD() {
1132         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1133             ramblock_sync_dirty_bitmap(rs, block);
1134         }
1135         ram_counters.remaining = ram_bytes_remaining();
1136     }
1137     qemu_mutex_unlock(&rs->bitmap_mutex);
1138
1139     memory_global_after_dirty_log_sync();
1140     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1141
1142     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1143
1144     /* more than 1 second = 1000 millisecons */
1145     if (end_time > rs->time_last_bitmap_sync + 1000) {
1146         migration_trigger_throttle(rs);
1147
1148         migration_update_rates(rs, end_time);
1149
1150         rs->target_page_count_prev = rs->target_page_count;
1151
1152         /* reset period counters */
1153         rs->time_last_bitmap_sync = end_time;
1154         rs->num_dirty_pages_period = 0;
1155         rs->bytes_xfer_prev = ram_counters.transferred;
1156     }
1157     if (migrate_use_events()) {
1158         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1159     }
1160 }
1161
1162 static void migration_bitmap_sync_precopy(RAMState *rs)
1163 {
1164     Error *local_err = NULL;
1165
1166     /*
1167      * The current notifier usage is just an optimization to migration, so we
1168      * don't stop the normal migration process in the error case.
1169      */
1170     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1171         error_report_err(local_err);
1172         local_err = NULL;
1173     }
1174
1175     migration_bitmap_sync(rs);
1176
1177     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1178         error_report_err(local_err);
1179     }
1180 }
1181
1182 static void ram_release_page(const char *rbname, uint64_t offset)
1183 {
1184     if (!migrate_release_ram() || !migration_in_postcopy()) {
1185         return;
1186     }
1187
1188     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1189 }
1190
1191 /**
1192  * save_zero_page_to_file: send the zero page to the file
1193  *
1194  * Returns the size of data written to the file, 0 means the page is not
1195  * a zero page
1196  *
1197  * @rs: current RAM state
1198  * @file: the file where the data is saved
1199  * @block: block that contains the page we want to send
1200  * @offset: offset inside the block for the page
1201  */
1202 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1203                                   RAMBlock *block, ram_addr_t offset)
1204 {
1205     uint8_t *p = block->host + offset;
1206     int len = 0;
1207
1208     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1209         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1210         qemu_put_byte(file, 0);
1211         len += 1;
1212         ram_release_page(block->idstr, offset);
1213     }
1214     return len;
1215 }
1216
1217 /**
1218  * save_zero_page: send the zero page to the stream
1219  *
1220  * Returns the number of pages written.
1221  *
1222  * @rs: current RAM state
1223  * @block: block that contains the page we want to send
1224  * @offset: offset inside the block for the page
1225  */
1226 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1227 {
1228     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1229
1230     if (len) {
1231         ram_counters.duplicate++;
1232         ram_transferred_add(len);
1233         return 1;
1234     }
1235     return -1;
1236 }
1237
1238 /*
1239  * @pages: the number of pages written by the control path,
1240  *        < 0 - error
1241  *        > 0 - number of pages written
1242  *
1243  * Return true if the pages has been saved, otherwise false is returned.
1244  */
1245 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1246                               int *pages)
1247 {
1248     uint64_t bytes_xmit = 0;
1249     int ret;
1250
1251     *pages = -1;
1252     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1253                                 &bytes_xmit);
1254     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1255         return false;
1256     }
1257
1258     if (bytes_xmit) {
1259         ram_transferred_add(bytes_xmit);
1260         *pages = 1;
1261     }
1262
1263     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1264         return true;
1265     }
1266
1267     if (bytes_xmit > 0) {
1268         ram_counters.normal++;
1269     } else if (bytes_xmit == 0) {
1270         ram_counters.duplicate++;
1271     }
1272
1273     return true;
1274 }
1275
1276 /*
1277  * directly send the page to the stream
1278  *
1279  * Returns the number of pages written.
1280  *
1281  * @rs: current RAM state
1282  * @block: block that contains the page we want to send
1283  * @offset: offset inside the block for the page
1284  * @buf: the page to be sent
1285  * @async: send to page asyncly
1286  */
1287 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1288                             uint8_t *buf, bool async)
1289 {
1290     ram_transferred_add(save_page_header(rs, rs->f, block,
1291                                          offset | RAM_SAVE_FLAG_PAGE));
1292     if (async) {
1293         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1294                               migrate_release_ram() &&
1295                               migration_in_postcopy());
1296     } else {
1297         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1298     }
1299     ram_transferred_add(TARGET_PAGE_SIZE);
1300     ram_counters.normal++;
1301     return 1;
1302 }
1303
1304 /**
1305  * ram_save_page: send the given page to the stream
1306  *
1307  * Returns the number of pages written.
1308  *          < 0 - error
1309  *          >=0 - Number of pages written - this might legally be 0
1310  *                if xbzrle noticed the page was the same.
1311  *
1312  * @rs: current RAM state
1313  * @block: block that contains the page we want to send
1314  * @offset: offset inside the block for the page
1315  */
1316 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1317 {
1318     int pages = -1;
1319     uint8_t *p;
1320     bool send_async = true;
1321     RAMBlock *block = pss->block;
1322     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1323     ram_addr_t current_addr = block->offset + offset;
1324
1325     p = block->host + offset;
1326     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1327
1328     XBZRLE_cache_lock();
1329     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1330         pages = save_xbzrle_page(rs, &p, current_addr, block,
1331                                  offset);
1332         if (!rs->last_stage) {
1333             /* Can't send this cached data async, since the cache page
1334              * might get updated before it gets to the wire
1335              */
1336             send_async = false;
1337         }
1338     }
1339
1340     /* XBZRLE overflow or normal page */
1341     if (pages == -1) {
1342         pages = save_normal_page(rs, block, offset, p, send_async);
1343     }
1344
1345     XBZRLE_cache_unlock();
1346
1347     return pages;
1348 }
1349
1350 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1351                                  ram_addr_t offset)
1352 {
1353     if (multifd_queue_page(rs->f, block, offset) < 0) {
1354         return -1;
1355     }
1356     ram_counters.normal++;
1357
1358     return 1;
1359 }
1360
1361 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1362                                  ram_addr_t offset, uint8_t *source_buf)
1363 {
1364     RAMState *rs = ram_state;
1365     uint8_t *p = block->host + offset;
1366     int ret;
1367
1368     if (save_zero_page_to_file(rs, f, block, offset)) {
1369         return true;
1370     }
1371
1372     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1373
1374     /*
1375      * copy it to a internal buffer to avoid it being modified by VM
1376      * so that we can catch up the error during compression and
1377      * decompression
1378      */
1379     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1380     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1381     if (ret < 0) {
1382         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1383         error_report("compressed data failed!");
1384     }
1385     return false;
1386 }
1387
1388 static void
1389 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1390 {
1391     ram_transferred_add(bytes_xmit);
1392
1393     if (param->zero_page) {
1394         ram_counters.duplicate++;
1395         return;
1396     }
1397
1398     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1399     compression_counters.compressed_size += bytes_xmit - 8;
1400     compression_counters.pages++;
1401 }
1402
1403 static bool save_page_use_compression(RAMState *rs);
1404
1405 static void flush_compressed_data(RAMState *rs)
1406 {
1407     int idx, len, thread_count;
1408
1409     if (!save_page_use_compression(rs)) {
1410         return;
1411     }
1412     thread_count = migrate_compress_threads();
1413
1414     qemu_mutex_lock(&comp_done_lock);
1415     for (idx = 0; idx < thread_count; idx++) {
1416         while (!comp_param[idx].done) {
1417             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1418         }
1419     }
1420     qemu_mutex_unlock(&comp_done_lock);
1421
1422     for (idx = 0; idx < thread_count; idx++) {
1423         qemu_mutex_lock(&comp_param[idx].mutex);
1424         if (!comp_param[idx].quit) {
1425             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1426             /*
1427              * it's safe to fetch zero_page without holding comp_done_lock
1428              * as there is no further request submitted to the thread,
1429              * i.e, the thread should be waiting for a request at this point.
1430              */
1431             update_compress_thread_counts(&comp_param[idx], len);
1432         }
1433         qemu_mutex_unlock(&comp_param[idx].mutex);
1434     }
1435 }
1436
1437 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1438                                        ram_addr_t offset)
1439 {
1440     param->block = block;
1441     param->offset = offset;
1442 }
1443
1444 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1445                                            ram_addr_t offset)
1446 {
1447     int idx, thread_count, bytes_xmit = -1, pages = -1;
1448     bool wait = migrate_compress_wait_thread();
1449
1450     thread_count = migrate_compress_threads();
1451     qemu_mutex_lock(&comp_done_lock);
1452 retry:
1453     for (idx = 0; idx < thread_count; idx++) {
1454         if (comp_param[idx].done) {
1455             comp_param[idx].done = false;
1456             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1457             qemu_mutex_lock(&comp_param[idx].mutex);
1458             set_compress_params(&comp_param[idx], block, offset);
1459             qemu_cond_signal(&comp_param[idx].cond);
1460             qemu_mutex_unlock(&comp_param[idx].mutex);
1461             pages = 1;
1462             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1463             break;
1464         }
1465     }
1466
1467     /*
1468      * wait for the free thread if the user specifies 'compress-wait-thread',
1469      * otherwise we will post the page out in the main thread as normal page.
1470      */
1471     if (pages < 0 && wait) {
1472         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1473         goto retry;
1474     }
1475     qemu_mutex_unlock(&comp_done_lock);
1476
1477     return pages;
1478 }
1479
1480 /**
1481  * find_dirty_block: find the next dirty page and update any state
1482  * associated with the search process.
1483  *
1484  * Returns true if a page is found
1485  *
1486  * @rs: current RAM state
1487  * @pss: data about the state of the current dirty page scan
1488  * @again: set to false if the search has scanned the whole of RAM
1489  */
1490 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1491 {
1492     /* This is not a postcopy requested page */
1493     pss->postcopy_requested = false;
1494
1495     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1496     if (pss->complete_round && pss->block == rs->last_seen_block &&
1497         pss->page >= rs->last_page) {
1498         /*
1499          * We've been once around the RAM and haven't found anything.
1500          * Give up.
1501          */
1502         *again = false;
1503         return false;
1504     }
1505     if (!offset_in_ramblock(pss->block,
1506                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1507         /* Didn't find anything in this RAM Block */
1508         pss->page = 0;
1509         pss->block = QLIST_NEXT_RCU(pss->block, next);
1510         if (!pss->block) {
1511             /*
1512              * If memory migration starts over, we will meet a dirtied page
1513              * which may still exists in compression threads's ring, so we
1514              * should flush the compressed data to make sure the new page
1515              * is not overwritten by the old one in the destination.
1516              *
1517              * Also If xbzrle is on, stop using the data compression at this
1518              * point. In theory, xbzrle can do better than compression.
1519              */
1520             flush_compressed_data(rs);
1521
1522             /* Hit the end of the list */
1523             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1524             /* Flag that we've looped */
1525             pss->complete_round = true;
1526             /* After the first round, enable XBZRLE. */
1527             if (migrate_use_xbzrle()) {
1528                 rs->xbzrle_enabled = true;
1529             }
1530         }
1531         /* Didn't find anything this time, but try again on the new block */
1532         *again = true;
1533         return false;
1534     } else {
1535         /* Can go around again, but... */
1536         *again = true;
1537         /* We've found something so probably don't need to */
1538         return true;
1539     }
1540 }
1541
1542 /**
1543  * unqueue_page: gets a page of the queue
1544  *
1545  * Helper for 'get_queued_page' - gets a page off the queue
1546  *
1547  * Returns the block of the page (or NULL if none available)
1548  *
1549  * @rs: current RAM state
1550  * @offset: used to return the offset within the RAMBlock
1551  */
1552 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1553 {
1554     struct RAMSrcPageRequest *entry;
1555     RAMBlock *block = NULL;
1556     size_t page_size;
1557
1558     if (!postcopy_has_request(rs)) {
1559         return NULL;
1560     }
1561
1562     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1563
1564     /*
1565      * This should _never_ change even after we take the lock, because no one
1566      * should be taking anything off the request list other than us.
1567      */
1568     assert(postcopy_has_request(rs));
1569
1570     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1571     block = entry->rb;
1572     *offset = entry->offset;
1573     page_size = qemu_ram_pagesize(block);
1574     /* Each page request should only be multiple page size of the ramblock */
1575     assert((entry->len % page_size) == 0);
1576
1577     if (entry->len > page_size) {
1578         entry->len -= page_size;
1579         entry->offset += page_size;
1580     } else {
1581         memory_region_unref(block->mr);
1582         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1583         g_free(entry);
1584         migration_consume_urgent_request();
1585     }
1586
1587     trace_unqueue_page(block->idstr, *offset,
1588                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1589
1590     return block;
1591 }
1592
1593 #if defined(__linux__)
1594 /**
1595  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1596  *   is found, return RAM block pointer and page offset
1597  *
1598  * Returns pointer to the RAMBlock containing faulting page,
1599  *   NULL if no write faults are pending
1600  *
1601  * @rs: current RAM state
1602  * @offset: page offset from the beginning of the block
1603  */
1604 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1605 {
1606     struct uffd_msg uffd_msg;
1607     void *page_address;
1608     RAMBlock *block;
1609     int res;
1610
1611     if (!migrate_background_snapshot()) {
1612         return NULL;
1613     }
1614
1615     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1616     if (res <= 0) {
1617         return NULL;
1618     }
1619
1620     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1621     block = qemu_ram_block_from_host(page_address, false, offset);
1622     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1623     return block;
1624 }
1625
1626 /**
1627  * ram_save_release_protection: release UFFD write protection after
1628  *   a range of pages has been saved
1629  *
1630  * @rs: current RAM state
1631  * @pss: page-search-status structure
1632  * @start_page: index of the first page in the range relative to pss->block
1633  *
1634  * Returns 0 on success, negative value in case of an error
1635 */
1636 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1637         unsigned long start_page)
1638 {
1639     int res = 0;
1640
1641     /* Check if page is from UFFD-managed region. */
1642     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1643         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1644         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1645
1646         /* Flush async buffers before un-protect. */
1647         qemu_fflush(rs->f);
1648         /* Un-protect memory range. */
1649         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1650                 false, false);
1651     }
1652
1653     return res;
1654 }
1655
1656 /* ram_write_tracking_available: check if kernel supports required UFFD features
1657  *
1658  * Returns true if supports, false otherwise
1659  */
1660 bool ram_write_tracking_available(void)
1661 {
1662     uint64_t uffd_features;
1663     int res;
1664
1665     res = uffd_query_features(&uffd_features);
1666     return (res == 0 &&
1667             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1668 }
1669
1670 /* ram_write_tracking_compatible: check if guest configuration is
1671  *   compatible with 'write-tracking'
1672  *
1673  * Returns true if compatible, false otherwise
1674  */
1675 bool ram_write_tracking_compatible(void)
1676 {
1677     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1678     int uffd_fd;
1679     RAMBlock *block;
1680     bool ret = false;
1681
1682     /* Open UFFD file descriptor */
1683     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1684     if (uffd_fd < 0) {
1685         return false;
1686     }
1687
1688     RCU_READ_LOCK_GUARD();
1689
1690     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1691         uint64_t uffd_ioctls;
1692
1693         /* Nothing to do with read-only and MMIO-writable regions */
1694         if (block->mr->readonly || block->mr->rom_device) {
1695             continue;
1696         }
1697         /* Try to register block memory via UFFD-IO to track writes */
1698         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1699                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1700             goto out;
1701         }
1702         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1703             goto out;
1704         }
1705     }
1706     ret = true;
1707
1708 out:
1709     uffd_close_fd(uffd_fd);
1710     return ret;
1711 }
1712
1713 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1714                                        ram_addr_t size)
1715 {
1716     /*
1717      * We read one byte of each page; this will preallocate page tables if
1718      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1719      * where no page was populated yet. This might require adaption when
1720      * supporting other mappings, like shmem.
1721      */
1722     for (; offset < size; offset += block->page_size) {
1723         char tmp = *((char *)block->host + offset);
1724
1725         /* Don't optimize the read out */
1726         asm volatile("" : "+r" (tmp));
1727     }
1728 }
1729
1730 static inline int populate_read_section(MemoryRegionSection *section,
1731                                         void *opaque)
1732 {
1733     const hwaddr size = int128_get64(section->size);
1734     hwaddr offset = section->offset_within_region;
1735     RAMBlock *block = section->mr->ram_block;
1736
1737     populate_read_range(block, offset, size);
1738     return 0;
1739 }
1740
1741 /*
1742  * ram_block_populate_read: preallocate page tables and populate pages in the
1743  *   RAM block by reading a byte of each page.
1744  *
1745  * Since it's solely used for userfault_fd WP feature, here we just
1746  *   hardcode page size to qemu_real_host_page_size.
1747  *
1748  * @block: RAM block to populate
1749  */
1750 static void ram_block_populate_read(RAMBlock *rb)
1751 {
1752     /*
1753      * Skip populating all pages that fall into a discarded range as managed by
1754      * a RamDiscardManager responsible for the mapped memory region of the
1755      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1756      * must not get populated automatically. We don't have to track
1757      * modifications via userfaultfd WP reliably, because these pages will
1758      * not be part of the migration stream either way -- see
1759      * ramblock_dirty_bitmap_exclude_discarded_pages().
1760      *
1761      * Note: The result is only stable while migrating (precopy/postcopy).
1762      */
1763     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1764         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1765         MemoryRegionSection section = {
1766             .mr = rb->mr,
1767             .offset_within_region = 0,
1768             .size = rb->mr->size,
1769         };
1770
1771         ram_discard_manager_replay_populated(rdm, &section,
1772                                              populate_read_section, NULL);
1773     } else {
1774         populate_read_range(rb, 0, rb->used_length);
1775     }
1776 }
1777
1778 /*
1779  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1780  */
1781 void ram_write_tracking_prepare(void)
1782 {
1783     RAMBlock *block;
1784
1785     RCU_READ_LOCK_GUARD();
1786
1787     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1788         /* Nothing to do with read-only and MMIO-writable regions */
1789         if (block->mr->readonly || block->mr->rom_device) {
1790             continue;
1791         }
1792
1793         /*
1794          * Populate pages of the RAM block before enabling userfault_fd
1795          * write protection.
1796          *
1797          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1798          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1799          * pages with pte_none() entries in page table.
1800          */
1801         ram_block_populate_read(block);
1802     }
1803 }
1804
1805 /*
1806  * ram_write_tracking_start: start UFFD-WP memory tracking
1807  *
1808  * Returns 0 for success or negative value in case of error
1809  */
1810 int ram_write_tracking_start(void)
1811 {
1812     int uffd_fd;
1813     RAMState *rs = ram_state;
1814     RAMBlock *block;
1815
1816     /* Open UFFD file descriptor */
1817     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1818     if (uffd_fd < 0) {
1819         return uffd_fd;
1820     }
1821     rs->uffdio_fd = uffd_fd;
1822
1823     RCU_READ_LOCK_GUARD();
1824
1825     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1826         /* Nothing to do with read-only and MMIO-writable regions */
1827         if (block->mr->readonly || block->mr->rom_device) {
1828             continue;
1829         }
1830
1831         /* Register block memory with UFFD to track writes */
1832         if (uffd_register_memory(rs->uffdio_fd, block->host,
1833                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1834             goto fail;
1835         }
1836         /* Apply UFFD write protection to the block memory range */
1837         if (uffd_change_protection(rs->uffdio_fd, block->host,
1838                 block->max_length, true, false)) {
1839             goto fail;
1840         }
1841         block->flags |= RAM_UF_WRITEPROTECT;
1842         memory_region_ref(block->mr);
1843
1844         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1845                 block->host, block->max_length);
1846     }
1847
1848     return 0;
1849
1850 fail:
1851     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1852
1853     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1854         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1855             continue;
1856         }
1857         /*
1858          * In case some memory block failed to be write-protected
1859          * remove protection and unregister all succeeded RAM blocks
1860          */
1861         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1862                 false, false);
1863         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1864         /* Cleanup flags and remove reference */
1865         block->flags &= ~RAM_UF_WRITEPROTECT;
1866         memory_region_unref(block->mr);
1867     }
1868
1869     uffd_close_fd(uffd_fd);
1870     rs->uffdio_fd = -1;
1871     return -1;
1872 }
1873
1874 /**
1875  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1876  */
1877 void ram_write_tracking_stop(void)
1878 {
1879     RAMState *rs = ram_state;
1880     RAMBlock *block;
1881
1882     RCU_READ_LOCK_GUARD();
1883
1884     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1885         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1886             continue;
1887         }
1888         /* Remove protection and unregister all affected RAM blocks */
1889         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1890                 false, false);
1891         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1892
1893         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1894                 block->host, block->max_length);
1895
1896         /* Cleanup flags and remove reference */
1897         block->flags &= ~RAM_UF_WRITEPROTECT;
1898         memory_region_unref(block->mr);
1899     }
1900
1901     /* Finally close UFFD file descriptor */
1902     uffd_close_fd(rs->uffdio_fd);
1903     rs->uffdio_fd = -1;
1904 }
1905
1906 #else
1907 /* No target OS support, stubs just fail or ignore */
1908
1909 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1910 {
1911     (void) rs;
1912     (void) offset;
1913
1914     return NULL;
1915 }
1916
1917 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1918         unsigned long start_page)
1919 {
1920     (void) rs;
1921     (void) pss;
1922     (void) start_page;
1923
1924     return 0;
1925 }
1926
/*
 * Stub for targets without OS support: write tracking is never available.
 */
bool ram_write_tracking_available(void)
{
    return false;
}
1931
/*
 * Stub for targets without OS support. Trips an assertion when called;
 * presumably callers consult ram_write_tracking_available() first -- TODO
 * confirm against the callers.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1937
/*
 * Stub for targets without OS support. Trips an assertion when called;
 * presumably callers consult ram_write_tracking_available() first -- TODO
 * confirm against the callers.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1943
/*
 * Stub for targets without OS support. Trips an assertion when called;
 * presumably callers consult ram_write_tracking_available() first -- TODO
 * confirm against the callers.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1948 #endif /* defined(__linux__) */
1949
1950 /**
1951  * get_queued_page: unqueue a page from the postcopy requests
1952  *
1953  * Skips pages that are already sent (!dirty)
1954  *
1955  * Returns true if a queued page is found
1956  *
1957  * @rs: current RAM state
1958  * @pss: data about the state of the current dirty page scan
1959  */
1960 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1961 {
1962     RAMBlock  *block;
1963     ram_addr_t offset;
1964
1965     block = unqueue_page(rs, &offset);
1966
1967     if (!block) {
1968         /*
1969          * Poll write faults too if background snapshot is enabled; that's
1970          * when we have vcpus got blocked by the write protected pages.
1971          */
1972         block = poll_fault_page(rs, &offset);
1973     }
1974
1975     if (block) {
1976         /*
1977          * We want the background search to continue from the queued page
1978          * since the guest is likely to want other pages near to the page
1979          * it just requested.
1980          */
1981         pss->block = block;
1982         pss->page = offset >> TARGET_PAGE_BITS;
1983
1984         /*
1985          * This unqueued page would break the "one round" check, even is
1986          * really rare.
1987          */
1988         pss->complete_round = false;
1989         pss->postcopy_requested = true;
1990     }
1991
1992     return !!block;
1993 }
1994
1995 /**
1996  * migration_page_queue_free: drop any remaining pages in the ram
1997  * request queue
1998  *
1999  * It should be empty at the end anyway, but in error cases there may
2000  * be some left.  in case that there is any page left, we drop it.
2001  *
2002  */
2003 static void migration_page_queue_free(RAMState *rs)
2004 {
2005     struct RAMSrcPageRequest *mspr, *next_mspr;
2006     /* This queue generally should be empty - but in the case of a failed
2007      * migration might have some droppings in.
2008      */
2009     RCU_READ_LOCK_GUARD();
2010     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2011         memory_region_unref(mspr->rb->mr);
2012         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2013         g_free(mspr);
2014     }
2015 }
2016
2017 /**
2018  * ram_save_queue_pages: queue the page for transmission
2019  *
2020  * A request from postcopy destination for example.
2021  *
2022  * Returns zero on success or negative on error
2023  *
2024  * @rbname: Name of the RAMBLock of the request. NULL means the
2025  *          same that last one.
2026  * @start: starting address from the start of the RAMBlock
2027  * @len: length (in bytes) to send
2028  */
2029 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2030 {
2031     RAMBlock *ramblock;
2032     RAMState *rs = ram_state;
2033
2034     ram_counters.postcopy_requests++;
2035     RCU_READ_LOCK_GUARD();
2036
2037     if (!rbname) {
2038         /* Reuse last RAMBlock */
2039         ramblock = rs->last_req_rb;
2040
2041         if (!ramblock) {
2042             /*
2043              * Shouldn't happen, we can't reuse the last RAMBlock if
2044              * it's the 1st request.
2045              */
2046             error_report("ram_save_queue_pages no previous block");
2047             return -1;
2048         }
2049     } else {
2050         ramblock = qemu_ram_block_by_name(rbname);
2051
2052         if (!ramblock) {
2053             /* We shouldn't be asked for a non-existent RAMBlock */
2054             error_report("ram_save_queue_pages no block '%s'", rbname);
2055             return -1;
2056         }
2057         rs->last_req_rb = ramblock;
2058     }
2059     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2060     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2061         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2062                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2063                      __func__, start, len, ramblock->used_length);
2064         return -1;
2065     }
2066
2067     struct RAMSrcPageRequest *new_entry =
2068         g_new0(struct RAMSrcPageRequest, 1);
2069     new_entry->rb = ramblock;
2070     new_entry->offset = start;
2071     new_entry->len = len;
2072
2073     memory_region_ref(ramblock->mr);
2074     qemu_mutex_lock(&rs->src_page_req_mutex);
2075     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2076     migration_make_urgent_request();
2077     qemu_mutex_unlock(&rs->src_page_req_mutex);
2078
2079     return 0;
2080 }
2081
2082 static bool save_page_use_compression(RAMState *rs)
2083 {
2084     if (!migrate_use_compression()) {
2085         return false;
2086     }
2087
2088     /*
2089      * If xbzrle is enabled (e.g., after first round of migration), stop
2090      * using the data compression. In theory, xbzrle can do better than
2091      * compression.
2092      */
2093     if (rs->xbzrle_enabled) {
2094         return false;
2095     }
2096
2097     return true;
2098 }
2099
2100 /*
2101  * try to compress the page before posting it out, return true if the page
2102  * has been properly handled by compression, otherwise needs other
2103  * paths to handle it
2104  */
2105 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2106 {
2107     if (!save_page_use_compression(rs)) {
2108         return false;
2109     }
2110
2111     /*
2112      * When starting the process of a new block, the first page of
2113      * the block should be sent out before other pages in the same
2114      * block, and all the pages in last block should have been sent
2115      * out, keeping this order is important, because the 'cont' flag
2116      * is used to avoid resending the block name.
2117      *
2118      * We post the fist page as normal page as compression will take
2119      * much CPU resource.
2120      */
2121     if (block != rs->last_sent_block) {
2122         flush_compressed_data(rs);
2123         return false;
2124     }
2125
2126     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2127         return true;
2128     }
2129
2130     compression_counters.busy++;
2131     return false;
2132 }
2133
2134 /**
2135  * ram_save_target_page: save one target page
2136  *
2137  * Returns the number of pages written
2138  *
2139  * @rs: current RAM state
2140  * @pss: data about the page we want to send
2141  */
2142 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2143 {
2144     RAMBlock *block = pss->block;
2145     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2146     int res;
2147
2148     if (control_save_page(rs, block, offset, &res)) {
2149         return res;
2150     }
2151
2152     if (save_compress_page(rs, block, offset)) {
2153         return 1;
2154     }
2155
2156     res = save_zero_page(rs, block, offset);
2157     if (res > 0) {
2158         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2159          * page would be stale
2160          */
2161         if (!save_page_use_compression(rs)) {
2162             XBZRLE_cache_lock();
2163             xbzrle_cache_zero_page(rs, block->offset + offset);
2164             XBZRLE_cache_unlock();
2165         }
2166         return res;
2167     }
2168
2169     /*
2170      * Do not use multifd for:
2171      * 1. Compression as the first page in the new block should be posted out
2172      *    before sending the compressed page
2173      * 2. In postcopy as one whole host page should be placed
2174      */
2175     if (!save_page_use_compression(rs) && migrate_use_multifd()
2176         && !migration_in_postcopy()) {
2177         return ram_save_multifd_page(rs, block, offset);
2178     }
2179
2180     return ram_save_page(rs, pss);
2181 }
2182
2183 /**
2184  * ram_save_host_page: save a whole host page
2185  *
2186  * Starting at *offset send pages up to the end of the current host
2187  * page. It's valid for the initial offset to point into the middle of
2188  * a host page in which case the remainder of the hostpage is sent.
2189  * Only dirty target pages are sent. Note that the host page size may
2190  * be a huge page for this block.
2191  * The saving stops at the boundary of the used_length of the block
2192  * if the RAMBlock isn't a multiple of the host page size.
2193  *
2194  * Returns the number of pages written or negative on error
2195  *
2196  * @rs: current RAM state
2197  * @pss: data about the page we want to send
2198  */
2199 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2200 {
2201     int tmppages, pages = 0;
2202     size_t pagesize_bits =
2203         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2204     unsigned long hostpage_boundary =
2205         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2206     unsigned long start_page = pss->page;
2207     int res;
2208
2209     if (ramblock_is_ignored(pss->block)) {
2210         error_report("block %s should not be migrated !", pss->block->idstr);
2211         return 0;
2212     }
2213
2214     do {
2215         /* Check the pages is dirty and if it is send it */
2216         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2217             tmppages = ram_save_target_page(rs, pss);
2218             if (tmppages < 0) {
2219                 return tmppages;
2220             }
2221
2222             pages += tmppages;
2223             /*
2224              * Allow rate limiting to happen in the middle of huge pages if
2225              * something is sent in the current iteration.
2226              */
2227             if (pagesize_bits > 1 && tmppages > 0) {
2228                 migration_rate_limit();
2229             }
2230         }
2231         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2232     } while ((pss->page < hostpage_boundary) &&
2233              offset_in_ramblock(pss->block,
2234                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2235     /* The offset we leave with is the min boundary of host page and block */
2236     pss->page = MIN(pss->page, hostpage_boundary);
2237
2238     res = ram_save_release_protection(rs, pss, start_page);
2239     return (res < 0 ? res : pages);
2240 }
2241
2242 /**
2243  * ram_find_and_save_block: finds a dirty page and sends it to f
2244  *
2245  * Called within an RCU critical section.
2246  *
2247  * Returns the number of pages written where zero means no dirty pages,
2248  * or negative on error
2249  *
2250  * @rs: current RAM state
2251  *
2252  * On systems where host-page-size > target-page-size it will send all the
2253  * pages in a host page that are dirty.
2254  */
2255 static int ram_find_and_save_block(RAMState *rs)
2256 {
2257     PageSearchStatus pss;
2258     int pages = 0;
2259     bool again, found;
2260
2261     /* No dirty page as there is zero RAM */
2262     if (!ram_bytes_total()) {
2263         return pages;
2264     }
2265
2266     pss.block = rs->last_seen_block;
2267     pss.page = rs->last_page;
2268     pss.complete_round = false;
2269
2270     if (!pss.block) {
2271         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2272     }
2273
2274     do {
2275         again = true;
2276         found = get_queued_page(rs, &pss);
2277
2278         if (!found) {
2279             /* priority queue empty, so just search for something dirty */
2280             found = find_dirty_block(rs, &pss, &again);
2281         }
2282
2283         if (found) {
2284             pages = ram_save_host_page(rs, &pss);
2285         }
2286     } while (!pages && again);
2287
2288     rs->last_seen_block = pss.block;
2289     rs->last_page = pss.page;
2290
2291     return pages;
2292 }
2293
2294 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2295 {
2296     uint64_t pages = size / TARGET_PAGE_SIZE;
2297
2298     if (zero) {
2299         ram_counters.duplicate += pages;
2300     } else {
2301         ram_counters.normal += pages;
2302         ram_transferred_add(size);
2303         qemu_update_position(f, size);
2304     }
2305 }
2306
2307 static uint64_t ram_bytes_total_common(bool count_ignored)
2308 {
2309     RAMBlock *block;
2310     uint64_t total = 0;
2311
2312     RCU_READ_LOCK_GUARD();
2313
2314     if (count_ignored) {
2315         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2316             total += block->used_length;
2317         }
2318     } else {
2319         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2320             total += block->used_length;
2321         }
2322     }
2323     return total;
2324 }
2325
/*
 * Total used length, in bytes, of all RAM blocks that are not ignored.
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2330
/*
 * Allocate XBZRLE.decoded_buf, sized to hold one target page.
 * xbzrle_load_cleanup() releases it again.
 */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2335
/*
 * Release XBZRLE.decoded_buf and reset the pointer, so that no stale
 * pointer to the freed buffer is left behind.
 */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2341
2342 static void ram_state_cleanup(RAMState **rsp)
2343 {
2344     if (*rsp) {
2345         migration_page_queue_free(*rsp);
2346         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2347         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2348         g_free(*rsp);
2349         *rsp = NULL;
2350     }
2351 }
2352
2353 static void xbzrle_cleanup(void)
2354 {
2355     XBZRLE_cache_lock();
2356     if (XBZRLE.cache) {
2357         cache_fini(XBZRLE.cache);
2358         g_free(XBZRLE.encoded_buf);
2359         g_free(XBZRLE.current_buf);
2360         g_free(XBZRLE.zero_target_page);
2361         XBZRLE.cache = NULL;
2362         XBZRLE.encoded_buf = NULL;
2363         XBZRLE.current_buf = NULL;
2364         XBZRLE.zero_target_page = NULL;
2365     }
2366     XBZRLE_cache_unlock();
2367 }
2368
2369 static void ram_save_cleanup(void *opaque)
2370 {
2371     RAMState **rsp = opaque;
2372     RAMBlock *block;
2373
2374     /* We don't use dirty log with background snapshots */
2375     if (!migrate_background_snapshot()) {
2376         /* caller have hold iothread lock or is in a bh, so there is
2377          * no writing race against the migration bitmap
2378          */
2379         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2380             /*
2381              * do not stop dirty log without starting it, since
2382              * memory_global_dirty_log_stop will assert that
2383              * memory_global_dirty_log_start/stop used in pairs
2384              */
2385             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2386         }
2387     }
2388
2389     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2390         g_free(block->clear_bmap);
2391         block->clear_bmap = NULL;
2392         g_free(block->bmap);
2393         block->bmap = NULL;
2394     }
2395
2396     xbzrle_cleanup();
2397     compress_threads_save_cleanup();
2398     ram_state_cleanup(rsp);
2399 }
2400
2401 static void ram_state_reset(RAMState *rs)
2402 {
2403     rs->last_seen_block = NULL;
2404     rs->last_sent_block = NULL;
2405     rs->last_page = 0;
2406     rs->last_version = ram_list.version;
2407     rs->xbzrle_enabled = false;
2408 }
2409
2410 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2411
2412 /* **** functions for postcopy ***** */
2413
2414 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2415 {
2416     struct RAMBlock *block;
2417
2418     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2419         unsigned long *bitmap = block->bmap;
2420         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2421         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2422
2423         while (run_start < range) {
2424             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2425             ram_discard_range(block->idstr,
2426                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2427                               ((ram_addr_t)(run_end - run_start))
2428                                 << TARGET_PAGE_BITS);
2429             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2430         }
2431     }
2432 }
2433
2434 /**
2435  * postcopy_send_discard_bm_ram: discard a RAMBlock
2436  *
2437  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2438  *
2439  * @ms: current migration state
2440  * @block: RAMBlock to discard
2441  */
2442 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2443 {
2444     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2445     unsigned long current;
2446     unsigned long *bitmap = block->bmap;
2447
2448     for (current = 0; current < end; ) {
2449         unsigned long one = find_next_bit(bitmap, end, current);
2450         unsigned long zero, discard_length;
2451
2452         if (one >= end) {
2453             break;
2454         }
2455
2456         zero = find_next_zero_bit(bitmap, end, one + 1);
2457
2458         if (zero >= end) {
2459             discard_length = end - one;
2460         } else {
2461             discard_length = zero - one;
2462         }
2463         postcopy_discard_send_range(ms, one, discard_length);
2464         current = one + discard_length;
2465     }
2466 }
2467
2468 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2469
2470 /**
2471  * postcopy_each_ram_send_discard: discard all RAMBlocks
2472  *
2473  * Utility for the outgoing postcopy code.
2474  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2475  *   passing it bitmap indexes and name.
2476  * (qemu_ram_foreach_block ends up passing unscaled lengths
2477  *  which would mean postcopy code would have to deal with target page)
2478  *
2479  * @ms: current migration state
2480  */
2481 static void postcopy_each_ram_send_discard(MigrationState *ms)
2482 {
2483     struct RAMBlock *block;
2484
2485     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2486         postcopy_discard_send_init(ms, block->idstr);
2487
2488         /*
2489          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2490          * host-page size chunks, mark any partially dirty host-page size
2491          * chunks as all dirty.  In this case the host-page is the host-page
2492          * for the particular RAMBlock, i.e. it might be a huge page.
2493          */
2494         postcopy_chunk_hostpages_pass(ms, block);
2495
2496         /*
2497          * Postcopy sends chunks of bitmap over the wire, but it
2498          * just needs indexes at this point, avoids it having
2499          * target page specific code.
2500          */
2501         postcopy_send_discard_bm_ram(ms, block);
2502         postcopy_discard_send_finish(ms);
2503     }
2504 }
2505
2506 /**
2507  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2508  *
2509  * Helper for postcopy_chunk_hostpages; it's called twice to
2510  * canonicalize the two bitmaps, that are similar, but one is
2511  * inverted.
2512  *
2513  * Postcopy requires that all target pages in a hostpage are dirty or
2514  * clean, not a mix.  This function canonicalizes the bitmaps.
2515  *
2516  * @ms: current migration state
2517  * @block: block that contains the page we want to canonicalize
2518  */
2519 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2520 {
2521     RAMState *rs = ram_state;
2522     unsigned long *bitmap = block->bmap;
2523     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2524     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2525     unsigned long run_start;
2526
2527     if (block->page_size == TARGET_PAGE_SIZE) {
2528         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2529         return;
2530     }
2531
2532     /* Find a dirty page */
2533     run_start = find_next_bit(bitmap, pages, 0);
2534
2535     while (run_start < pages) {
2536
2537         /*
2538          * If the start of this run of pages is in the middle of a host
2539          * page, then we need to fixup this host page.
2540          */
2541         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2542             /* Find the end of this run */
2543             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2544             /*
2545              * If the end isn't at the start of a host page, then the
2546              * run doesn't finish at the end of a host page
2547              * and we need to discard.
2548              */
2549         }
2550
2551         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2552             unsigned long page;
2553             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2554                                                              host_ratio);
2555             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2556
2557             /* Clean up the bitmap */
2558             for (page = fixup_start_addr;
2559                  page < fixup_start_addr + host_ratio; page++) {
2560                 /*
2561                  * Remark them as dirty, updating the count for any pages
2562                  * that weren't previously dirty.
2563                  */
2564                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2565             }
2566         }
2567
2568         /* Find the next dirty page for the next iteration */
2569         run_start = find_next_bit(bitmap, pages, run_start);
2570     }
2571 }
2572
2573 /**
2574  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2575  *
2576  * Transmit the set of pages to be discarded after precopy to the target
2577  * these are pages that:
2578  *     a) Have been previously transmitted but are now dirty again
2579  *     b) Pages that have never been transmitted, this ensures that
2580  *        any pages on the destination that have been mapped by background
2581  *        tasks get discarded (transparent huge pages is the specific concern)
2582  * Hopefully this is pretty sparse
2583  *
2584  * @ms: current migration state
2585  */
2586 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2587 {
2588     RAMState *rs = ram_state;
2589
2590     RCU_READ_LOCK_GUARD();
2591
2592     /* This should be our last sync, the src is now paused */
2593     migration_bitmap_sync(rs);
2594
2595     /* Easiest way to make sure we don't resume in the middle of a host-page */
2596     rs->last_seen_block = NULL;
2597     rs->last_sent_block = NULL;
2598     rs->last_page = 0;
2599
2600     postcopy_each_ram_send_discard(ms);
2601
2602     trace_ram_postcopy_send_discard_bitmap();
2603 }
2604
2605 /**
2606  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2607  *
2608  * Returns zero on success
2609  *
2610  * @rbname: name of the RAMBlock of the request. NULL means the
2611  *          same that last one.
2612  * @start: RAMBlock starting page
2613  * @length: RAMBlock size
2614  */
2615 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2616 {
2617     trace_ram_discard_range(rbname, start, length);
2618
2619     RCU_READ_LOCK_GUARD();
2620     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2621
2622     if (!rb) {
2623         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2624         return -1;
2625     }
2626
2627     /*
2628      * On source VM, we don't need to update the received bitmap since
2629      * we don't even have one.
2630      */
2631     if (rb->receivedmap) {
2632         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2633                      length >> qemu_target_page_bits());
2634     }
2635
2636     return ram_block_discard_range(rb, start, length);
2637 }
2638
2639 /*
2640  * For every allocation, we will try not to crash the VM if the
2641  * allocation failed.
2642  */
2643 static int xbzrle_init(void)
2644 {
2645     Error *local_err = NULL;
2646
2647     if (!migrate_use_xbzrle()) {
2648         return 0;
2649     }
2650
2651     XBZRLE_cache_lock();
2652
2653     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2654     if (!XBZRLE.zero_target_page) {
2655         error_report("%s: Error allocating zero page", __func__);
2656         goto err_out;
2657     }
2658
2659     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2660                               TARGET_PAGE_SIZE, &local_err);
2661     if (!XBZRLE.cache) {
2662         error_report_err(local_err);
2663         goto free_zero_page;
2664     }
2665
2666     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2667     if (!XBZRLE.encoded_buf) {
2668         error_report("%s: Error allocating encoded_buf", __func__);
2669         goto free_cache;
2670     }
2671
2672     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2673     if (!XBZRLE.current_buf) {
2674         error_report("%s: Error allocating current_buf", __func__);
2675         goto free_encoded_buf;
2676     }
2677
2678     /* We are all good */
2679     XBZRLE_cache_unlock();
2680     return 0;
2681
2682 free_encoded_buf:
2683     g_free(XBZRLE.encoded_buf);
2684     XBZRLE.encoded_buf = NULL;
2685 free_cache:
2686     cache_fini(XBZRLE.cache);
2687     XBZRLE.cache = NULL;
2688 free_zero_page:
2689     g_free(XBZRLE.zero_target_page);
2690     XBZRLE.zero_target_page = NULL;
2691 err_out:
2692     XBZRLE_cache_unlock();
2693     return -ENOMEM;
2694 }
2695
2696 static int ram_state_init(RAMState **rsp)
2697 {
2698     *rsp = g_try_new0(RAMState, 1);
2699
2700     if (!*rsp) {
2701         error_report("%s: Init ramstate fail", __func__);
2702         return -1;
2703     }
2704
2705     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2706     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2707     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2708
2709     /*
2710      * Count the total number of pages used by ram blocks not including any
2711      * gaps due to alignment or unplugs.
2712      * This must match with the initial values of dirty bitmap.
2713      */
2714     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2715     ram_state_reset(*rsp);
2716
2717     return 0;
2718 }
2719
2720 static void ram_list_init_bitmaps(void)
2721 {
2722     MigrationState *ms = migrate_get_current();
2723     RAMBlock *block;
2724     unsigned long pages;
2725     uint8_t shift;
2726
2727     /* Skip setting bitmap if there is no RAM */
2728     if (ram_bytes_total()) {
2729         shift = ms->clear_bitmap_shift;
2730         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2731             error_report("clear_bitmap_shift (%u) too big, using "
2732                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2733             shift = CLEAR_BITMAP_SHIFT_MAX;
2734         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2735             error_report("clear_bitmap_shift (%u) too small, using "
2736                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2737             shift = CLEAR_BITMAP_SHIFT_MIN;
2738         }
2739
2740         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2741             pages = block->max_length >> TARGET_PAGE_BITS;
2742             /*
2743              * The initial dirty bitmap for migration must be set with all
2744              * ones to make sure we'll migrate every guest RAM page to
2745              * destination.
2746              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2747              * new migration after a failed migration, ram_list.
2748              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2749              * guest memory.
2750              */
2751             block->bmap = bitmap_new(pages);
2752             bitmap_set(block->bmap, 0, pages);
2753             block->clear_bmap_shift = shift;
2754             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2755         }
2756     }
2757 }
2758
2759 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2760 {
2761     unsigned long pages;
2762     RAMBlock *rb;
2763
2764     RCU_READ_LOCK_GUARD();
2765
2766     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2767             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2768             rs->migration_dirty_pages -= pages;
2769     }
2770 }
2771
/*
 * Create the migration bitmaps and, unless background snapshots are in
 * use, start dirty logging and perform a first precopy bitmap sync.
 *
 * The iothread lock is taken first, then the ramlist lock; both are
 * released in the opposite order before discarded pages are removed from
 * the freshly initialised bitmaps.
 *
 * @rs: RAM state passed on to the bitmap sync and discard fixup
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
            migration_bitmap_sync_precopy(rs);
        }
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    /*
     * After an eventual first bitmap sync, fixup the initial bitmap
     * containing all 1s to exclude any discarded pages from migration.
     */
    migration_bitmap_clear_discarded_pages(rs);
}
2795
2796 static int ram_init_all(RAMState **rsp)
2797 {
2798     if (ram_state_init(rsp)) {
2799         return -1;
2800     }
2801
2802     if (xbzrle_init()) {
2803         ram_state_cleanup(rsp);
2804         return -1;
2805     }
2806
2807     ram_init_bitmaps(*rsp);
2808
2809     return 0;
2810 }
2811
2812 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2813 {
2814     RAMBlock *block;
2815     uint64_t pages = 0;
2816
2817     /*
2818      * Postcopy is not using xbzrle/compression, so no need for that.
2819      * Also, since source are already halted, we don't need to care
2820      * about dirty page logging as well.
2821      */
2822
2823     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2824         pages += bitmap_count_one(block->bmap,
2825                                   block->used_length >> TARGET_PAGE_BITS);
2826     }
2827
2828     /* This may not be aligned with current bitmaps. Recalculate. */
2829     rs->migration_dirty_pages = pages;
2830
2831     ram_state_reset(rs);
2832
2833     /* Update RAMState cache of output QEMUFile */
2834     rs->f = out;
2835
2836     trace_ram_state_resume_prepare(pages);
2837 }
2838
2839 /*
2840  * This function clears bits of the free pages reported by the caller from the
2841  * migration dirty bitmap. @addr is the host address corresponding to the
2842  * start of the continuous guest free pages, and @len is the total bytes of
2843  * those pages.
2844  */
2845 void qemu_guest_free_page_hint(void *addr, size_t len)
2846 {
2847     RAMBlock *block;
2848     ram_addr_t offset;
2849     size_t used_len, start, npages;
2850     MigrationState *s = migrate_get_current();
2851
2852     /* This function is currently expected to be used during live migration */
2853     if (!migration_is_setup_or_active(s->state)) {
2854         return;
2855     }
2856
2857     for (; len > 0; len -= used_len, addr += used_len) {
2858         block = qemu_ram_block_from_host(addr, false, &offset);
2859         if (unlikely(!block || offset >= block->used_length)) {
2860             /*
2861              * The implementation might not support RAMBlock resize during
2862              * live migration, but it could happen in theory with future
2863              * updates. So we add a check here to capture that case.
2864              */
2865             error_report_once("%s unexpected error", __func__);
2866             return;
2867         }
2868
2869         if (len <= block->used_length - offset) {
2870             used_len = len;
2871         } else {
2872             used_len = block->used_length - offset;
2873         }
2874
2875         start = offset >> TARGET_PAGE_BITS;
2876         npages = used_len >> TARGET_PAGE_BITS;
2877
2878         qemu_mutex_lock(&ram_state->bitmap_mutex);
2879         /*
2880          * The skipped free pages are equavalent to be sent from clear_bmap's
2881          * perspective, so clear the bits from the memory region bitmap which
2882          * are initially set. Otherwise those skipped pages will be sent in
2883          * the next round after syncing from the memory region bitmap.
2884          */
2885         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2886         ram_state->migration_dirty_pages -=
2887                       bitmap_count_one_with_offset(block->bmap, start, npages);
2888         bitmap_clear(block->bmap, start, npages);
2889         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2890     }
2891 }
2892
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
2899
2900 /**
2901  * ram_save_setup: Setup RAM for migration
2902  *
2903  * Returns zero to indicate success and negative for error
2904  *
2905  * @f: QEMUFile where to send the data
2906  * @opaque: RAMState pointer
2907  */
2908 static int ram_save_setup(QEMUFile *f, void *opaque)
2909 {
2910     RAMState **rsp = opaque;
2911     RAMBlock *block;
2912     int ret;
2913
2914     if (compress_threads_save_setup()) {
2915         return -1;
2916     }
2917
2918     /* migration has already setup the bitmap, reuse it. */
2919     if (!migration_in_colo_state()) {
2920         if (ram_init_all(rsp) != 0) {
2921             compress_threads_save_cleanup();
2922             return -1;
2923         }
2924     }
2925     (*rsp)->f = f;
2926
2927     WITH_RCU_READ_LOCK_GUARD() {
2928         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2929
2930         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2931             qemu_put_byte(f, strlen(block->idstr));
2932             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2933             qemu_put_be64(f, block->used_length);
2934             if (migrate_postcopy_ram() && block->page_size !=
2935                                           qemu_host_page_size) {
2936                 qemu_put_be64(f, block->page_size);
2937             }
2938             if (migrate_ignore_shared()) {
2939                 qemu_put_be64(f, block->mr->addr);
2940             }
2941         }
2942     }
2943
2944     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2945     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2946
2947     ret =  multifd_send_sync_main(f);
2948     if (ret < 0) {
2949         return ret;
2950     }
2951
2952     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2953     qemu_fflush(f);
2954
2955     return 0;
2956 }
2957
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the rate limit is hit (and no postcopy request
 * is pending), an error occurs, no pages are left, or MAX_WAIT
 * milliseconds have elapsed.
 *
 * Returns a negative value on error.  Otherwise returns 1 if
 * ram_find_and_save_block() reported that there were no more pages to
 * send during this call, and 0 if not.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least released it in a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM block list changed since the last pass: start over. */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep going while the rate limit check returns 0, or while a
         * postcopy page request is pending.  ret holds the result of the
         * most recent rate limit check when the loop ends.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to sent */
            if (pages == 0) {
                done = 1;
                break;
            }

            /* Record the failure on the file; it is picked up after "out". */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time converted from nanoseconds to milliseconds. */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        ret = multifd_send_sync_main(rs->f);
        if (ret < 0) {
            return ret;
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 64-bit EOS marker written just above. */
        ram_transferred_add(8);

        /* Surfaces any error recorded on f, including one set in the loop. */
        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3077
3078 /**
3079  * ram_save_complete: function called to send the remaining amount of ram
3080  *
3081  * Returns zero to indicate success or negative on error
3082  *
3083  * Called with iothread lock
3084  *
3085  * @f: QEMUFile where to send the data
3086  * @opaque: RAMState pointer
3087  */
3088 static int ram_save_complete(QEMUFile *f, void *opaque)
3089 {
3090     RAMState **temp = opaque;
3091     RAMState *rs = *temp;
3092     int ret = 0;
3093
3094     rs->last_stage = !migration_in_colo_state();
3095
3096     WITH_RCU_READ_LOCK_GUARD() {
3097         if (!migration_in_postcopy()) {
3098             migration_bitmap_sync_precopy(rs);
3099         }
3100
3101         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3102
3103         /* try transferring iterative blocks of memory */
3104
3105         /* flush all remaining blocks regardless of rate limiting */
3106         while (true) {
3107             int pages;
3108
3109             pages = ram_find_and_save_block(rs);
3110             /* no more blocks to sent */
3111             if (pages == 0) {
3112                 break;
3113             }
3114             if (pages < 0) {
3115                 ret = pages;
3116                 break;
3117             }
3118         }
3119
3120         flush_compressed_data(rs);
3121         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3122     }
3123
3124     if (ret < 0) {
3125         return ret;
3126     }
3127
3128     ret = multifd_send_sync_main(rs->f);
3129     if (ret < 0) {
3130         return ret;
3131     }
3132
3133     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3134     qemu_fflush(f);
3135
3136     return 0;
3137 }
3138
3139 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3140                              uint64_t *res_precopy_only,
3141                              uint64_t *res_compatible,
3142                              uint64_t *res_postcopy_only)
3143 {
3144     RAMState **temp = opaque;
3145     RAMState *rs = *temp;
3146     uint64_t remaining_size;
3147
3148     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3149
3150     if (!migration_in_postcopy() &&
3151         remaining_size < max_size) {
3152         qemu_mutex_lock_iothread();
3153         WITH_RCU_READ_LOCK_GUARD() {
3154             migration_bitmap_sync_precopy(rs);
3155         }
3156         qemu_mutex_unlock_iothread();
3157         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3158     }
3159
3160     if (migrate_postcopy_ram()) {
3161         /* We can do postcopy, and all the data is postcopiable */
3162         *res_compatible += remaining_size;
3163     } else {
3164         *res_precopy_only += remaining_size;
3165     }
3166 }
3167
3168 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3169 {
3170     unsigned int xh_len;
3171     int xh_flags;
3172     uint8_t *loaded_data;
3173
3174     /* extract RLE header */
3175     xh_flags = qemu_get_byte(f);
3176     xh_len = qemu_get_be16(f);
3177
3178     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3179         error_report("Failed to load XBZRLE page - wrong compression!");
3180         return -1;
3181     }
3182
3183     if (xh_len > TARGET_PAGE_SIZE) {
3184         error_report("Failed to load XBZRLE page - len overflow!");
3185         return -1;
3186     }
3187     loaded_data = XBZRLE.decoded_buf;
3188     /* load data and decode */
3189     /* it can change loaded_data to point to an internal buffer */
3190     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3191
3192     /* decode RLE */
3193     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3194                              TARGET_PAGE_SIZE) == -1) {
3195         error_report("Failed to load XBZRLE page - decode error!");
3196         return -1;
3197     }
3198
3199     return 0;
3200 }
3201
3202 /**
3203  * ram_block_from_stream: read a RAMBlock id from the migration stream
3204  *
3205  * Must be called from within a rcu critical section.
3206  *
3207  * Returns a pointer from within the RCU-protected ram_list.
3208  *
3209  * @mis: the migration incoming state pointer
3210  * @f: QEMUFile where to read the data from
3211  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3212  */
3213 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3214                                               QEMUFile *f, int flags)
3215 {
3216     RAMBlock *block = mis->last_recv_block;
3217     char id[256];
3218     uint8_t len;
3219
3220     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3221         if (!block) {
3222             error_report("Ack, bad migration stream!");
3223             return NULL;
3224         }
3225         return block;
3226     }
3227
3228     len = qemu_get_byte(f);
3229     qemu_get_buffer(f, (uint8_t *)id, len);
3230     id[len] = 0;
3231
3232     block = qemu_ram_block_by_name(id);
3233     if (!block) {
3234         error_report("Can't find block %s", id);
3235         return NULL;
3236     }
3237
3238     if (ramblock_is_ignored(block)) {
3239         error_report("block %s should not be migrated !", id);
3240         return NULL;
3241     }
3242
3243     mis->last_recv_block = block;
3244
3245     return block;
3246 }
3247
3248 static inline void *host_from_ram_block_offset(RAMBlock *block,
3249                                                ram_addr_t offset)
3250 {
3251     if (!offset_in_ramblock(block, offset)) {
3252         return NULL;
3253     }
3254
3255     return block->host + offset;
3256 }
3257
3258 static void *host_page_from_ram_block_offset(RAMBlock *block,
3259                                              ram_addr_t offset)
3260 {
3261     /* Note: Explicitly no check against offset_in_ramblock(). */
3262     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3263                                    block->page_size);
3264 }
3265
3266 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3267                                                          ram_addr_t offset)
3268 {
3269     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3270 }
3271
3272 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3273                              ram_addr_t offset, bool record_bitmap)
3274 {
3275     if (!offset_in_ramblock(block, offset)) {
3276         return NULL;
3277     }
3278     if (!block->colo_cache) {
3279         error_report("%s: colo_cache is NULL in block :%s",
3280                      __func__, block->idstr);
3281         return NULL;
3282     }
3283
3284     /*
3285     * During colo checkpoint, we need bitmap of these migrated pages.
3286     * It help us to decide which pages in ram cache should be flushed
3287     * into VM's RAM later.
3288     */
3289     if (record_bitmap &&
3290         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3291         ram_state->migration_dirty_pages++;
3292     }
3293     return block->colo_cache + offset;
3294 }
3295
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The fill is skipped only when @ch is zero and the destination already
 * reads as all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3312
3313 /* return the size after decompression, or negative value on error */
3314 static int
3315 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3316                      const uint8_t *source, size_t source_len)
3317 {
3318     int err;
3319
3320     err = inflateReset(stream);
3321     if (err != Z_OK) {
3322         return -1;
3323     }
3324
3325     stream->avail_in = source_len;
3326     stream->next_in = (uint8_t *)source;
3327     stream->avail_out = dest_len;
3328     stream->next_out = dest;
3329
3330     err = inflate(stream, Z_NO_FLUSH);
3331     if (err != Z_STREAM_END) {
3332         return -1;
3333     }
3334
3335     return stream->total_out;
3336 }
3337
3338 static void *do_data_decompress(void *opaque)
3339 {
3340     DecompressParam *param = opaque;
3341     unsigned long pagesize;
3342     uint8_t *des;
3343     int len, ret;
3344
3345     qemu_mutex_lock(&param->mutex);
3346     while (!param->quit) {
3347         if (param->des) {
3348             des = param->des;
3349             len = param->len;
3350             param->des = 0;
3351             qemu_mutex_unlock(&param->mutex);
3352
3353             pagesize = TARGET_PAGE_SIZE;
3354
3355             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3356                                        param->compbuf, len);
3357             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3358                 error_report("decompress data failed");
3359                 qemu_file_set_error(decomp_file, ret);
3360             }
3361
3362             qemu_mutex_lock(&decomp_done_lock);
3363             param->done = true;
3364             qemu_cond_signal(&decomp_done_cond);
3365             qemu_mutex_unlock(&decomp_done_lock);
3366
3367             qemu_mutex_lock(&param->mutex);
3368         } else {
3369             qemu_cond_wait(&param->cond, &param->mutex);
3370         }
3371     }
3372     qemu_mutex_unlock(&param->mutex);
3373
3374     return NULL;
3375 }
3376
3377 static int wait_for_decompress_done(void)
3378 {
3379     int idx, thread_count;
3380
3381     if (!migrate_use_compression()) {
3382         return 0;
3383     }
3384
3385     thread_count = migrate_decompress_threads();
3386     qemu_mutex_lock(&decomp_done_lock);
3387     for (idx = 0; idx < thread_count; idx++) {
3388         while (!decomp_param[idx].done) {
3389             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3390         }
3391     }
3392     qemu_mutex_unlock(&decomp_done_lock);
3393     return qemu_file_get_error(decomp_file);
3394 }
3395
3396 static void compress_threads_load_cleanup(void)
3397 {
3398     int i, thread_count;
3399
3400     if (!migrate_use_compression()) {
3401         return;
3402     }
3403     thread_count = migrate_decompress_threads();
3404     for (i = 0; i < thread_count; i++) {
3405         /*
3406          * we use it as a indicator which shows if the thread is
3407          * properly init'd or not
3408          */
3409         if (!decomp_param[i].compbuf) {
3410             break;
3411         }
3412
3413         qemu_mutex_lock(&decomp_param[i].mutex);
3414         decomp_param[i].quit = true;
3415         qemu_cond_signal(&decomp_param[i].cond);
3416         qemu_mutex_unlock(&decomp_param[i].mutex);
3417     }
3418     for (i = 0; i < thread_count; i++) {
3419         if (!decomp_param[i].compbuf) {
3420             break;
3421         }
3422
3423         qemu_thread_join(decompress_threads + i);
3424         qemu_mutex_destroy(&decomp_param[i].mutex);
3425         qemu_cond_destroy(&decomp_param[i].cond);
3426         inflateEnd(&decomp_param[i].stream);
3427         g_free(decomp_param[i].compbuf);
3428         decomp_param[i].compbuf = NULL;
3429     }
3430     g_free(decompress_threads);
3431     g_free(decomp_param);
3432     decompress_threads = NULL;
3433     decomp_param = NULL;
3434     decomp_file = NULL;
3435 }
3436
3437 static int compress_threads_load_setup(QEMUFile *f)
3438 {
3439     int i, thread_count;
3440
3441     if (!migrate_use_compression()) {
3442         return 0;
3443     }
3444
3445     thread_count = migrate_decompress_threads();
3446     decompress_threads = g_new0(QemuThread, thread_count);
3447     decomp_param = g_new0(DecompressParam, thread_count);
3448     qemu_mutex_init(&decomp_done_lock);
3449     qemu_cond_init(&decomp_done_cond);
3450     decomp_file = f;
3451     for (i = 0; i < thread_count; i++) {
3452         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3453             goto exit;
3454         }
3455
3456         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3457         qemu_mutex_init(&decomp_param[i].mutex);
3458         qemu_cond_init(&decomp_param[i].cond);
3459         decomp_param[i].done = true;
3460         decomp_param[i].quit = false;
3461         qemu_thread_create(decompress_threads + i, "decompress",
3462                            do_data_decompress, decomp_param + i,
3463                            QEMU_THREAD_JOINABLE);
3464     }
3465     return 0;
3466 exit:
3467     compress_threads_load_cleanup();
3468     return -1;
3469 }
3470
3471 static void decompress_data_with_multi_threads(QEMUFile *f,
3472                                                void *host, int len)
3473 {
3474     int idx, thread_count;
3475
3476     thread_count = migrate_decompress_threads();
3477     QEMU_LOCK_GUARD(&decomp_done_lock);
3478     while (true) {
3479         for (idx = 0; idx < thread_count; idx++) {
3480             if (decomp_param[idx].done) {
3481                 decomp_param[idx].done = false;
3482                 qemu_mutex_lock(&decomp_param[idx].mutex);
3483                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3484                 decomp_param[idx].des = host;
3485                 decomp_param[idx].len = len;
3486                 qemu_cond_signal(&decomp_param[idx].cond);
3487                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3488                 break;
3489             }
3490         }
3491         if (idx < thread_count) {
3492             break;
3493         } else {
3494             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3495         }
3496     }
3497 }
3498
/*
 * Initialise the global ram_state for COLO by calling ram_state_init().
 *
 * NOTE(review): the int result of ram_state_init() is discarded here, so
 * a failed initialisation is not propagated to the caller - confirm that
 * this is intended.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}
3503
3504 /*
3505  * colo cache: this is for secondary VM, we cache the whole
3506  * memory of the secondary VM, it is need to hold the global lock
3507  * to call this helper.
3508  */
3509 int colo_init_ram_cache(void)
3510 {
3511     RAMBlock *block;
3512
3513     WITH_RCU_READ_LOCK_GUARD() {
3514         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3515             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3516                                                     NULL, false, false);
3517             if (!block->colo_cache) {
3518                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3519                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3520                              block->used_length);
3521                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3522                     if (block->colo_cache) {
3523                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3524                         block->colo_cache = NULL;
3525                     }
3526                 }
3527                 return -errno;
3528             }
3529             if (!machine_dump_guest_core(current_machine)) {
3530                 qemu_madvise(block->colo_cache, block->used_length,
3531                              QEMU_MADV_DONTDUMP);
3532             }
3533         }
3534     }
3535
3536     /*
3537     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3538     * with to decide which page in cache should be flushed into SVM's RAM. Here
3539     * we use the same name 'ram_bitmap' as for migration.
3540     */
3541     if (ram_bytes_total()) {
3542         RAMBlock *block;
3543
3544         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3545             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3546             block->bmap = bitmap_new(pages);
3547         }
3548     }
3549
3550     colo_init_ram_state();
3551     return 0;
3552 }
3553
3554 /* TODO: duplicated with ram_init_bitmaps */
3555 void colo_incoming_start_dirty_log(void)
3556 {
3557     RAMBlock *block = NULL;
3558     /* For memory_global_dirty_log_start below. */
3559     qemu_mutex_lock_iothread();
3560     qemu_mutex_lock_ramlist();
3561
3562     memory_global_dirty_log_sync();
3563     WITH_RCU_READ_LOCK_GUARD() {
3564         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3565             ramblock_sync_dirty_bitmap(ram_state, block);
3566             /* Discard this dirty bitmap record */
3567             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3568         }
3569         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3570     }
3571     ram_state->migration_dirty_pages = 0;
3572     qemu_mutex_unlock_ramlist();
3573     qemu_mutex_unlock_iothread();
3574 }
3575
3576 /* It is need to hold the global lock to call this helper */
3577 void colo_release_ram_cache(void)
3578 {
3579     RAMBlock *block;
3580
3581     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3582     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3583         g_free(block->bmap);
3584         block->bmap = NULL;
3585     }
3586
3587     WITH_RCU_READ_LOCK_GUARD() {
3588         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3589             if (block->colo_cache) {
3590                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3591                 block->colo_cache = NULL;
3592             }
3593         }
3594     }
3595     ram_state_cleanup(&ram_state);
3596 }
3597
3598 /**
3599  * ram_load_setup: Setup RAM for migration incoming side
3600  *
3601  * Returns zero to indicate success and negative for error
3602  *
3603  * @f: QEMUFile where to receive the data
3604  * @opaque: RAMState pointer
3605  */
3606 static int ram_load_setup(QEMUFile *f, void *opaque)
3607 {
3608     if (compress_threads_load_setup(f)) {
3609         return -1;
3610     }
3611
3612     xbzrle_load_setup();
3613     ramblock_recv_map_init();
3614
3615     return 0;
3616 }
3617
3618 static int ram_load_cleanup(void *opaque)
3619 {
3620     RAMBlock *rb;
3621
3622     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3623         qemu_ram_block_writeback(rb);
3624     }
3625
3626     xbzrle_load_cleanup();
3627     compress_threads_load_cleanup();
3628
3629     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3630         g_free(rb->receivedmap);
3631         rb->receivedmap = NULL;
3632     }
3633
3634     return 0;
3635 }
3636
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. This is a thin wrapper: the similarly named
 * postcopy_ram_incoming_init() in postcopy-ram does the work, and its
 * return value is passed through unchanged.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
3652
3653 /**
3654  * ram_load_postcopy: load a page in postcopy case
3655  *
3656  * Returns 0 for success or -errno in case of error
3657  *
3658  * Called in postcopy mode by ram_load().
3659  * rcu_read_lock is taken prior to this being called.
3660  *
3661  * @f: QEMUFile where to send the data
3662  */
3663 int ram_load_postcopy(QEMUFile *f)
3664 {
3665     int flags = 0, ret = 0;
3666     bool place_needed = false;
3667     bool matches_target_page_size = false;
3668     MigrationIncomingState *mis = migration_incoming_get_current();
3669     /* Currently we only use channel 0.  TODO: use all the channels */
3670     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0];
3671
3672     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3673         ram_addr_t addr;
3674         void *page_buffer = NULL;
3675         void *place_source = NULL;
3676         RAMBlock *block = NULL;
3677         uint8_t ch;
3678         int len;
3679
3680         addr = qemu_get_be64(f);
3681
3682         /*
3683          * If qemu file error, we should stop here, and then "addr"
3684          * may be invalid
3685          */
3686         ret = qemu_file_get_error(f);
3687         if (ret) {
3688             break;
3689         }
3690
3691         flags = addr & ~TARGET_PAGE_MASK;
3692         addr &= TARGET_PAGE_MASK;
3693
3694         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3695         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3696                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3697             block = ram_block_from_stream(mis, f, flags);
3698             if (!block) {
3699                 ret = -EINVAL;
3700                 break;
3701             }
3702
3703             /*
3704              * Relying on used_length is racy and can result in false positives.
3705              * We might place pages beyond used_length in case RAM was shrunk
3706              * while in postcopy, which is fine - trying to place via
3707              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3708              */
3709             if (!block->host || addr >= block->postcopy_length) {
3710                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3711                 ret = -EINVAL;
3712                 break;
3713             }
3714             tmp_page->target_pages++;
3715             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3716             /*
3717              * Postcopy requires that we place whole host pages atomically;
3718              * these may be huge pages for RAMBlocks that are backed by
3719              * hugetlbfs.
3720              * To make it atomic, the data is read into a temporary page
3721              * that's moved into place later.
3722              * The migration protocol uses,  possibly smaller, target-pages
3723              * however the source ensures it always sends all the components
3724              * of a host page in one chunk.
3725              */
3726             page_buffer = tmp_page->tmp_huge_page +
3727                           host_page_offset_from_ram_block_offset(block, addr);
3728             /* If all TP are zero then we can optimise the place */
3729             if (tmp_page->target_pages == 1) {
3730                 tmp_page->host_addr =
3731                     host_page_from_ram_block_offset(block, addr);
3732             } else if (tmp_page->host_addr !=
3733                        host_page_from_ram_block_offset(block, addr)) {
3734                 /* not the 1st TP within the HP */
3735                 error_report("Non-same host page detected.  "
3736                              "Target host page %p, received host page %p "
3737                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3738                              tmp_page->host_addr,
3739                              host_page_from_ram_block_offset(block, addr),
3740                              block->idstr, addr, tmp_page->target_pages);
3741                 ret = -EINVAL;
3742                 break;
3743             }
3744
3745             /*
3746              * If it's the last part of a host page then we place the host
3747              * page
3748              */
3749             if (tmp_page->target_pages ==
3750                 (block->page_size / TARGET_PAGE_SIZE)) {
3751                 place_needed = true;
3752             }
3753             place_source = tmp_page->tmp_huge_page;
3754         }
3755
3756         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3757         case RAM_SAVE_FLAG_ZERO:
3758             ch = qemu_get_byte(f);
3759             /*
3760              * Can skip to set page_buffer when
3761              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3762              */
3763             if (ch || !matches_target_page_size) {
3764                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3765             }
3766             if (ch) {
3767                 tmp_page->all_zero = false;
3768             }
3769             break;
3770
3771         case RAM_SAVE_FLAG_PAGE:
3772             tmp_page->all_zero = false;
3773             if (!matches_target_page_size) {
3774                 /* For huge pages, we always use temporary buffer */
3775                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3776             } else {
3777                 /*
3778                  * For small pages that matches target page size, we
3779                  * avoid the qemu_file copy.  Instead we directly use
3780                  * the buffer of QEMUFile to place the page.  Note: we
3781                  * cannot do any QEMUFile operation before using that
3782                  * buffer to make sure the buffer is valid when
3783                  * placing the page.
3784                  */
3785                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3786                                          TARGET_PAGE_SIZE);
3787             }
3788             break;
3789         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3790             tmp_page->all_zero = false;
3791             len = qemu_get_be32(f);
3792             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3793                 error_report("Invalid compressed data length: %d", len);
3794                 ret = -EINVAL;
3795                 break;
3796             }
3797             decompress_data_with_multi_threads(f, page_buffer, len);
3798             break;
3799
3800         case RAM_SAVE_FLAG_EOS:
3801             /* normal exit */
3802             multifd_recv_sync_main();
3803             break;
3804         default:
3805             error_report("Unknown combination of migration flags: 0x%x"
3806                          " (postcopy mode)", flags);
3807             ret = -EINVAL;
3808             break;
3809         }
3810
3811         /* Got the whole host page, wait for decompress before placing. */
3812         if (place_needed) {
3813             ret |= wait_for_decompress_done();
3814         }
3815
3816         /* Detect for any possible file errors */
3817         if (!ret && qemu_file_get_error(f)) {
3818             ret = qemu_file_get_error(f);
3819         }
3820
3821         if (!ret && place_needed) {
3822             if (tmp_page->all_zero) {
3823                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3824             } else {
3825                 ret = postcopy_place_page(mis, tmp_page->host_addr,
3826                                           place_source, block);
3827             }
3828             place_needed = false;
3829             postcopy_temp_page_reset(tmp_page);
3830         }
3831     }
3832
3833     return ret;
3834 }
3835
3836 static bool postcopy_is_advised(void)
3837 {
3838     PostcopyState ps = postcopy_state_get();
3839     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3840 }
3841
3842 static bool postcopy_is_running(void)
3843 {
3844     PostcopyState ps = postcopy_state_get();
3845     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3846 }
3847
3848 /*
3849  * Flush content of RAM cache into SVM's memory.
3850  * Only flush the pages that be dirtied by PVM or SVM or both.
3851  */
3852 void colo_flush_ram_cache(void)
3853 {
3854     RAMBlock *block = NULL;
3855     void *dst_host;
3856     void *src_host;
3857     unsigned long offset = 0;
3858
3859     memory_global_dirty_log_sync();
3860     WITH_RCU_READ_LOCK_GUARD() {
3861         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3862             ramblock_sync_dirty_bitmap(ram_state, block);
3863         }
3864     }
3865
3866     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3867     WITH_RCU_READ_LOCK_GUARD() {
3868         block = QLIST_FIRST_RCU(&ram_list.blocks);
3869
3870         while (block) {
3871             unsigned long num = 0;
3872
3873             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3874             if (!offset_in_ramblock(block,
3875                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3876                 offset = 0;
3877                 num = 0;
3878                 block = QLIST_NEXT_RCU(block, next);
3879             } else {
3880                 unsigned long i = 0;
3881
3882                 for (i = 0; i < num; i++) {
3883                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3884                 }
3885                 dst_host = block->host
3886                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3887                 src_host = block->colo_cache
3888                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3889                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3890                 offset += num;
3891             }
3892         }
3893     }
3894     trace_colo_flush_ram_cache_end();
3895 }
3896
3897 /**
3898  * ram_load_precopy: load pages in precopy case
3899  *
3900  * Returns 0 for success or -errno in case of error
3901  *
3902  * Called in precopy mode by ram_load().
3903  * rcu_read_lock is taken prior to this being called.
3904  *
3905  * @f: QEMUFile where to send the data
3906  */
3907 static int ram_load_precopy(QEMUFile *f)
3908 {
3909     MigrationIncomingState *mis = migration_incoming_get_current();
3910     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3911     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3912     bool postcopy_advised = postcopy_is_advised();
3913     if (!migrate_use_compression()) {
3914         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3915     }
3916
3917     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3918         ram_addr_t addr, total_ram_bytes;
3919         void *host = NULL, *host_bak = NULL;
3920         uint8_t ch;
3921
3922         /*
3923          * Yield periodically to let main loop run, but an iteration of
3924          * the main loop is expensive, so do it each some iterations
3925          */
3926         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3927             aio_co_schedule(qemu_get_current_aio_context(),
3928                             qemu_coroutine_self());
3929             qemu_coroutine_yield();
3930         }
3931         i++;
3932
3933         addr = qemu_get_be64(f);
3934         flags = addr & ~TARGET_PAGE_MASK;
3935         addr &= TARGET_PAGE_MASK;
3936
3937         if (flags & invalid_flags) {
3938             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3939                 error_report("Received an unexpected compressed page");
3940             }
3941
3942             ret = -EINVAL;
3943             break;
3944         }
3945
3946         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3947                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3948             RAMBlock *block = ram_block_from_stream(mis, f, flags);
3949
3950             host = host_from_ram_block_offset(block, addr);
3951             /*
3952              * After going into COLO stage, we should not load the page
3953              * into SVM's memory directly, we put them into colo_cache firstly.
3954              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3955              * Previously, we copied all these memory in preparing stage of COLO
3956              * while we need to stop VM, which is a time-consuming process.
3957              * Here we optimize it by a trick, back-up every page while in
3958              * migration process while COLO is enabled, though it affects the
3959              * speed of the migration, but it obviously reduce the downtime of
3960              * back-up all SVM'S memory in COLO preparing stage.
3961              */
3962             if (migration_incoming_colo_enabled()) {
3963                 if (migration_incoming_in_colo_state()) {
3964                     /* In COLO stage, put all pages into cache temporarily */
3965                     host = colo_cache_from_block_offset(block, addr, true);
3966                 } else {
3967                    /*
3968                     * In migration stage but before COLO stage,
3969                     * Put all pages into both cache and SVM's memory.
3970                     */
3971                     host_bak = colo_cache_from_block_offset(block, addr, false);
3972                 }
3973             }
3974             if (!host) {
3975                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3976                 ret = -EINVAL;
3977                 break;
3978             }
3979             if (!migration_incoming_in_colo_state()) {
3980                 ramblock_recv_bitmap_set(block, host);
3981             }
3982
3983             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3984         }
3985
3986         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3987         case RAM_SAVE_FLAG_MEM_SIZE:
3988             /* Synchronize RAM block list */
3989             total_ram_bytes = addr;
3990             while (!ret && total_ram_bytes) {
3991                 RAMBlock *block;
3992                 char id[256];
3993                 ram_addr_t length;
3994
3995                 len = qemu_get_byte(f);
3996                 qemu_get_buffer(f, (uint8_t *)id, len);
3997                 id[len] = 0;
3998                 length = qemu_get_be64(f);
3999
4000                 block = qemu_ram_block_by_name(id);
4001                 if (block && !qemu_ram_is_migratable(block)) {
4002                     error_report("block %s should not be migrated !", id);
4003                     ret = -EINVAL;
4004                 } else if (block) {
4005                     if (length != block->used_length) {
4006                         Error *local_err = NULL;
4007
4008                         ret = qemu_ram_resize(block, length,
4009                                               &local_err);
4010                         if (local_err) {
4011                             error_report_err(local_err);
4012                         }
4013                     }
4014                     /* For postcopy we need to check hugepage sizes match */
4015                     if (postcopy_advised && migrate_postcopy_ram() &&
4016                         block->page_size != qemu_host_page_size) {
4017                         uint64_t remote_page_size = qemu_get_be64(f);
4018                         if (remote_page_size != block->page_size) {
4019                             error_report("Mismatched RAM page size %s "
4020                                          "(local) %zd != %" PRId64,
4021                                          id, block->page_size,
4022                                          remote_page_size);
4023                             ret = -EINVAL;
4024                         }
4025                     }
4026                     if (migrate_ignore_shared()) {
4027                         hwaddr addr = qemu_get_be64(f);
4028                         if (ramblock_is_ignored(block) &&
4029                             block->mr->addr != addr) {
4030                             error_report("Mismatched GPAs for block %s "
4031                                          "%" PRId64 "!= %" PRId64,
4032                                          id, (uint64_t)addr,
4033                                          (uint64_t)block->mr->addr);
4034                             ret = -EINVAL;
4035                         }
4036                     }
4037                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4038                                           block->idstr);
4039                 } else {
4040                     error_report("Unknown ramblock \"%s\", cannot "
4041                                  "accept migration", id);
4042                     ret = -EINVAL;
4043                 }
4044
4045                 total_ram_bytes -= length;
4046             }
4047             break;
4048
4049         case RAM_SAVE_FLAG_ZERO:
4050             ch = qemu_get_byte(f);
4051             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4052             break;
4053
4054         case RAM_SAVE_FLAG_PAGE:
4055             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4056             break;
4057
4058         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4059             len = qemu_get_be32(f);
4060             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4061                 error_report("Invalid compressed data length: %d", len);
4062                 ret = -EINVAL;
4063                 break;
4064             }
4065             decompress_data_with_multi_threads(f, host, len);
4066             break;
4067
4068         case RAM_SAVE_FLAG_XBZRLE:
4069             if (load_xbzrle(f, addr, host) < 0) {
4070                 error_report("Failed to decompress XBZRLE page at "
4071                              RAM_ADDR_FMT, addr);
4072                 ret = -EINVAL;
4073                 break;
4074             }
4075             break;
4076         case RAM_SAVE_FLAG_EOS:
4077             /* normal exit */
4078             multifd_recv_sync_main();
4079             break;
4080         default:
4081             if (flags & RAM_SAVE_FLAG_HOOK) {
4082                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4083             } else {
4084                 error_report("Unknown combination of migration flags: 0x%x",
4085                              flags);
4086                 ret = -EINVAL;
4087             }
4088         }
4089         if (!ret) {
4090             ret = qemu_file_get_error(f);
4091         }
4092         if (!ret && host_bak) {
4093             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4094         }
4095     }
4096
4097     ret |= wait_for_decompress_done();
4098     return ret;
4099 }
4100
4101 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4102 {
4103     int ret = 0;
4104     static uint64_t seq_iter;
4105     /*
4106      * If system is running in postcopy mode, page inserts to host memory must
4107      * be atomic
4108      */
4109     bool postcopy_running = postcopy_is_running();
4110
4111     seq_iter++;
4112
4113     if (version_id != 4) {
4114         return -EINVAL;
4115     }
4116
4117     /*
4118      * This RCU critical section can be very long running.
4119      * When RCU reclaims in the code start to become numerous,
4120      * it will be necessary to reduce the granularity of this
4121      * critical section.
4122      */
4123     WITH_RCU_READ_LOCK_GUARD() {
4124         if (postcopy_running) {
4125             ret = ram_load_postcopy(f);
4126         } else {
4127             ret = ram_load_precopy(f);
4128         }
4129     }
4130     trace_ram_load_complete(ret, seq_iter);
4131
4132     return ret;
4133 }
4134
4135 static bool ram_has_postcopy(void *opaque)
4136 {
4137     RAMBlock *rb;
4138     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4139         if (ramblock_is_pmem(rb)) {
4140             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4141                          "is not supported now!", rb->idstr, rb->host);
4142             return false;
4143         }
4144     }
4145
4146     return migrate_postcopy_ram();
4147 }
4148
4149 /* Sync all the dirty bitmap with destination VM.  */
4150 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4151 {
4152     RAMBlock *block;
4153     QEMUFile *file = s->to_dst_file;
4154     int ramblock_count = 0;
4155
4156     trace_ram_dirty_bitmap_sync_start();
4157
4158     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4159         qemu_savevm_send_recv_bitmap(file, block->idstr);
4160         trace_ram_dirty_bitmap_request(block->idstr);
4161         ramblock_count++;
4162     }
4163
4164     trace_ram_dirty_bitmap_sync_wait();
4165
4166     /* Wait until all the ramblocks' dirty bitmap synced */
4167     while (ramblock_count--) {
4168         qemu_sem_wait(&s->rp_state.rp_sem);
4169     }
4170
4171     trace_ram_dirty_bitmap_sync_complete();
4172
4173     return 0;
4174 }
4175
4176 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4177 {
4178     qemu_sem_post(&s->rp_state.rp_sem);
4179 }
4180
4181 /*
4182  * Read the received bitmap, revert it as the initial dirty bitmap.
4183  * This is only used when the postcopy migration is paused but wants
4184  * to resume from a middle point.
4185  */
4186 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4187 {
4188     int ret = -EINVAL;
4189     /* from_dst_file is always valid because we're within rp_thread */
4190     QEMUFile *file = s->rp_state.from_dst_file;
4191     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4192     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4193     uint64_t size, end_mark;
4194
4195     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4196
4197     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4198         error_report("%s: incorrect state %s", __func__,
4199                      MigrationStatus_str(s->state));
4200         return -EINVAL;
4201     }
4202
4203     /*
4204      * Note: see comments in ramblock_recv_bitmap_send() on why we
4205      * need the endianness conversion, and the paddings.
4206      */
4207     local_size = ROUND_UP(local_size, 8);
4208
4209     /* Add paddings */
4210     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4211
4212     size = qemu_get_be64(file);
4213
4214     /* The size of the bitmap should match with our ramblock */
4215     if (size != local_size) {
4216         error_report("%s: ramblock '%s' bitmap size mismatch "
4217                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4218                      block->idstr, size, local_size);
4219         ret = -EINVAL;
4220         goto out;
4221     }
4222
4223     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4224     end_mark = qemu_get_be64(file);
4225
4226     ret = qemu_file_get_error(file);
4227     if (ret || size != local_size) {
4228         error_report("%s: read bitmap failed for ramblock '%s': %d"
4229                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4230                      __func__, block->idstr, ret, local_size, size);
4231         ret = -EIO;
4232         goto out;
4233     }
4234
4235     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4236         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4237                      __func__, block->idstr, end_mark);
4238         ret = -EINVAL;
4239         goto out;
4240     }
4241
4242     /*
4243      * Endianness conversion. We are during postcopy (though paused).
4244      * The dirty bitmap won't change. We can directly modify it.
4245      */
4246     bitmap_from_le(block->bmap, le_bitmap, nbits);
4247
4248     /*
4249      * What we received is "received bitmap". Revert it as the initial
4250      * dirty bitmap for this ramblock.
4251      */
4252     bitmap_complement(block->bmap, block->bmap, nbits);
4253
4254     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4255     ramblock_dirty_bitmap_clear_discarded_pages(block);
4256
4257     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4258     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4259
4260     /*
4261      * We succeeded to sync bitmap for current ramblock. If this is
4262      * the last one to sync, we need to notify the main send thread.
4263      */
4264     ram_dirty_bitmap_reload_notify(s);
4265
4266     ret = 0;
4267 out:
4268     g_free(le_bitmap);
4269     return ret;
4270 }
4271
4272 static int ram_resume_prepare(MigrationState *s, void *opaque)
4273 {
4274     RAMState *rs = *(RAMState **)opaque;
4275     int ret;
4276
4277     ret = ram_dirty_bitmap_sync_all(s, rs);
4278     if (ret) {
4279         return ret;
4280     }
4281
4282     ram_state_resume_prepare(rs, s->to_dst_file);
4283
4284     return 0;
4285 }
4286
4287 static SaveVMHandlers savevm_ram_handlers = {
4288     .save_setup = ram_save_setup,
4289     .save_live_iterate = ram_save_iterate,
4290     .save_live_complete_postcopy = ram_save_complete,
4291     .save_live_complete_precopy = ram_save_complete,
4292     .has_postcopy = ram_has_postcopy,
4293     .save_live_pending = ram_save_pending,
4294     .load_state = ram_load,
4295     .save_cleanup = ram_save_cleanup,
4296     .load_setup = ram_load_setup,
4297     .load_cleanup = ram_load_cleanup,
4298     .resume_prepare = ram_resume_prepare,
4299 };
4300
4301 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4302                                       size_t old_size, size_t new_size)
4303 {
4304     PostcopyState ps = postcopy_state_get();
4305     ram_addr_t offset;
4306     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4307     Error *err = NULL;
4308
4309     if (ramblock_is_ignored(rb)) {
4310         return;
4311     }
4312
4313     if (!migration_is_idle()) {
4314         /*
4315          * Precopy code on the source cannot deal with the size of RAM blocks
4316          * changing at random points in time - especially after sending the
4317          * RAM block sizes in the migration stream, they must no longer change.
4318          * Abort and indicate a proper reason.
4319          */
4320         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4321         migration_cancel(err);
4322         error_free(err);
4323     }
4324
4325     switch (ps) {
4326     case POSTCOPY_INCOMING_ADVISE:
4327         /*
4328          * Update what ram_postcopy_incoming_init()->init_range() does at the
4329          * time postcopy was advised. Syncing RAM blocks with the source will
4330          * result in RAM resizes.
4331          */
4332         if (old_size < new_size) {
4333             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4334                 error_report("RAM block '%s' discard of resized RAM failed",
4335                              rb->idstr);
4336             }
4337         }
4338         rb->postcopy_length = new_size;
4339         break;
4340     case POSTCOPY_INCOMING_NONE:
4341     case POSTCOPY_INCOMING_RUNNING:
4342     case POSTCOPY_INCOMING_END:
4343         /*
4344          * Once our guest is running, postcopy does no longer care about
4345          * resizes. When growing, the new memory was not available on the
4346          * source, no handler needed.
4347          */
4348         break;
4349     default:
4350         error_report("RAM block '%s' resized during postcopy state: %d",
4351                      rb->idstr, ps);
4352         exit(-1);
4353     }
4354 }
4355
/*
 * RAM block notifier added by ram_mig_init(); only the resize callback
 * is set.
 */
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
4359
4360 void ram_mig_init(void)
4361 {
4362     qemu_mutex_init(&XBZRLE.lock);
4363     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4364     ram_block_notifier_add(&ram_mig_ram_notifier);
4365 }