migration/ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
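/*
 * These flags are OR-ed into the low bits of the 8-byte page offset that
 * save_page_header() below writes to the stream: since offsets are
 * target-page aligned, the low bits are free to carry per-page metadata.
 */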
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from migrate_params_apply in the main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
131 PageCache *new_cache;
132 int64_t ret = 0;
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
146 XBZRLE_cache_lock();
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
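/*
 * Typical call path (assumption, for orientation only): the QMP command
 * 'migrate-set-parameters' with 'xbzrle-cache-size' lands in
 * migrate_params_apply(), which calls xbzrle_cache_resize() above;
 * cache_init() is expected to validate the requested size itself.
 */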
163 bool ramblock_is_ignored(RAMBlock *block)
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 RAMBlock *block;
174 int ret = 0;
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
184 return ret;
187 static void ramblock_recv_map_init(void)
189 RAMBlock *rb;
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 if success with sent bytes, or <0 if error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see the comment
245 * below). So extend it a bit beforehand.
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 * Always use little endian when sending the bitmap. This is
251 * required so that it works even when source and destination VMs
252 * are not using the same endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
265 size = ROUND_UP(size, 8);
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
276 g_free(le_bitmap);
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
282 return size + sizeof(size);
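/*
 * Resulting wire layout for one block's receive bitmap:
 *   [ be64: padded bitmap size in bytes ]
 *   [ size bytes: little-endian bitmap, zero-padded to an 8-byte multiple ]
 *   [ be64: RAMBLOCK_RECV_BITMAP_ENDING sanity marker ]
 */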
286 * An outstanding page request, on the source, having been received
287 * and queued
289 struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 /* State of RAM for migration */
298 struct RAMState {
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
309 /* last ram version we have seen */
310 uint32_t last_version;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
329 /* compression statistics since the beginning of the period */
330 /* amount of count that no free thread to compress data */
331 uint64_t compress_thread_busy_prev;
332 /* amount bytes after compression */
333 uint64_t compressed_size_prev;
334 /* amount of compressed pages */
335 uint64_t compress_pages_prev;
337 /* total handled target pages at the beginning of period */
338 uint64_t target_page_count_prev;
339 /* total handled target pages since start */
340 uint64_t target_page_count;
341 /* number of dirty bits in the bitmap */
342 uint64_t migration_dirty_pages;
343 /* Protects modification of the bitmap and migration dirty pages */
344 QemuMutex bitmap_mutex;
345 /* The RAMBlock used in the last src_page_requests */
346 RAMBlock *last_req_rb;
347 /* Queue of outstanding page requests from the destination */
348 QemuMutex src_page_req_mutex;
349 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
351 typedef struct RAMState RAMState;
353 static RAMState *ram_state;
355 static NotifierWithReturnList precopy_notifier_list;
357 void precopy_infrastructure_init(void)
359 notifier_with_return_list_init(&precopy_notifier_list);
362 void precopy_add_notifier(NotifierWithReturn *n)
364 notifier_with_return_list_add(&precopy_notifier_list, n);
367 void precopy_remove_notifier(NotifierWithReturn *n)
369 notifier_with_return_remove(n);
372 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
374 PrecopyNotifyData pnd;
375 pnd.reason = reason;
376 pnd.errp = errp;
378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 uint64_t ram_bytes_remaining(void)
383 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
387 MigrationStats ram_counters;
389 /* used by the search for pages to send */
390 struct PageSearchStatus {
391 /* Current block being searched */
392 RAMBlock *block;
393 /* Current page to search from */
394 unsigned long page;
395 /* Set once we wrap around */
396 bool complete_round;
398 typedef struct PageSearchStatus PageSearchStatus;
400 CompressionStats compression_counters;
402 struct CompressParam {
403 bool done;
404 bool quit;
405 bool zero_page;
406 QEMUFile *file;
407 QemuMutex mutex;
408 QemuCond cond;
409 RAMBlock *block;
410 ram_addr_t offset;
412 /* internally used fields */
413 z_stream stream;
414 uint8_t *originbuf;
416 typedef struct CompressParam CompressParam;
418 struct DecompressParam {
419 bool done;
420 bool quit;
421 QemuMutex mutex;
422 QemuCond cond;
423 void *des;
424 uint8_t *compbuf;
425 int len;
426 z_stream stream;
428 typedef struct DecompressParam DecompressParam;
430 static CompressParam *comp_param;
431 static QemuThread *compress_threads;
432 /* comp_done_cond is used to wake up the migration thread when
433 * one of the compression threads has finished the compression.
434 * comp_done_lock is used together with comp_done_cond.
436 static QemuMutex comp_done_lock;
437 static QemuCond comp_done_cond;
438 /* The empty QEMUFileOps will be used by file in CompressParam */
439 static const QEMUFileOps empty_ops = { };
441 static QEMUFile *decomp_file;
442 static DecompressParam *decomp_param;
443 static QemuThread *decompress_threads;
444 static QemuMutex decomp_done_lock;
445 static QemuCond decomp_done_cond;
447 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
448 ram_addr_t offset, uint8_t *source_buf);
450 static void *do_data_compress(void *opaque)
452 CompressParam *param = opaque;
453 RAMBlock *block;
454 ram_addr_t offset;
455 bool zero_page;
457 qemu_mutex_lock(&param->mutex);
458 while (!param->quit) {
459 if (param->block) {
460 block = param->block;
461 offset = param->offset;
462 param->block = NULL;
463 qemu_mutex_unlock(&param->mutex);
465 zero_page = do_compress_ram_page(param->file, &param->stream,
466 block, offset, param->originbuf);
468 qemu_mutex_lock(&comp_done_lock);
469 param->done = true;
470 param->zero_page = zero_page;
471 qemu_cond_signal(&comp_done_cond);
472 qemu_mutex_unlock(&comp_done_lock);
474 qemu_mutex_lock(&param->mutex);
475 } else {
476 qemu_cond_wait(&param->cond, &param->mutex);
479 qemu_mutex_unlock(&param->mutex);
481 return NULL;
484 static void compress_threads_save_cleanup(void)
486 int i, thread_count;
488 if (!migrate_use_compression() || !comp_param) {
489 return;
492 thread_count = migrate_compress_threads();
493 for (i = 0; i < thread_count; i++) {
495 * we use it as an indicator which shows if the thread is
496 * properly init'd or not
498 if (!comp_param[i].file) {
499 break;
502 qemu_mutex_lock(&comp_param[i].mutex);
503 comp_param[i].quit = true;
504 qemu_cond_signal(&comp_param[i].cond);
505 qemu_mutex_unlock(&comp_param[i].mutex);
507 qemu_thread_join(compress_threads + i);
508 qemu_mutex_destroy(&comp_param[i].mutex);
509 qemu_cond_destroy(&comp_param[i].cond);
510 deflateEnd(&comp_param[i].stream);
511 g_free(comp_param[i].originbuf);
512 qemu_fclose(comp_param[i].file);
513 comp_param[i].file = NULL;
515 qemu_mutex_destroy(&comp_done_lock);
516 qemu_cond_destroy(&comp_done_cond);
517 g_free(compress_threads);
518 g_free(comp_param);
519 compress_threads = NULL;
520 comp_param = NULL;
523 static int compress_threads_save_setup(void)
525 int i, thread_count;
527 if (!migrate_use_compression()) {
528 return 0;
530 thread_count = migrate_compress_threads();
531 compress_threads = g_new0(QemuThread, thread_count);
532 comp_param = g_new0(CompressParam, thread_count);
533 qemu_cond_init(&comp_done_cond);
534 qemu_mutex_init(&comp_done_lock);
535 for (i = 0; i < thread_count; i++) {
536 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
537 if (!comp_param[i].originbuf) {
538 goto exit;
541 if (deflateInit(&comp_param[i].stream,
542 migrate_compress_level()) != Z_OK) {
543 g_free(comp_param[i].originbuf);
544 goto exit;
547 /* comp_param[i].file is just used as a dummy buffer to save data,
548 * set its ops to empty.
550 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
551 comp_param[i].done = true;
552 comp_param[i].quit = false;
553 qemu_mutex_init(&comp_param[i].mutex);
554 qemu_cond_init(&comp_param[i].cond);
555 qemu_thread_create(compress_threads + i, "compress",
556 do_data_compress, comp_param + i,
557 QEMU_THREAD_JOINABLE);
559 return 0;
561 exit:
562 compress_threads_save_cleanup();
563 return -1;
567 * save_page_header: write page header to wire
569 * If this is the 1st block, it also writes the block identification
571 * Returns the number of bytes written
573 * @f: QEMUFile where to send the data
574 * @block: block that contains the page we want to send
575 * @offset: offset inside the block for the page
576 * in the lower bits, it contains flags
578 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
579 ram_addr_t offset)
581 size_t size, len;
583 if (block == rs->last_sent_block) {
584 offset |= RAM_SAVE_FLAG_CONTINUE;
586 qemu_put_be64(f, offset);
587 size = 8;
589 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
590 len = strlen(block->idstr);
591 qemu_put_byte(f, len);
592 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
593 size += 1 + len;
594 rs->last_sent_block = block;
596 return size;
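/*
 * On the wire this produces:
 *   [ be64: page offset with RAM_SAVE_FLAG_* in the low bits ]
 *   [ u8: strlen(idstr) ][ idstr bytes ]   (only when CONTINUE is not set)
 */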
600 * mig_throttle_guest_down: throttle down the guest
602 * Reduce the amount of guest CPU execution to hopefully slow down memory
603 * writes. If guest dirty memory rate is reduced below the rate at
604 * which we can transfer pages to the destination then we should be
605 * able to complete migration. Some workloads dirty memory way too
606 * fast and will not effectively converge, even with auto-converge.
608 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
609 uint64_t bytes_dirty_threshold)
611 MigrationState *s = migrate_get_current();
612 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
613 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
614 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
615 int pct_max = s->parameters.max_cpu_throttle;
617 uint64_t throttle_now = cpu_throttle_get_percentage();
618 uint64_t cpu_now, cpu_ideal, throttle_inc;
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial);
623 } else {
624 /* Throttling already on, just increase the rate */
625 if (!pct_tailslow) {
626 throttle_inc = pct_increment;
627 } else {
628 /* Compute the ideal CPU percentage used by Guest, which may
629 * make the dirty rate match the dirty rate threshold. */
630 cpu_now = 100 - throttle_now;
631 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
632 bytes_dirty_period);
633 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
635 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
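/*
 * Worked example for the tailslow path above: with throttle_now = 60 the
 * guest currently gets cpu_now = 40% of CPU time. If the dirty rate is twice
 * the threshold, cpu_ideal = 40 * 0.5 = 20, so throttle_inc =
 * MIN(40 - 20, pct_increment) and the new throttle becomes
 * MIN(60 + throttle_inc, pct_max).
 */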
639 void mig_throttle_counter_reset(void)
641 RAMState *rs = ram_state;
643 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
644 rs->num_dirty_pages_period = 0;
645 rs->bytes_xfer_prev = ram_counters.transferred;
649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
651 * @rs: current RAM state
652 * @current_addr: address for the zero page
654 * Update the xbzrle cache to reflect a page that's been sent as all 0.
655 * The important thing is that a stale (not-yet-0'd) page be replaced
656 * by the new data.
657 * As a bonus, if the page wasn't in the cache it gets added so that
658 * when a small write is made into the 0'd page it gets XBZRLE sent.
660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
662 if (!rs->xbzrle_enabled) {
663 return;
666 /* We don't care if this fails to allocate a new cache page
667 * as long as it updated an old one */
668 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
669 ram_counters.dirty_sync_count);
672 #define ENCODING_FLAG_XBZRLE 0x1
675 * save_xbzrle_page: compress and send current page
677 * Returns: 1 means that we wrote the page
678 * 0 means that page is identical to the one already sent
679 * -1 means that xbzrle would be longer than normal
681 * @rs: current RAM state
682 * @current_data: pointer to the address of the page contents
683 * @current_addr: addr of the page
684 * @block: block that contains the page we want to send
685 * @offset: offset inside the block for the page
686 * @last_stage: if we are at the completion stage
688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
689 ram_addr_t current_addr, RAMBlock *block,
690 ram_addr_t offset, bool last_stage)
692 int encoded_len = 0, bytes_xbzrle;
693 uint8_t *prev_cached_page;
695 if (!cache_is_cached(XBZRLE.cache, current_addr,
696 ram_counters.dirty_sync_count)) {
697 xbzrle_counters.cache_miss++;
698 if (!last_stage) {
699 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
700 ram_counters.dirty_sync_count) == -1) {
701 return -1;
702 } else {
703 /* update *current_data when the page has been
704 inserted into cache */
705 *current_data = get_cached_data(XBZRLE.cache, current_addr);
708 return -1;
712 * Reaching here means the page has hit the xbzrle cache, no matter what
713 * encoding result it is (normal encoding, overflow or skipping the page),
714 * count the page as encoded. This is used to calculate the encoding rate.
716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
717 * 2nd page turns out to be skipped (i.e. no new bytes written to the
718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
719 * skipped page included. In this way, the encoding rate can tell if the
720 * guest page is good for xbzrle encoding.
722 xbzrle_counters.pages++;
723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
725 /* save current buffer into memory */
726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
728 /* XBZRLE encoding (if there is no overflow) */
729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
731 TARGET_PAGE_SIZE);
734 * Update the cache contents, so that it corresponds to the data
735 * sent, in all cases except where we skip the page.
737 if (!last_stage && encoded_len != 0) {
738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
740 * In the case where we couldn't compress, ensure that the caller
741 * sends the data from the cache, since the guest might have
742 * changed the RAM since we copied it.
744 *current_data = prev_cached_page;
747 if (encoded_len == 0) {
748 trace_save_xbzrle_page_skipping();
749 return 0;
750 } else if (encoded_len == -1) {
751 trace_save_xbzrle_page_overflow();
752 xbzrle_counters.overflow++;
753 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
754 return -1;
757 /* Send XBZRLE based compressed page */
758 bytes_xbzrle = save_page_header(rs, rs->f, block,
759 offset | RAM_SAVE_FLAG_XBZRLE);
760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
761 qemu_put_be16(rs->f, encoded_len);
762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
763 bytes_xbzrle += encoded_len + 1 + 2;
765 * Like compressed_size (please see update_compress_thread_counts),
766 * the xbzrle encoded bytes don't count the 8 byte header with
767 * RAM_SAVE_FLAG_CONTINUE.
769 xbzrle_counters.bytes += bytes_xbzrle - 8;
770 ram_counters.transferred += bytes_xbzrle;
772 return 1;
776 * migration_bitmap_find_dirty: find the next dirty page from start
778 * Returns the page offset within memory region of the start of a dirty page
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
791 if (ramblock_is_ignored(rb)) {
792 return size;
795 return find_next_bit(bitmap, size, start);
798 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
799 unsigned long page)
801 uint8_t shift;
802 hwaddr size, start;
804 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
805 return;
808 shift = rb->clear_bmap_shift;
810 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
811 * can make things easier sometimes since then start address
812 * of the small chunk will always be 64 pages aligned so the
813 * bitmap will always be aligned to unsigned long. We should
814 * even be able to remove this restriction but I'm simply
815 * keeping it.
817 assert(shift >= 6);
819 size = 1ULL << (TARGET_PAGE_BITS + shift);
820 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
821 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
822 memory_region_clear_dirty_bitmap(rb->mr, start, size);
825 static void
826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
827 unsigned long start,
828 unsigned long npages)
830 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
831 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
832 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
835 * Clear pages from start to start + npages - 1, so the end boundary is
836 * exclusive.
838 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
839 migration_clear_memory_region_dirty_bitmap(rb, i);
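/*
 * Example: with clear_bmap_shift = 6 a chunk covers 64 pages, so clearing
 * start = 70, npages = 10 rounds out to the single chunk starting at page 64
 * and issues one memory_region_clear_dirty_bitmap() call for it.
 */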
844 * colo_bitmap_find_dirty: find contiguous dirty pages from start
846 * Returns the page offset within memory region of the start of the
847 * contiguous dirty pages
849 * @rs: current RAM state
850 * @rb: RAMBlock where to search for dirty pages
851 * @start: page where we start the search
852 * @num: the number of contiguous dirty pages
854 static inline
855 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
856 unsigned long start, unsigned long *num)
858 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
859 unsigned long *bitmap = rb->bmap;
860 unsigned long first, next;
862 *num = 0;
864 if (ramblock_is_ignored(rb)) {
865 return size;
868 first = find_next_bit(bitmap, size, start);
869 if (first >= size) {
870 return first;
872 next = find_next_zero_bit(bitmap, size, first + 1);
873 assert(next >= first);
874 *num = next - first;
875 return first;
878 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
879 RAMBlock *rb,
880 unsigned long page)
882 bool ret;
885 * Clear dirty bitmap if needed. This _must_ be called before we
886 * send any of the page in the chunk because we need to make sure
887 * we can capture further page content changes when we sync dirty
888 * log the next time. So as long as we are going to send any of
889 * the page in the chunk we clear the remote dirty bitmap for all.
890 * Clearing it earlier won't be a problem, but too late will.
892 migration_clear_memory_region_dirty_bitmap(rb, page);
894 ret = test_and_clear_bit(page, rb->bmap);
895 if (ret) {
896 rs->migration_dirty_pages--;
899 return ret;
902 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
903 void *opaque)
905 const hwaddr offset = section->offset_within_region;
906 const hwaddr size = int128_get64(section->size);
907 const unsigned long start = offset >> TARGET_PAGE_BITS;
908 const unsigned long npages = size >> TARGET_PAGE_BITS;
909 RAMBlock *rb = section->mr->ram_block;
910 uint64_t *cleared_bits = opaque;
913 * We don't grab ram_state->bitmap_mutex because we expect to run
914 * only when starting migration or during postcopy recovery where
915 * we don't have concurrent access.
917 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
918 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
920 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
921 bitmap_clear(rb->bmap, start, npages);
925 * Exclude all dirty pages from migration that fall into a discarded range as
926 * managed by a RamDiscardManager responsible for the mapped memory region of
927 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
929 * Discarded pages ("logically unplugged") have undefined content and must
930 * not get migrated, because even reading these pages for migration might
931 * result in undesired behavior.
933 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
935 * Note: The result is only stable while migrating (precopy/postcopy).
937 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
939 uint64_t cleared_bits = 0;
941 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
942 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
943 MemoryRegionSection section = {
944 .mr = rb->mr,
945 .offset_within_region = 0,
946 .size = int128_make64(qemu_ram_get_used_length(rb)),
949 ram_discard_manager_replay_discarded(rdm, &section,
950 dirty_bitmap_clear_section,
951 &cleared_bits);
953 return cleared_bits;
957 * Check if a host-page aligned page falls into a discarded range as managed by
958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
960 * Note: The result is only stable while migrating (precopy/postcopy).
962 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
964 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
965 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
966 MemoryRegionSection section = {
967 .mr = rb->mr,
968 .offset_within_region = start,
969 .size = int128_make64(qemu_ram_pagesize(rb)),
972 return !ram_discard_manager_is_populated(rdm, &section);
974 return false;
977 /* Called with RCU critical section */
978 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
980 uint64_t new_dirty_pages =
981 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
983 rs->migration_dirty_pages += new_dirty_pages;
984 rs->num_dirty_pages_period += new_dirty_pages;
988 * ram_pagesize_summary: calculate all the pagesizes of a VM
990 * Returns a summary bitmap of the page sizes of all RAMBlocks
992 * For VMs with just normal pages this is equivalent to the host page
993 * size. If it's got some huge pages then it's the OR of all the
994 * different page sizes.
996 uint64_t ram_pagesize_summary(void)
998 RAMBlock *block;
999 uint64_t summary = 0;
1001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1002 summary |= block->page_size;
1005 return summary;
1008 uint64_t ram_get_total_transferred_pages(void)
1010 return ram_counters.normal + ram_counters.duplicate +
1011 compression_counters.pages + xbzrle_counters.pages;
1014 static void migration_update_rates(RAMState *rs, int64_t end_time)
1016 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1017 double compressed_size;
1019 /* calculate period counters */
1020 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1021 / (end_time - rs->time_last_bitmap_sync);
1023 if (!page_count) {
1024 return;
1027 if (migrate_use_xbzrle()) {
1028 double encoded_size, unencoded_size;
1030 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1031 rs->xbzrle_cache_miss_prev) / page_count;
1032 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1033 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1034 TARGET_PAGE_SIZE;
1035 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1036 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1037 xbzrle_counters.encoding_rate = 0;
1038 } else {
1039 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1041 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1042 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1045 if (migrate_use_compression()) {
1046 compression_counters.busy_rate = (double)(compression_counters.busy -
1047 rs->compress_thread_busy_prev) / page_count;
1048 rs->compress_thread_busy_prev = compression_counters.busy;
1050 compressed_size = compression_counters.compressed_size -
1051 rs->compressed_size_prev;
1052 if (compressed_size) {
1053 double uncompressed_size = (compression_counters.pages -
1054 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1056 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1057 compression_counters.compression_rate =
1058 uncompressed_size / compressed_size;
1060 rs->compress_pages_prev = compression_counters.pages;
1061 rs->compressed_size_prev = compression_counters.compressed_size;
1066 static void migration_trigger_throttle(RAMState *rs)
1068 MigrationState *s = migrate_get_current();
1069 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1071 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1072 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1073 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1075 /* During block migration the auto-converge logic incorrectly detects
1076 * that ram migration makes no progress. Avoid this by disabling the
1077 * throttling logic during the bulk phase of block migration. */
1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1079 /* The following detection logic can be refined later. For now:
1080 Check to see if the ratio between dirtied bytes and the approx.
1081 amount of bytes that just got transferred since the last time
1082 we were in this routine reaches the threshold. If that happens
1083 twice, start or increase throttling. */
1085 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1086 (++rs->dirty_rate_high_cnt >= 2)) {
1087 trace_migration_throttle();
1088 rs->dirty_rate_high_cnt = 0;
1089 mig_throttle_guest_down(bytes_dirty_period,
1090 bytes_dirty_threshold);
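/*
 * Example (assuming the default throttle-trigger-threshold of 50): if the
 * guest dirties more bytes during a sync period than half of what was
 * transferred in that same period, and this happens in two consecutive
 * periods, throttling is started or ramped up.
 */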
1095 static void migration_bitmap_sync(RAMState *rs)
1097 RAMBlock *block;
1098 int64_t end_time;
1100 ram_counters.dirty_sync_count++;
1102 if (!rs->time_last_bitmap_sync) {
1103 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1106 trace_migration_bitmap_sync_start();
1107 memory_global_dirty_log_sync();
1109 qemu_mutex_lock(&rs->bitmap_mutex);
1110 WITH_RCU_READ_LOCK_GUARD() {
1111 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1112 ramblock_sync_dirty_bitmap(rs, block);
1114 ram_counters.remaining = ram_bytes_remaining();
1116 qemu_mutex_unlock(&rs->bitmap_mutex);
1118 memory_global_after_dirty_log_sync();
1119 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1121 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123 /* more than 1 second = 1000 milliseconds */
1124 if (end_time > rs->time_last_bitmap_sync + 1000) {
1125 migration_trigger_throttle(rs);
1127 migration_update_rates(rs, end_time);
1129 rs->target_page_count_prev = rs->target_page_count;
1131 /* reset period counters */
1132 rs->time_last_bitmap_sync = end_time;
1133 rs->num_dirty_pages_period = 0;
1134 rs->bytes_xfer_prev = ram_counters.transferred;
1136 if (migrate_use_events()) {
1137 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1141 static void migration_bitmap_sync_precopy(RAMState *rs)
1143 Error *local_err = NULL;
1146 * The current notifier usage is just an optimization to migration, so we
1147 * don't stop the normal migration process in the error case.
1149 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1150 error_report_err(local_err);
1151 local_err = NULL;
1154 migration_bitmap_sync(rs);
1156 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1157 error_report_err(local_err);
1162 * save_zero_page_to_file: send the zero page to the file
1164 * Returns the size of data written to the file, 0 means the page is not
1165 * a zero page
1167 * @rs: current RAM state
1168 * @file: the file where the data is saved
1169 * @block: block that contains the page we want to send
1170 * @offset: offset inside the block for the page
1172 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1173 RAMBlock *block, ram_addr_t offset)
1175 uint8_t *p = block->host + offset;
1176 int len = 0;
1178 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1179 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1180 qemu_put_byte(file, 0);
1181 len += 1;
1183 return len;
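/*
 * A zero page therefore costs only the page header plus one 0x00 byte on
 * the wire, instead of a full TARGET_PAGE_SIZE payload.
 */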
1187 * save_zero_page: send the zero page to the stream
1189 * Returns the number of pages written.
1191 * @rs: current RAM state
1192 * @block: block that contains the page we want to send
1193 * @offset: offset inside the block for the page
1195 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1197 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1199 if (len) {
1200 ram_counters.duplicate++;
1201 ram_counters.transferred += len;
1202 return 1;
1204 return -1;
1207 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1209 if (!migrate_release_ram() || !migration_in_postcopy()) {
1210 return;
1213 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1217 * @pages: the number of pages written by the control path,
1218 * < 0 - error
1219 * > 0 - number of pages written
1221 * Return true if the page has been saved, otherwise false is returned.
1223 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1224 int *pages)
1226 uint64_t bytes_xmit = 0;
1227 int ret;
1229 *pages = -1;
1230 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1231 &bytes_xmit);
1232 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1233 return false;
1236 if (bytes_xmit) {
1237 ram_counters.transferred += bytes_xmit;
1238 *pages = 1;
1241 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1242 return true;
1245 if (bytes_xmit > 0) {
1246 ram_counters.normal++;
1247 } else if (bytes_xmit == 0) {
1248 ram_counters.duplicate++;
1251 return true;
1255 * directly send the page to the stream
1257 * Returns the number of pages written.
1259 * @rs: current RAM state
1260 * @block: block that contains the page we want to send
1261 * @offset: offset inside the block for the page
1262 * @buf: the page to be sent
1263 * @async: send the page asynchronously
1265 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1266 uint8_t *buf, bool async)
1268 ram_counters.transferred += save_page_header(rs, rs->f, block,
1269 offset | RAM_SAVE_FLAG_PAGE);
1270 if (async) {
1271 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1272 migrate_release_ram() &
1273 migration_in_postcopy());
1274 } else {
1275 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1277 ram_counters.transferred += TARGET_PAGE_SIZE;
1278 ram_counters.normal++;
1279 return 1;
1283 * ram_save_page: send the given page to the stream
1285 * Returns the number of pages written.
1286 * < 0 - error
1287 * >=0 - Number of pages written - this might legally be 0
1288 * if xbzrle noticed the page was the same.
1290 * @rs: current RAM state
1291 * @block: block that contains the page we want to send
1292 * @offset: offset inside the block for the page
1293 * @last_stage: if we are at the completion stage
1295 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1297 int pages = -1;
1298 uint8_t *p;
1299 bool send_async = true;
1300 RAMBlock *block = pss->block;
1301 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1302 ram_addr_t current_addr = block->offset + offset;
1304 p = block->host + offset;
1305 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1307 XBZRLE_cache_lock();
1308 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1309 pages = save_xbzrle_page(rs, &p, current_addr, block,
1310 offset, last_stage);
1311 if (!last_stage) {
1312 /* Can't send this cached data async, since the cache page
1313 * might get updated before it gets to the wire
1315 send_async = false;
1319 /* XBZRLE overflow or normal page */
1320 if (pages == -1) {
1321 pages = save_normal_page(rs, block, offset, p, send_async);
1324 XBZRLE_cache_unlock();
1326 return pages;
1329 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1330 ram_addr_t offset)
1332 if (multifd_queue_page(rs->f, block, offset) < 0) {
1333 return -1;
1335 ram_counters.normal++;
1337 return 1;
1340 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1341 ram_addr_t offset, uint8_t *source_buf)
1343 RAMState *rs = ram_state;
1344 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1345 bool zero_page = false;
1346 int ret;
1348 if (save_zero_page_to_file(rs, f, block, offset)) {
1349 zero_page = true;
1350 goto exit;
1353 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1356 * copy it to an internal buffer to avoid it being modified by the VM
1357 * so that we can catch any error during compression and
1358 * decompression
1360 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1361 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1362 if (ret < 0) {
1363 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1364 error_report("compressed data failed!");
1365 return false;
1368 exit:
1369 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1370 return zero_page;
1373 static void
1374 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1376 ram_counters.transferred += bytes_xmit;
1378 if (param->zero_page) {
1379 ram_counters.duplicate++;
1380 return;
1383 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1384 compression_counters.compressed_size += bytes_xmit - 8;
1385 compression_counters.pages++;
1388 static bool save_page_use_compression(RAMState *rs);
1390 static void flush_compressed_data(RAMState *rs)
1392 int idx, len, thread_count;
1394 if (!save_page_use_compression(rs)) {
1395 return;
1397 thread_count = migrate_compress_threads();
1399 qemu_mutex_lock(&comp_done_lock);
1400 for (idx = 0; idx < thread_count; idx++) {
1401 while (!comp_param[idx].done) {
1402 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1405 qemu_mutex_unlock(&comp_done_lock);
1407 for (idx = 0; idx < thread_count; idx++) {
1408 qemu_mutex_lock(&comp_param[idx].mutex);
1409 if (!comp_param[idx].quit) {
1410 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1412 * it's safe to fetch zero_page without holding comp_done_lock
1413 * as there is no further request submitted to the thread,
1414 * i.e., the thread should be waiting for a request at this point.
1416 update_compress_thread_counts(&comp_param[idx], len);
1418 qemu_mutex_unlock(&comp_param[idx].mutex);
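/*
 * Note: each worker compresses into its private comp_param[idx].file (a
 * buffer-only QEMUFile created with empty_ops); only the migration thread
 * copies those buffered bytes into rs->f via qemu_put_qemu_file(), so the
 * main stream is never written to concurrently.
 */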
1422 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1423 ram_addr_t offset)
1425 param->block = block;
1426 param->offset = offset;
1429 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1430 ram_addr_t offset)
1432 int idx, thread_count, bytes_xmit = -1, pages = -1;
1433 bool wait = migrate_compress_wait_thread();
1435 thread_count = migrate_compress_threads();
1436 qemu_mutex_lock(&comp_done_lock);
1437 retry:
1438 for (idx = 0; idx < thread_count; idx++) {
1439 if (comp_param[idx].done) {
1440 comp_param[idx].done = false;
1441 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1442 qemu_mutex_lock(&comp_param[idx].mutex);
1443 set_compress_params(&comp_param[idx], block, offset);
1444 qemu_cond_signal(&comp_param[idx].cond);
1445 qemu_mutex_unlock(&comp_param[idx].mutex);
1446 pages = 1;
1447 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1448 break;
1453 * wait for the free thread if the user specifies 'compress-wait-thread',
1454 * otherwise we will post the page out in the main thread as normal page.
1456 if (pages < 0 && wait) {
1457 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1458 goto retry;
1460 qemu_mutex_unlock(&comp_done_lock);
1462 return pages;
1466 * find_dirty_block: find the next dirty page and update any state
1467 * associated with the search process.
1469 * Returns true if a page is found
1471 * @rs: current RAM state
1472 * @pss: data about the state of the current dirty page scan
1473 * @again: set to false if the search has scanned the whole of RAM
1475 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1477 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1478 if (pss->complete_round && pss->block == rs->last_seen_block &&
1479 pss->page >= rs->last_page) {
1481 * We've been once around the RAM and haven't found anything.
1482 * Give up.
1484 *again = false;
1485 return false;
1487 if (!offset_in_ramblock(pss->block,
1488 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1489 /* Didn't find anything in this RAM Block */
1490 pss->page = 0;
1491 pss->block = QLIST_NEXT_RCU(pss->block, next);
1492 if (!pss->block) {
1494 * If memory migration starts over, we will meet a dirtied page
1495 * which may still exist in the compression threads' ring, so we
1496 * should flush the compressed data to make sure the new page
1497 * is not overwritten by the old one in the destination.
1499 * Also, if xbzrle is on, stop using the data compression at this
1500 * point. In theory, xbzrle can do better than compression.
1502 flush_compressed_data(rs);
1504 /* Hit the end of the list */
1505 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1506 /* Flag that we've looped */
1507 pss->complete_round = true;
1508 /* After the first round, enable XBZRLE. */
1509 if (migrate_use_xbzrle()) {
1510 rs->xbzrle_enabled = true;
1513 /* Didn't find anything this time, but try again on the new block */
1514 *again = true;
1515 return false;
1516 } else {
1517 /* Can go around again, but... */
1518 *again = true;
1519 /* We've found something so probably don't need to */
1520 return true;
1525 * unqueue_page: gets a page of the queue
1527 * Helper for 'get_queued_page' - gets a page off the queue
1529 * Returns the block of the page (or NULL if none available)
1531 * @rs: current RAM state
1532 * @offset: used to return the offset within the RAMBlock
1534 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1536 RAMBlock *block = NULL;
1538 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1539 return NULL;
1542 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1543 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1544 struct RAMSrcPageRequest *entry =
1545 QSIMPLEQ_FIRST(&rs->src_page_requests);
1546 block = entry->rb;
1547 *offset = entry->offset;
1549 if (entry->len > TARGET_PAGE_SIZE) {
1550 entry->len -= TARGET_PAGE_SIZE;
1551 entry->offset += TARGET_PAGE_SIZE;
1552 } else {
1553 memory_region_unref(block->mr);
1554 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1555 g_free(entry);
1556 migration_consume_urgent_request();
1560 return block;
1563 #if defined(__linux__)
1565 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1566 * is found, return RAM block pointer and page offset
1568 * Returns pointer to the RAMBlock containing faulting page,
1569 * NULL if no write faults are pending
1571 * @rs: current RAM state
1572 * @offset: page offset from the beginning of the block
1574 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1576 struct uffd_msg uffd_msg;
1577 void *page_address;
1578 RAMBlock *block;
1579 int res;
1581 if (!migrate_background_snapshot()) {
1582 return NULL;
1585 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1586 if (res <= 0) {
1587 return NULL;
1590 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1591 block = qemu_ram_block_from_host(page_address, false, offset);
1592 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1593 return block;
1597 * ram_save_release_protection: release UFFD write protection after
1598 * a range of pages has been saved
1600 * @rs: current RAM state
1601 * @pss: page-search-status structure
1602 * @start_page: index of the first page in the range relative to pss->block
1604 * Returns 0 on success, negative value in case of an error
1606 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1607 unsigned long start_page)
1609 int res = 0;
1611 /* Check if page is from UFFD-managed region. */
1612 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1613 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1614 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1616 /* Flush async buffers before un-protect. */
1617 qemu_fflush(rs->f);
1618 /* Un-protect memory range. */
1619 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1620 false, false);
1623 return res;
1626 /* ram_write_tracking_available: check if kernel supports required UFFD features
1628 * Returns true if supports, false otherwise
1630 bool ram_write_tracking_available(void)
1632 uint64_t uffd_features;
1633 int res;
1635 res = uffd_query_features(&uffd_features);
1636 return (res == 0 &&
1637 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1640 /* ram_write_tracking_compatible: check if guest configuration is
1641 * compatible with 'write-tracking'
1643 * Returns true if compatible, false otherwise
1645 bool ram_write_tracking_compatible(void)
1647 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1648 int uffd_fd;
1649 RAMBlock *block;
1650 bool ret = false;
1652 /* Open UFFD file descriptor */
1653 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1654 if (uffd_fd < 0) {
1655 return false;
1658 RCU_READ_LOCK_GUARD();
1660 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1661 uint64_t uffd_ioctls;
1663 /* Nothing to do with read-only and MMIO-writable regions */
1664 if (block->mr->readonly || block->mr->rom_device) {
1665 continue;
1667 /* Try to register block memory via UFFD-IO to track writes */
1668 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1669 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1670 goto out;
1672 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1673 goto out;
1676 ret = true;
1678 out:
1679 uffd_close_fd(uffd_fd);
1680 return ret;
1683 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1684 ram_addr_t size)
1687 * We read one byte of each page; this will preallocate page tables if
1688 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1689 * where no page was populated yet. This might require adaptation when
1690 * supporting other mappings, like shmem.
1692 for (; offset < size; offset += block->page_size) {
1693 char tmp = *((char *)block->host + offset);
1695 /* Don't optimize the read out */
1696 asm volatile("" : "+r" (tmp));
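/*
 * The empty inline asm above lists tmp as a read-write ("+r") operand, so
 * the compiler must materialize the load; without it the read of each page
 * could be optimized away and no page table entry would get populated.
 */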
1700 static inline int populate_read_section(MemoryRegionSection *section,
1701 void *opaque)
1703 const hwaddr size = int128_get64(section->size);
1704 hwaddr offset = section->offset_within_region;
1705 RAMBlock *block = section->mr->ram_block;
1707 populate_read_range(block, offset, size);
1708 return 0;
1712 * ram_block_populate_read: preallocate page tables and populate pages in the
1713 * RAM block by reading a byte of each page.
1715 * Since it's solely used for userfault_fd WP feature, here we just
1716 * hardcode page size to qemu_real_host_page_size.
1718 * @block: RAM block to populate
1720 static void ram_block_populate_read(RAMBlock *rb)
1723 * Skip populating all pages that fall into a discarded range as managed by
1724 * a RamDiscardManager responsible for the mapped memory region of the
1725 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1726 * must not get populated automatically. We don't have to track
1727 * modifications via userfaultfd WP reliably, because these pages will
1728 * not be part of the migration stream either way -- see
1729 * ramblock_dirty_bitmap_exclude_discarded_pages().
1731 * Note: The result is only stable while migrating (precopy/postcopy).
1733 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1734 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1735 MemoryRegionSection section = {
1736 .mr = rb->mr,
1737 .offset_within_region = 0,
1738 .size = rb->mr->size,
1741 ram_discard_manager_replay_populated(rdm, &section,
1742 populate_read_section, NULL);
1743 } else {
1744 populate_read_range(rb, 0, rb->used_length);
1749 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1751 void ram_write_tracking_prepare(void)
1753 RAMBlock *block;
1755 RCU_READ_LOCK_GUARD();
1757 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1758 /* Nothing to do with read-only and MMIO-writable regions */
1759 if (block->mr->readonly || block->mr->rom_device) {
1760 continue;
1764 * Populate pages of the RAM block before enabling userfault_fd
1765 * write protection.
1767 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1768 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1769 * pages with pte_none() entries in page table.
1771 ram_block_populate_read(block);
1776 * ram_write_tracking_start: start UFFD-WP memory tracking
1778 * Returns 0 for success or negative value in case of error
1780 int ram_write_tracking_start(void)
1782 int uffd_fd;
1783 RAMState *rs = ram_state;
1784 RAMBlock *block;
1786 /* Open UFFD file descriptor */
1787 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1788 if (uffd_fd < 0) {
1789 return uffd_fd;
1791 rs->uffdio_fd = uffd_fd;
1793 RCU_READ_LOCK_GUARD();
1795 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1796 /* Nothing to do with read-only and MMIO-writable regions */
1797 if (block->mr->readonly || block->mr->rom_device) {
1798 continue;
1801 /* Register block memory with UFFD to track writes */
1802 if (uffd_register_memory(rs->uffdio_fd, block->host,
1803 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1804 goto fail;
1806 /* Apply UFFD write protection to the block memory range */
1807 if (uffd_change_protection(rs->uffdio_fd, block->host,
1808 block->max_length, true, false)) {
1809 goto fail;
1811 block->flags |= RAM_UF_WRITEPROTECT;
1812 memory_region_ref(block->mr);
1814 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1815 block->host, block->max_length);
1818 return 0;
1820 fail:
1821 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1823 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1824 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1825 continue;
1828 * In case some memory block failed to be write-protected
1829 * remove protection and unregister all succeeded RAM blocks
1831 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1832 false, false);
1833 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1834 /* Cleanup flags and remove reference */
1835 block->flags &= ~RAM_UF_WRITEPROTECT;
1836 memory_region_unref(block->mr);
1839 uffd_close_fd(uffd_fd);
1840 rs->uffdio_fd = -1;
1841 return -1;
1845 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1847 void ram_write_tracking_stop(void)
1849 RAMState *rs = ram_state;
1850 RAMBlock *block;
1852 RCU_READ_LOCK_GUARD();
1854 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1855 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1856 continue;
1858 /* Remove protection and unregister all affected RAM blocks */
1859 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1860 false, false);
1861 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1863 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1864 block->host, block->max_length);
1866 /* Cleanup flags and remove reference */
1867 block->flags &= ~RAM_UF_WRITEPROTECT;
1868 memory_region_unref(block->mr);
1871 /* Finally close UFFD file descriptor */
1872 uffd_close_fd(rs->uffdio_fd);
1873 rs->uffdio_fd = -1;
1876 #else
1877 /* No target OS support, stubs just fail or ignore */
1879 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1881 (void) rs;
1882 (void) offset;
1884 return NULL;
1887 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1888 unsigned long start_page)
1890 (void) rs;
1891 (void) pss;
1892 (void) start_page;
1894 return 0;
1897 bool ram_write_tracking_available(void)
1899 return false;
1902 bool ram_write_tracking_compatible(void)
1904 assert(0);
1905 return false;
1908 int ram_write_tracking_start(void)
1910 assert(0);
1911 return -1;
1914 void ram_write_tracking_stop(void)
1916 assert(0);
1918 #endif /* defined(__linux__) */
1921 * get_queued_page: unqueue a page from the postcopy requests
1923 * Skips pages that are already sent (!dirty)
1925 * Returns true if a queued page is found
1927 * @rs: current RAM state
1928 * @pss: data about the state of the current dirty page scan
1930 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1932 RAMBlock *block;
1933 ram_addr_t offset;
1934 bool dirty;
1936 do {
1937 block = unqueue_page(rs, &offset);
1939 * We're sending this page, and since it's postcopy nothing else
1940 * will dirty it, and we must make sure it doesn't get sent again
1941 * even if this queue request was received after the background
1942 * search already sent it.
1944 if (block) {
1945 unsigned long page;
1947 page = offset >> TARGET_PAGE_BITS;
1948 dirty = test_bit(page, block->bmap);
1949 if (!dirty) {
1950 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1951 page);
1952 } else {
1953 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1957 } while (block && !dirty);
1959 if (!block) {
1961 * Poll write faults too if background snapshot is enabled; that's
1962 * when vCPUs may have been blocked by write-protected pages.
1964 block = poll_fault_page(rs, &offset);
1967 if (block) {
1969 * We want the background search to continue from the queued page
1970 * since the guest is likely to want other pages near to the page
1971 * it just requested.
1973 pss->block = block;
1974 pss->page = offset >> TARGET_PAGE_BITS;
1977 * This unqueued page would break the "one round" check, even if
1978 * it is really rare.
1980 pss->complete_round = false;
1983 return !!block;
1987 * migration_page_queue_free: drop any remaining pages in the ram
1988 * request queue
1990 * It should be empty at the end anyway, but in error cases there may
1991 * be some left; if any pages remain, we drop them.
1994 static void migration_page_queue_free(RAMState *rs)
1996 struct RAMSrcPageRequest *mspr, *next_mspr;
1997 /* This queue should generally be empty - but in the case of a failed
1998 * migration it might have some entries left over.
2000 RCU_READ_LOCK_GUARD();
2001 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2002 memory_region_unref(mspr->rb->mr);
2003 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2004 g_free(mspr);
2009 * ram_save_queue_pages: queue the page for transmission
2011 * A request from the postcopy destination, for example.
2013 * Returns zero on success or negative on error
2015 * @rbname: Name of the RAMBlock of the request. NULL means the
2016 * same as the last one.
2017 * @start: starting address from the start of the RAMBlock
2018 * @len: length (in bytes) to send
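/*
 * Entries queued here are later consumed by the sending side via
 * get_queued_page()/unqueue_page(), so a requested page jumps ahead of the
 * normal dirty-bitmap scan done in ram_find_and_save_block().
 */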
2020 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2022 RAMBlock *ramblock;
2023 RAMState *rs = ram_state;
2025 ram_counters.postcopy_requests++;
2026 RCU_READ_LOCK_GUARD();
2028 if (!rbname) {
2029 /* Reuse last RAMBlock */
2030 ramblock = rs->last_req_rb;
2032 if (!ramblock) {
2034 * Shouldn't happen, we can't reuse the last RAMBlock if
2035 * it's the 1st request.
2037 error_report("ram_save_queue_pages no previous block");
2038 return -1;
2040 } else {
2041 ramblock = qemu_ram_block_by_name(rbname);
2043 if (!ramblock) {
2044 /* We shouldn't be asked for a non-existent RAMBlock */
2045 error_report("ram_save_queue_pages no block '%s'", rbname);
2046 return -1;
2048 rs->last_req_rb = ramblock;
2050 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2051 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2052 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2053 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2054 __func__, start, len, ramblock->used_length);
2055 return -1;
2058 struct RAMSrcPageRequest *new_entry =
2059 g_malloc0(sizeof(struct RAMSrcPageRequest));
2060 new_entry->rb = ramblock;
2061 new_entry->offset = start;
2062 new_entry->len = len;
2064 memory_region_ref(ramblock->mr);
2065 qemu_mutex_lock(&rs->src_page_req_mutex);
2066 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2067 migration_make_urgent_request();
2068 qemu_mutex_unlock(&rs->src_page_req_mutex);
2070 return 0;
2073 static bool save_page_use_compression(RAMState *rs)
2075 if (!migrate_use_compression()) {
2076 return false;
2080 * If xbzrle is enabled (e.g., after the first round of migration), stop
2081 * using data compression. In theory, xbzrle can do better than
2082 * compression.
2084 if (rs->xbzrle_enabled) {
2085 return false;
2088 return true;
2092 * Try to compress the page before posting it out; return true if the page
2093 * has been properly handled by compression, otherwise it needs other
2094 * paths to handle it.
2096 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2098 if (!save_page_use_compression(rs)) {
2099 return false;
2103 * When starting to process a new block, the first page of
2104 * the block should be sent out before other pages in the same
2105 * block, and all the pages in the last block should have been sent
2106 * out. Keeping this order is important, because the 'cont' flag
2107 * is used to avoid resending the block name.
2109 * We post the first page as a normal page as compression will take
2110 * a lot of CPU resources.
2112 if (block != rs->last_sent_block) {
2113 flush_compressed_data(rs);
2114 return false;
2117 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2118 return true;
2121 compression_counters.busy++;
2122 return false;
2126 * ram_save_target_page: save one target page
2128 * Returns the number of pages written
2130 * @rs: current RAM state
2131 * @pss: data about the page we want to send
2132 * @last_stage: if we are at the completion stage
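/*
 * The page is tried in a fixed order below: first the control_save_page()
 * hook, then multi-threaded compression, then zero-page detection, then
 * multifd, and finally the regular/xbzrle path in ram_save_page(); the
 * first method that handles the page wins.
 */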
2134 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2135 bool last_stage)
2137 RAMBlock *block = pss->block;
2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2139 int res;
2141 if (control_save_page(rs, block, offset, &res)) {
2142 return res;
2145 if (save_compress_page(rs, block, offset)) {
2146 return 1;
2149 res = save_zero_page(rs, block, offset);
2150 if (res > 0) {
2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2152 * page would be stale
2154 if (!save_page_use_compression(rs)) {
2155 XBZRLE_cache_lock();
2156 xbzrle_cache_zero_page(rs, block->offset + offset);
2157 XBZRLE_cache_unlock();
2159 ram_release_pages(block->idstr, offset, res);
2160 return res;
2164 * Do not use multifd for:
2165 * 1. Compression, as the first page in a new block should be posted out
2166 * before sending the compressed page
2167 * 2. Postcopy, as one whole host page should be placed
2169 if (!save_page_use_compression(rs) && migrate_use_multifd()
2170 && !migration_in_postcopy()) {
2171 return ram_save_multifd_page(rs, block, offset);
2174 return ram_save_page(rs, pss, last_stage);
2178 * ram_save_host_page: save a whole host page
2180 * Starting at *offset send pages up to the end of the current host
2181 * page. It's valid for the initial offset to point into the middle of
2182 * a host page in which case the remainder of the hostpage is sent.
2183 * Only dirty target pages are sent. Note that the host page size may
2184 * be a huge page for this block.
2185 * The saving stops at the boundary of the used_length of the block
2186 * if the RAMBlock isn't a multiple of the host page size.
2188 * Returns the number of pages written or negative on error
2190 * @rs: current RAM state
2192 * @pss: data about the page we want to send
2193 * @last_stage: if we are at the completion stage
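/*
 * Illustrative example: with 2MiB host pages and 4KiB target pages (e.g. a
 * hugetlbfs-backed block on x86_64), pagesize_bits is 512; for
 * pss->page == 1000 the hostpage_boundary computed below is
 * QEMU_ALIGN_UP(1001, 512) == 1024, so the loop sends dirty target pages
 * until it crosses page 1024 (or the end of the block).
 */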
2195 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2196 bool last_stage)
2198 int tmppages, pages = 0;
2199 size_t pagesize_bits =
2200 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2201 unsigned long hostpage_boundary =
2202 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2203 unsigned long start_page = pss->page;
2204 int res;
2206 if (ramblock_is_ignored(pss->block)) {
2207 error_report("block %s should not be migrated !", pss->block->idstr);
2208 return 0;
2211 do {
2212 /* Check if the page is dirty and, if it is, send it */
2213 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2214 tmppages = ram_save_target_page(rs, pss, last_stage);
2215 if (tmppages < 0) {
2216 return tmppages;
2219 pages += tmppages;
2221 * Allow rate limiting to happen in the middle of huge pages if
2222 * something is sent in the current iteration.
2224 if (pagesize_bits > 1 && tmppages > 0) {
2225 migration_rate_limit();
2228 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2229 } while ((pss->page < hostpage_boundary) &&
2230 offset_in_ramblock(pss->block,
2231 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2232 /* The offset we leave with is the min boundary of host page and block */
2233 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2235 res = ram_save_release_protection(rs, pss, start_page);
2236 return (res < 0 ? res : pages);
2240 * ram_find_and_save_block: finds a dirty page and sends it to f
2242 * Called within an RCU critical section.
2244 * Returns the number of pages written where zero means no dirty pages,
2245 * or negative on error
2247 * @rs: current RAM state
2248 * @last_stage: if we are at the completion stage
2250 * On systems where host-page-size > target-page-size it will send all the
2251 * pages in a host page that are dirty.
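/*
 * The loop below first asks get_queued_page() for an explicitly requested
 * page (postcopy requests or write-fault pages); only if the request queue
 * is empty does it fall back to find_dirty_block() for a linear scan, and
 * whatever is found is sent as a whole host page.
 */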
2254 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2256 PageSearchStatus pss;
2257 int pages = 0;
2258 bool again, found;
2260 /* No dirty page as there is zero RAM */
2261 if (!ram_bytes_total()) {
2262 return pages;
2265 pss.block = rs->last_seen_block;
2266 pss.page = rs->last_page;
2267 pss.complete_round = false;
2269 if (!pss.block) {
2270 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2273 do {
2274 again = true;
2275 found = get_queued_page(rs, &pss);
2277 if (!found) {
2278 /* priority queue empty, so just search for something dirty */
2279 found = find_dirty_block(rs, &pss, &again);
2282 if (found) {
2283 pages = ram_save_host_page(rs, &pss, last_stage);
2285 } while (!pages && again);
2287 rs->last_seen_block = pss.block;
2288 rs->last_page = pss.page;
2290 return pages;
2293 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2295 uint64_t pages = size / TARGET_PAGE_SIZE;
2297 if (zero) {
2298 ram_counters.duplicate += pages;
2299 } else {
2300 ram_counters.normal += pages;
2301 ram_counters.transferred += size;
2302 qemu_update_position(f, size);
2306 static uint64_t ram_bytes_total_common(bool count_ignored)
2308 RAMBlock *block;
2309 uint64_t total = 0;
2311 RCU_READ_LOCK_GUARD();
2313 if (count_ignored) {
2314 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2315 total += block->used_length;
2317 } else {
2318 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2319 total += block->used_length;
2322 return total;
2325 uint64_t ram_bytes_total(void)
2327 return ram_bytes_total_common(false);
2330 static void xbzrle_load_setup(void)
2332 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2335 static void xbzrle_load_cleanup(void)
2337 g_free(XBZRLE.decoded_buf);
2338 XBZRLE.decoded_buf = NULL;
2341 static void ram_state_cleanup(RAMState **rsp)
2343 if (*rsp) {
2344 migration_page_queue_free(*rsp);
2345 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2346 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2347 g_free(*rsp);
2348 *rsp = NULL;
2352 static void xbzrle_cleanup(void)
2354 XBZRLE_cache_lock();
2355 if (XBZRLE.cache) {
2356 cache_fini(XBZRLE.cache);
2357 g_free(XBZRLE.encoded_buf);
2358 g_free(XBZRLE.current_buf);
2359 g_free(XBZRLE.zero_target_page);
2360 XBZRLE.cache = NULL;
2361 XBZRLE.encoded_buf = NULL;
2362 XBZRLE.current_buf = NULL;
2363 XBZRLE.zero_target_page = NULL;
2365 XBZRLE_cache_unlock();
2368 static void ram_save_cleanup(void *opaque)
2370 RAMState **rsp = opaque;
2371 RAMBlock *block;
2373 /* We don't use dirty log with background snapshots */
2374 if (!migrate_background_snapshot()) {
2375 /* The caller must hold the iothread lock or be in a BH, so there is
2376 * no write race against the migration bitmap
2378 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2380 * do not stop the dirty log without having started it, since
2381 * memory_global_dirty_log_stop will assert that
2382 * memory_global_dirty_log_start/stop are used in pairs
2384 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2388 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2389 g_free(block->clear_bmap);
2390 block->clear_bmap = NULL;
2391 g_free(block->bmap);
2392 block->bmap = NULL;
2395 xbzrle_cleanup();
2396 compress_threads_save_cleanup();
2397 ram_state_cleanup(rsp);
2400 static void ram_state_reset(RAMState *rs)
2402 rs->last_seen_block = NULL;
2403 rs->last_sent_block = NULL;
2404 rs->last_page = 0;
2405 rs->last_version = ram_list.version;
2406 rs->xbzrle_enabled = false;
2409 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2412 * 'expected' is the value you expect the bitmap mostly to be full
2413 * of; it won't bother printing lines that are all this value.
2414 * If 'todump' is null the migration bitmap is dumped.
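/*
 * Each output line covers up to 128 pages (one character per page) and
 * looks like, e.g.:
 *   0x00000080 : ..11..1.....
 * where '1' marks a set bit and '.' a clear one; lines consisting only of
 * the 'expected' value are suppressed.
 */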
2416 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2417 unsigned long pages)
2419 int64_t cur;
2420 int64_t linelen = 128;
2421 char linebuf[129];
2423 for (cur = 0; cur < pages; cur += linelen) {
2424 int64_t curb;
2425 bool found = false;
2427 * Last line; catch the case where the line length
2428 * is longer than remaining ram
2430 if (cur + linelen > pages) {
2431 linelen = pages - cur;
2433 for (curb = 0; curb < linelen; curb++) {
2434 bool thisbit = test_bit(cur + curb, todump);
2435 linebuf[curb] = thisbit ? '1' : '.';
2436 found = found || (thisbit != expected);
2438 if (found) {
2439 linebuf[curb] = '\0';
2440 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2445 /* **** functions for postcopy ***** */
2447 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2449 struct RAMBlock *block;
2451 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2452 unsigned long *bitmap = block->bmap;
2453 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2454 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2456 while (run_start < range) {
2457 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2458 ram_discard_range(block->idstr,
2459 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2460 ((ram_addr_t)(run_end - run_start))
2461 << TARGET_PAGE_BITS);
2462 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2468 * postcopy_send_discard_bm_ram: discard a RAMBlock
2470 * Returns zero on success
2472 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2474 * @ms: current migration state
2475 * @block: RAMBlock to discard
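/*
 * The dirty bitmap is run-length encoded into (start, length) ranges, e.g.
 * a bitmap of 0011100110 (bit 0 on the left) is sent as the two ranges
 * (2, 3) and (7, 2) via postcopy_discard_send_range().
 */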
2477 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2479 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2480 unsigned long current;
2481 unsigned long *bitmap = block->bmap;
2483 for (current = 0; current < end; ) {
2484 unsigned long one = find_next_bit(bitmap, end, current);
2485 unsigned long zero, discard_length;
2487 if (one >= end) {
2488 break;
2491 zero = find_next_zero_bit(bitmap, end, one + 1);
2493 if (zero >= end) {
2494 discard_length = end - one;
2495 } else {
2496 discard_length = zero - one;
2498 postcopy_discard_send_range(ms, one, discard_length);
2499 current = one + discard_length;
2502 return 0;
2506 * postcopy_each_ram_send_discard: discard all RAMBlocks
2508 * Returns 0 for success or negative for error
2510 * Utility for the outgoing postcopy code.
2511 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2512 * passing it bitmap indexes and name.
2513 * (qemu_ram_foreach_block ends up passing unscaled lengths
2514 * which would mean postcopy code would have to deal with target page)
2516 * @ms: current migration state
2518 static int postcopy_each_ram_send_discard(MigrationState *ms)
2520 struct RAMBlock *block;
2521 int ret;
2523 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2524 postcopy_discard_send_init(ms, block->idstr);
2527 * Postcopy sends chunks of bitmap over the wire, but it
2528 * just needs indexes at this point, avoids it having
2529 * target page specific code.
2531 ret = postcopy_send_discard_bm_ram(ms, block);
2532 postcopy_discard_send_finish(ms);
2533 if (ret) {
2534 return ret;
2538 return 0;
2542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2544 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
2545 * bitmap at host-page granularity.
2548 * Postcopy requires that all target pages in a host page are dirty or
2549 * clean, not a mix. This helper canonicalizes the bitmap accordingly.
2551 * @ms: current migration state
2552 * @block: block that contains the page we want to canonicalize
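/*
 * Illustrative example: with host_ratio == 4 (e.g. 16KiB host pages and
 * 4KiB target pages), a dirty run covering target pages 5..9 straddles the
 * host pages [4..7] and [8..11]; both host pages are re-marked fully dirty
 * below so that postcopy never has to deal with a partially dirty host page.
 */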
2554 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2556 RAMState *rs = ram_state;
2557 unsigned long *bitmap = block->bmap;
2558 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2559 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2560 unsigned long run_start;
2562 if (block->page_size == TARGET_PAGE_SIZE) {
2563 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2564 return;
2567 /* Find a dirty page */
2568 run_start = find_next_bit(bitmap, pages, 0);
2570 while (run_start < pages) {
2573 * If the start of this run of pages is in the middle of a host
2574 * page, then we need to fixup this host page.
2576 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2577 /* Find the end of this run */
2578 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2580 * If the end isn't at the start of a host page, then the
2581 * run doesn't finish at the end of a host page
2582 * and we need to discard.
2586 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2587 unsigned long page;
2588 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2589 host_ratio);
2590 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2592 /* Clean up the bitmap */
2593 for (page = fixup_start_addr;
2594 page < fixup_start_addr + host_ratio; page++) {
2596 * Remark them as dirty, updating the count for any pages
2597 * that weren't previously dirty.
2599 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2603 /* Find the next dirty page for the next iteration */
2604 run_start = find_next_bit(bitmap, pages, run_start);
2609 * postcopy_chunk_hostpages: discard any partially sent host page
2611 * Utility for the outgoing postcopy code.
2613 * Discard any partially sent host-page size chunks, mark any partially
2614 * dirty host-page size chunks as all dirty. In this case the host-page
2615 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2617 * Returns zero on success
2619 * @ms: current migration state
2620 * @block: block we want to work with
2622 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2624 postcopy_discard_send_init(ms, block->idstr);
2627 * Ensure that all partially dirty host pages are made fully dirty.
2629 postcopy_chunk_hostpages_pass(ms, block);
2631 postcopy_discard_send_finish(ms);
2632 return 0;
2636 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2638 * Returns zero on success
2640 * Transmit the set of pages to be discarded after precopy to the target;
2641 * these are pages that:
2642 * a) Have been previously transmitted but are now dirty again
2643 * b) Have never been transmitted; this ensures that
2644 * any pages on the destination that have been mapped by background
2645 * tasks get discarded (transparent huge pages are the specific concern)
2646 * Hopefully this is pretty sparse
2648 * @ms: current migration state
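/*
 * The sequence below is: one final bitmap sync while the source is paused,
 * a reset of the scan position so we never resume in the middle of a host
 * page, per-block canonicalization of partially dirty host pages, and then
 * the actual transmission via postcopy_each_ram_send_discard().
 */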
2650 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2652 RAMState *rs = ram_state;
2653 RAMBlock *block;
2654 int ret;
2656 RCU_READ_LOCK_GUARD();
2658 /* This should be our last sync, the src is now paused */
2659 migration_bitmap_sync(rs);
2661 /* Easiest way to make sure we don't resume in the middle of a host-page */
2662 rs->last_seen_block = NULL;
2663 rs->last_sent_block = NULL;
2664 rs->last_page = 0;
2666 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2667 /* Deal with TPS != HPS and huge pages */
2668 ret = postcopy_chunk_hostpages(ms, block);
2669 if (ret) {
2670 return ret;
2673 #ifdef DEBUG_POSTCOPY
2674 ram_debug_dump_bitmap(block->bmap, true,
2675 block->used_length >> TARGET_PAGE_BITS);
2676 #endif
2678 trace_ram_postcopy_send_discard_bitmap();
2680 return postcopy_each_ram_send_discard(ms);
2684 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2686 * Returns zero on success
2688 * @rbname: name of the RAMBlock of the request. NULL means the
2689 * same as the last one.
2690 * @start: RAMBlock starting page
2691 * @length: RAMBlock size
2693 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2695 trace_ram_discard_range(rbname, start, length);
2697 RCU_READ_LOCK_GUARD();
2698 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2700 if (!rb) {
2701 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2702 return -1;
2706 * On source VM, we don't need to update the received bitmap since
2707 * we don't even have one.
2709 if (rb->receivedmap) {
2710 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2711 length >> qemu_target_page_bits());
2714 return ram_block_discard_range(rb, start, length);
2718 * For every allocation, we will try not to crash the VM if the
2719 * allocation fails.
2721 static int xbzrle_init(void)
2723 Error *local_err = NULL;
2725 if (!migrate_use_xbzrle()) {
2726 return 0;
2729 XBZRLE_cache_lock();
2731 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2732 if (!XBZRLE.zero_target_page) {
2733 error_report("%s: Error allocating zero page", __func__);
2734 goto err_out;
2737 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2738 TARGET_PAGE_SIZE, &local_err);
2739 if (!XBZRLE.cache) {
2740 error_report_err(local_err);
2741 goto free_zero_page;
2744 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2745 if (!XBZRLE.encoded_buf) {
2746 error_report("%s: Error allocating encoded_buf", __func__);
2747 goto free_cache;
2750 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2751 if (!XBZRLE.current_buf) {
2752 error_report("%s: Error allocating current_buf", __func__);
2753 goto free_encoded_buf;
2756 /* We are all good */
2757 XBZRLE_cache_unlock();
2758 return 0;
2760 free_encoded_buf:
2761 g_free(XBZRLE.encoded_buf);
2762 XBZRLE.encoded_buf = NULL;
2763 free_cache:
2764 cache_fini(XBZRLE.cache);
2765 XBZRLE.cache = NULL;
2766 free_zero_page:
2767 g_free(XBZRLE.zero_target_page);
2768 XBZRLE.zero_target_page = NULL;
2769 err_out:
2770 XBZRLE_cache_unlock();
2771 return -ENOMEM;
2774 static int ram_state_init(RAMState **rsp)
2776 *rsp = g_try_new0(RAMState, 1);
2778 if (!*rsp) {
2779 error_report("%s: Init ramstate fail", __func__);
2780 return -1;
2783 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2784 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2785 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2788 * Count the total number of pages used by ram blocks, not including any
2789 * gaps due to alignment or unplugs.
2790 * This must match the initial values of the dirty bitmap.
2792 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2793 ram_state_reset(*rsp);
2795 return 0;
2798 static void ram_list_init_bitmaps(void)
2800 MigrationState *ms = migrate_get_current();
2801 RAMBlock *block;
2802 unsigned long pages;
2803 uint8_t shift;
2805 /* Skip setting bitmap if there is no RAM */
2806 if (ram_bytes_total()) {
2807 shift = ms->clear_bitmap_shift;
2808 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2809 error_report("clear_bitmap_shift (%u) too big, using "
2810 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2811 shift = CLEAR_BITMAP_SHIFT_MAX;
2812 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2813 error_report("clear_bitmap_shift (%u) too small, using "
2814 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2815 shift = CLEAR_BITMAP_SHIFT_MIN;
2818 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2819 pages = block->max_length >> TARGET_PAGE_BITS;
2821 * The initial dirty bitmap for migration must be set with all
2822 * ones to make sure we'll migrate every guest RAM page to the
2823 * destination.
2824 * Here we set RAMBlock.bmap all to 1 because when we restart a
2825 * new migration after a failed one, ram_list.
2826 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2827 * guest memory.
2829 block->bmap = bitmap_new(pages);
2830 bitmap_set(block->bmap, 0, pages);
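/*
 * clear_bmap implements deferred dirty-log clearing: each bit covers
 * (1 << shift) target pages, and a set bit means that chunk still needs a
 * memory-region dirty-bitmap clear (see
 * migration_clear_memory_region_dirty_bitmap_range()) before its pages go out.
 */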
2831 block->clear_bmap_shift = shift;
2832 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2837 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2839 unsigned long pages;
2840 RAMBlock *rb;
2842 RCU_READ_LOCK_GUARD();
2844 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2845 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2846 rs->migration_dirty_pages -= pages;
2850 static void ram_init_bitmaps(RAMState *rs)
2852 /* For memory_global_dirty_log_start below. */
2853 qemu_mutex_lock_iothread();
2854 qemu_mutex_lock_ramlist();
2856 WITH_RCU_READ_LOCK_GUARD() {
2857 ram_list_init_bitmaps();
2858 /* We don't use dirty log with background snapshots */
2859 if (!migrate_background_snapshot()) {
2860 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2861 migration_bitmap_sync_precopy(rs);
2864 qemu_mutex_unlock_ramlist();
2865 qemu_mutex_unlock_iothread();
2868 * After an eventual first bitmap sync, fixup the initial bitmap
2869 * containing all 1s to exclude any discarded pages from migration.
2871 migration_bitmap_clear_discarded_pages(rs);
2874 static int ram_init_all(RAMState **rsp)
2876 if (ram_state_init(rsp)) {
2877 return -1;
2880 if (xbzrle_init()) {
2881 ram_state_cleanup(rsp);
2882 return -1;
2885 ram_init_bitmaps(*rsp);
2887 return 0;
2890 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2892 RAMBlock *block;
2893 uint64_t pages = 0;
2896 * Postcopy is not using xbzrle/compression, so no need for that.
2897 * Also, since the source is already halted, we don't need to care
2898 * about dirty page logging either.
2901 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2902 pages += bitmap_count_one(block->bmap,
2903 block->used_length >> TARGET_PAGE_BITS);
2906 /* This may not be aligned with current bitmaps. Recalculate. */
2907 rs->migration_dirty_pages = pages;
2909 ram_state_reset(rs);
2911 /* Update RAMState cache of output QEMUFile */
2912 rs->f = out;
2914 trace_ram_state_resume_prepare(pages);
2918 * This function clears bits of the free pages reported by the caller from the
2919 * migration dirty bitmap. @addr is the host address corresponding to the
2920 * start of the contiguous guest free pages, and @len is the total bytes of
2921 * those pages.
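/*
 * The hinted range may span several RAMBlocks; the loop below clips each
 * chunk to its containing block (used_len) and then, under bitmap_mutex,
 * clears both the deferred memory-region bitmap and block->bmap for the
 * covered pages, adjusting migration_dirty_pages accordingly.
 */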
2923 void qemu_guest_free_page_hint(void *addr, size_t len)
2925 RAMBlock *block;
2926 ram_addr_t offset;
2927 size_t used_len, start, npages;
2928 MigrationState *s = migrate_get_current();
2930 /* This function is currently expected to be used during live migration */
2931 if (!migration_is_setup_or_active(s->state)) {
2932 return;
2935 for (; len > 0; len -= used_len, addr += used_len) {
2936 block = qemu_ram_block_from_host(addr, false, &offset);
2937 if (unlikely(!block || offset >= block->used_length)) {
2939 * The implementation might not support RAMBlock resize during
2940 * live migration, but it could happen in theory with future
2941 * updates. So we add a check here to capture that case.
2943 error_report_once("%s unexpected error", __func__);
2944 return;
2947 if (len <= block->used_length - offset) {
2948 used_len = len;
2949 } else {
2950 used_len = block->used_length - offset;
2953 start = offset >> TARGET_PAGE_BITS;
2954 npages = used_len >> TARGET_PAGE_BITS;
2956 qemu_mutex_lock(&ram_state->bitmap_mutex);
2958 * The skipped free pages are equivalent to having been sent from clear_bmap's
2959 * perspective, so clear the bits from the memory region bitmap which
2960 * are initially set. Otherwise those skipped pages will be sent in
2961 * the next round after syncing from the memory region bitmap.
2963 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2964 ram_state->migration_dirty_pages -=
2965 bitmap_count_one_with_offset(block->bmap, start, npages);
2966 bitmap_clear(block->bmap, start, npages);
2967 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2972 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2973 * long-running RCU critical section. When RCU reclaims in the code
2974 * start to become numerous it will be necessary to reduce the
2975 * granularity of these critical sections.
2979 * ram_save_setup: Setup RAM for migration
2981 * Returns zero to indicate success and negative for error
2983 * @f: QEMUFile where to send the data
2984 * @opaque: RAMState pointer
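/*
 * The setup stream written below is: the total RAM size tagged with
 * RAM_SAVE_FLAG_MEM_SIZE, then for each migratable block its idstr length,
 * idstr and used_length, optionally its page size (when postcopy is enabled
 * and the block's page size differs from the host page size) and optionally
 * its GPA (when ignore-shared is enabled), terminated by RAM_SAVE_FLAG_EOS.
 */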
2986 static int ram_save_setup(QEMUFile *f, void *opaque)
2988 RAMState **rsp = opaque;
2989 RAMBlock *block;
2991 if (compress_threads_save_setup()) {
2992 return -1;
2995 /* migration has already setup the bitmap, reuse it. */
2996 if (!migration_in_colo_state()) {
2997 if (ram_init_all(rsp) != 0) {
2998 compress_threads_save_cleanup();
2999 return -1;
3002 (*rsp)->f = f;
3004 WITH_RCU_READ_LOCK_GUARD() {
3005 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3007 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3008 qemu_put_byte(f, strlen(block->idstr));
3009 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3010 qemu_put_be64(f, block->used_length);
3011 if (migrate_postcopy_ram() && block->page_size !=
3012 qemu_host_page_size) {
3013 qemu_put_be64(f, block->page_size);
3015 if (migrate_ignore_shared()) {
3016 qemu_put_be64(f, block->mr->addr);
3021 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3022 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3024 multifd_send_sync_main(f);
3025 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3026 qemu_fflush(f);
3028 return 0;
3032 * ram_save_iterate: iterative stage for migration
3034 * Returns zero to indicate success and negative for error
3036 * @f: QEMUFile where to send the data
3037 * @opaque: RAMState pointer
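/*
 * Return value convention: 1 when there is currently nothing left to send,
 * 0 when the iteration stopped early (e.g. rate limiting or MAX_WAIT), and
 * a negative value when a stream error was detected.
 */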
3039 static int ram_save_iterate(QEMUFile *f, void *opaque)
3041 RAMState **temp = opaque;
3042 RAMState *rs = *temp;
3043 int ret = 0;
3044 int i;
3045 int64_t t0;
3046 int done = 0;
3048 if (blk_mig_bulk_active()) {
3049 /* Avoid transferring ram during bulk phase of block migration as
3050 * the bulk phase will usually take a long time and transferring
3051 * ram updates during that time is pointless. */
3052 goto out;
3056 * We'll hold this lock for a while, but it's okay for two reasons.
3057 * Firstly, the only other thread that may take it is whoever calls
3058 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3059 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3060 * guarantees that we'll at least release it on a regular basis.
3062 qemu_mutex_lock(&rs->bitmap_mutex);
3063 WITH_RCU_READ_LOCK_GUARD() {
3064 if (ram_list.version != rs->last_version) {
3065 ram_state_reset(rs);
3068 /* Read version before ram_list.blocks */
3069 smp_rmb();
3071 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3073 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3074 i = 0;
3075 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3076 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3077 int pages;
3079 if (qemu_file_get_error(f)) {
3080 break;
3083 pages = ram_find_and_save_block(rs, false);
3084 /* no more pages to send */
3085 if (pages == 0) {
3086 done = 1;
3087 break;
3090 if (pages < 0) {
3091 qemu_file_set_error(f, pages);
3092 break;
3095 rs->target_page_count += pages;
3098 * During postcopy, it is necessary to make sure one whole host
3099 * page is sent in one chunk.
3101 if (migrate_postcopy_ram()) {
3102 flush_compressed_data(rs);
3106 * We want to check in the 1st loop, just in case it was the 1st
3107 * time and we had to sync the dirty bitmap.
3108 * qemu_clock_get_ns() is a bit expensive, so we only check once
3109 * every few iterations
3111 if ((i & 63) == 0) {
3112 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3113 1000000;
3114 if (t1 > MAX_WAIT) {
3115 trace_ram_save_iterate_big_wait(t1, i);
3116 break;
3119 i++;
3122 qemu_mutex_unlock(&rs->bitmap_mutex);
3125 * Must occur before EOS (or any QEMUFile operation)
3126 * because of RDMA protocol.
3128 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3130 out:
3131 if (ret >= 0
3132 && migration_is_setup_or_active(migrate_get_current()->state)) {
3133 multifd_send_sync_main(rs->f);
3134 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3135 qemu_fflush(f);
3136 ram_counters.transferred += 8;
3138 ret = qemu_file_get_error(f);
3140 if (ret < 0) {
3141 return ret;
3144 return done;
3148 * ram_save_complete: function called to send the remaining amount of ram
3150 * Returns zero to indicate success or negative on error
3152 * Called with iothread lock
3154 * @f: QEMUFile where to send the data
3155 * @opaque: RAMState pointer
3157 static int ram_save_complete(QEMUFile *f, void *opaque)
3159 RAMState **temp = opaque;
3160 RAMState *rs = *temp;
3161 int ret = 0;
3163 WITH_RCU_READ_LOCK_GUARD() {
3164 if (!migration_in_postcopy()) {
3165 migration_bitmap_sync_precopy(rs);
3168 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3170 /* try transferring iterative blocks of memory */
3172 /* flush all remaining blocks regardless of rate limiting */
3173 while (true) {
3174 int pages;
3176 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3178 /* no more blocks to send */
3178 if (pages == 0) {
3179 break;
3181 if (pages < 0) {
3182 ret = pages;
3183 break;
3187 flush_compressed_data(rs);
3188 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3191 if (ret >= 0) {
3192 multifd_send_sync_main(rs->f);
3193 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3194 qemu_fflush(f);
3197 return ret;
3200 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3201 uint64_t *res_precopy_only,
3202 uint64_t *res_compatible,
3203 uint64_t *res_postcopy_only)
3205 RAMState **temp = opaque;
3206 RAMState *rs = *temp;
3207 uint64_t remaining_size;
3209 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3211 if (!migration_in_postcopy() &&
3212 remaining_size < max_size) {
3213 qemu_mutex_lock_iothread();
3214 WITH_RCU_READ_LOCK_GUARD() {
3215 migration_bitmap_sync_precopy(rs);
3217 qemu_mutex_unlock_iothread();
3218 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3221 if (migrate_postcopy_ram()) {
3222 /* We can do postcopy, and all the data is postcopiable */
3223 *res_compatible += remaining_size;
3224 } else {
3225 *res_precopy_only += remaining_size;
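/*
 * Wire format consumed by load_xbzrle() below: a one-byte encoding flag
 * (must be ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length, then
 * the encoded bytes, which are applied as a delta on top of the current
 * contents of the target page at 'host'.
 */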
3229 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3231 unsigned int xh_len;
3232 int xh_flags;
3233 uint8_t *loaded_data;
3235 /* extract RLE header */
3236 xh_flags = qemu_get_byte(f);
3237 xh_len = qemu_get_be16(f);
3239 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3240 error_report("Failed to load XBZRLE page - wrong compression!");
3241 return -1;
3244 if (xh_len > TARGET_PAGE_SIZE) {
3245 error_report("Failed to load XBZRLE page - len overflow!");
3246 return -1;
3248 loaded_data = XBZRLE.decoded_buf;
3249 /* load data and decode */
3250 /* it can change loaded_data to point to an internal buffer */
3251 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3253 /* decode RLE */
3254 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3255 TARGET_PAGE_SIZE) == -1) {
3256 error_report("Failed to load XBZRLE page - decode error!");
3257 return -1;
3260 return 0;
3264 * ram_block_from_stream: read a RAMBlock id from the migration stream
3266 * Must be called from within a rcu critical section.
3268 * Returns a pointer from within the RCU-protected ram_list.
3270 * @f: QEMUFile where to read the data from
3271 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3273 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3275 static RAMBlock *block;
3276 char id[256];
3277 uint8_t len;
3279 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3280 if (!block) {
3281 error_report("Ack, bad migration stream!");
3282 return NULL;
3284 return block;
3287 len = qemu_get_byte(f);
3288 qemu_get_buffer(f, (uint8_t *)id, len);
3289 id[len] = 0;
3291 block = qemu_ram_block_by_name(id);
3292 if (!block) {
3293 error_report("Can't find block %s", id);
3294 return NULL;
3297 if (ramblock_is_ignored(block)) {
3298 error_report("block %s should not be migrated !", id);
3299 return NULL;
3302 return block;
3305 static inline void *host_from_ram_block_offset(RAMBlock *block,
3306 ram_addr_t offset)
3308 if (!offset_in_ramblock(block, offset)) {
3309 return NULL;
3312 return block->host + offset;
3315 static void *host_page_from_ram_block_offset(RAMBlock *block,
3316 ram_addr_t offset)
3318 /* Note: Explicitly no check against offset_in_ramblock(). */
3319 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3320 block->page_size);
3323 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3324 ram_addr_t offset)
3326 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3329 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3330 ram_addr_t offset, bool record_bitmap)
3332 if (!offset_in_ramblock(block, offset)) {
3333 return NULL;
3335 if (!block->colo_cache) {
3336 error_report("%s: colo_cache is NULL in block :%s",
3337 __func__, block->idstr);
3338 return NULL;
3342 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3343 * It helps us decide which pages in the RAM cache should be flushed
3344 * into the VM's RAM later.
3346 if (record_bitmap &&
3347 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3348 ram_state->migration_dirty_pages++;
3350 return block->colo_cache + offset;
3354 * ram_handle_compressed: handle the zero page case
3356 * If a page (or a whole RDMA chunk) has been
3357 * determined to be zero, then zap it.
3359 * @host: host address for the zero page
3360 * @ch: what the page is filled from. We only support zero
3361 * @size: size of the zero page
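/*
 * The memset below is skipped when the page already reads as zero, which
 * avoids needlessly dirtying (and, for anonymous memory, allocating) the
 * destination page.
 */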
3363 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3365 if (ch != 0 || !buffer_is_zero(host, size)) {
3366 memset(host, ch, size);
3370 /* return the size after decompression, or negative value on error */
3371 static int
3372 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3373 const uint8_t *source, size_t source_len)
3375 int err;
3377 err = inflateReset(stream);
3378 if (err != Z_OK) {
3379 return -1;
3382 stream->avail_in = source_len;
3383 stream->next_in = (uint8_t *)source;
3384 stream->avail_out = dest_len;
3385 stream->next_out = dest;
3387 err = inflate(stream, Z_NO_FLUSH);
3388 if (err != Z_STREAM_END) {
3389 return -1;
3392 return stream->total_out;
3395 static void *do_data_decompress(void *opaque)
3397 DecompressParam *param = opaque;
3398 unsigned long pagesize;
3399 uint8_t *des;
3400 int len, ret;
3402 qemu_mutex_lock(&param->mutex);
3403 while (!param->quit) {
3404 if (param->des) {
3405 des = param->des;
3406 len = param->len;
3407 param->des = 0;
3408 qemu_mutex_unlock(&param->mutex);
3410 pagesize = TARGET_PAGE_SIZE;
3412 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3413 param->compbuf, len);
3414 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3415 error_report("decompress data failed");
3416 qemu_file_set_error(decomp_file, ret);
3419 qemu_mutex_lock(&decomp_done_lock);
3420 param->done = true;
3421 qemu_cond_signal(&decomp_done_cond);
3422 qemu_mutex_unlock(&decomp_done_lock);
3424 qemu_mutex_lock(&param->mutex);
3425 } else {
3426 qemu_cond_wait(&param->cond, &param->mutex);
3429 qemu_mutex_unlock(&param->mutex);
3431 return NULL;
3434 static int wait_for_decompress_done(void)
3436 int idx, thread_count;
3438 if (!migrate_use_compression()) {
3439 return 0;
3442 thread_count = migrate_decompress_threads();
3443 qemu_mutex_lock(&decomp_done_lock);
3444 for (idx = 0; idx < thread_count; idx++) {
3445 while (!decomp_param[idx].done) {
3446 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3449 qemu_mutex_unlock(&decomp_done_lock);
3450 return qemu_file_get_error(decomp_file);
3453 static void compress_threads_load_cleanup(void)
3455 int i, thread_count;
3457 if (!migrate_use_compression()) {
3458 return;
3460 thread_count = migrate_decompress_threads();
3461 for (i = 0; i < thread_count; i++) {
3463 * we use it as an indicator of whether the thread is
3464 * properly initialized or not
3466 if (!decomp_param[i].compbuf) {
3467 break;
3470 qemu_mutex_lock(&decomp_param[i].mutex);
3471 decomp_param[i].quit = true;
3472 qemu_cond_signal(&decomp_param[i].cond);
3473 qemu_mutex_unlock(&decomp_param[i].mutex);
3475 for (i = 0; i < thread_count; i++) {
3476 if (!decomp_param[i].compbuf) {
3477 break;
3480 qemu_thread_join(decompress_threads + i);
3481 qemu_mutex_destroy(&decomp_param[i].mutex);
3482 qemu_cond_destroy(&decomp_param[i].cond);
3483 inflateEnd(&decomp_param[i].stream);
3484 g_free(decomp_param[i].compbuf);
3485 decomp_param[i].compbuf = NULL;
3487 g_free(decompress_threads);
3488 g_free(decomp_param);
3489 decompress_threads = NULL;
3490 decomp_param = NULL;
3491 decomp_file = NULL;
3494 static int compress_threads_load_setup(QEMUFile *f)
3496 int i, thread_count;
3498 if (!migrate_use_compression()) {
3499 return 0;
3502 thread_count = migrate_decompress_threads();
3503 decompress_threads = g_new0(QemuThread, thread_count);
3504 decomp_param = g_new0(DecompressParam, thread_count);
3505 qemu_mutex_init(&decomp_done_lock);
3506 qemu_cond_init(&decomp_done_cond);
3507 decomp_file = f;
3508 for (i = 0; i < thread_count; i++) {
3509 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3510 goto exit;
3513 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3514 qemu_mutex_init(&decomp_param[i].mutex);
3515 qemu_cond_init(&decomp_param[i].cond);
3516 decomp_param[i].done = true;
3517 decomp_param[i].quit = false;
3518 qemu_thread_create(decompress_threads + i, "decompress",
3519 do_data_decompress, decomp_param + i,
3520 QEMU_THREAD_JOINABLE);
3522 return 0;
3523 exit:
3524 compress_threads_load_cleanup();
3525 return -1;
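/*
 * Hand one compressed page to an idle decompression worker: the compressed
 * bytes are copied into that worker's compbuf and its condition variable is
 * signalled; if every worker is busy, wait on decomp_done_cond until one
 * finishes.
 */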
3528 static void decompress_data_with_multi_threads(QEMUFile *f,
3529 void *host, int len)
3531 int idx, thread_count;
3533 thread_count = migrate_decompress_threads();
3534 QEMU_LOCK_GUARD(&decomp_done_lock);
3535 while (true) {
3536 for (idx = 0; idx < thread_count; idx++) {
3537 if (decomp_param[idx].done) {
3538 decomp_param[idx].done = false;
3539 qemu_mutex_lock(&decomp_param[idx].mutex);
3540 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3541 decomp_param[idx].des = host;
3542 decomp_param[idx].len = len;
3543 qemu_cond_signal(&decomp_param[idx].cond);
3544 qemu_mutex_unlock(&decomp_param[idx].mutex);
3545 break;
3548 if (idx < thread_count) {
3549 break;
3550 } else {
3551 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3556 static void colo_init_ram_state(void)
3558 ram_state_init(&ram_state);
3562 * COLO cache: this is for the secondary VM; we cache the whole
3563 * memory of the secondary VM. The global lock must be held
3564 * to call this helper.
3566 int colo_init_ram_cache(void)
3568 RAMBlock *block;
3570 WITH_RCU_READ_LOCK_GUARD() {
3571 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3572 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3573 NULL, false, false);
3574 if (!block->colo_cache) {
3575 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3576 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3577 block->used_length);
3578 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3579 if (block->colo_cache) {
3580 qemu_anon_ram_free(block->colo_cache, block->used_length);
3581 block->colo_cache = NULL;
3584 return -errno;
3586 if (!machine_dump_guest_core(current_machine)) {
3587 qemu_madvise(block->colo_cache, block->used_length,
3588 QEMU_MADV_DONTDUMP);
3594 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3595 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3596 * we use the same name 'ram_bitmap' as for migration.
3598 if (ram_bytes_total()) {
3599 RAMBlock *block;
3601 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3602 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3603 block->bmap = bitmap_new(pages);
3607 colo_init_ram_state();
3608 return 0;
3611 /* TODO: duplicated with ram_init_bitmaps */
3612 void colo_incoming_start_dirty_log(void)
3614 RAMBlock *block = NULL;
3615 /* For memory_global_dirty_log_start below. */
3616 qemu_mutex_lock_iothread();
3617 qemu_mutex_lock_ramlist();
3619 memory_global_dirty_log_sync();
3620 WITH_RCU_READ_LOCK_GUARD() {
3621 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3622 ramblock_sync_dirty_bitmap(ram_state, block);
3623 /* Discard this dirty bitmap record */
3624 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3626 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3628 ram_state->migration_dirty_pages = 0;
3629 qemu_mutex_unlock_ramlist();
3630 qemu_mutex_unlock_iothread();
3633 /* The global lock must be held to call this helper */
3634 void colo_release_ram_cache(void)
3636 RAMBlock *block;
3638 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3639 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3640 g_free(block->bmap);
3641 block->bmap = NULL;
3644 WITH_RCU_READ_LOCK_GUARD() {
3645 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3646 if (block->colo_cache) {
3647 qemu_anon_ram_free(block->colo_cache, block->used_length);
3648 block->colo_cache = NULL;
3652 ram_state_cleanup(&ram_state);
3656 * ram_load_setup: Setup RAM for migration incoming side
3658 * Returns zero to indicate success and negative for error
3660 * @f: QEMUFile where to receive the data
3661 * @opaque: RAMState pointer
3663 static int ram_load_setup(QEMUFile *f, void *opaque)
3665 if (compress_threads_load_setup(f)) {
3666 return -1;
3669 xbzrle_load_setup();
3670 ramblock_recv_map_init();
3672 return 0;
3675 static int ram_load_cleanup(void *opaque)
3677 RAMBlock *rb;
3679 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3680 qemu_ram_block_writeback(rb);
3683 xbzrle_load_cleanup();
3684 compress_threads_load_cleanup();
3686 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3687 g_free(rb->receivedmap);
3688 rb->receivedmap = NULL;
3691 return 0;
3695 * ram_postcopy_incoming_init: allocate postcopy data structures
3697 * Returns 0 for success and negative if there was an error
3699 * @mis: current migration incoming state
3701 * Allocate data structures etc. needed by incoming migration with
3702 * postcopy-ram. postcopy-ram's similarly named
3703 * postcopy_ram_incoming_init does the work.
3705 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3707 return postcopy_ram_incoming_init(mis);
3711 * ram_load_postcopy: load a page in postcopy case
3713 * Returns 0 for success or -errno in case of error
3715 * Called in postcopy mode by ram_load().
3716 * rcu_read_lock is taken prior to this being called.
3718 * @f: QEMUFile to receive the data from
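/*
 * Target pages are accumulated in mis->postcopy_tmp_page until all
 * (block->page_size / TARGET_PAGE_SIZE) constituents of a host page have
 * arrived; only then is the host page placed atomically with
 * postcopy_place_page() or postcopy_place_page_zero().
 */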
3720 static int ram_load_postcopy(QEMUFile *f)
3722 int flags = 0, ret = 0;
3723 bool place_needed = false;
3724 bool matches_target_page_size = false;
3725 MigrationIncomingState *mis = migration_incoming_get_current();
3726 /* Temporary page that is later 'placed' */
3727 void *postcopy_host_page = mis->postcopy_tmp_page;
3728 void *host_page = NULL;
3729 bool all_zero = true;
3730 int target_pages = 0;
3732 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3733 ram_addr_t addr;
3734 void *page_buffer = NULL;
3735 void *place_source = NULL;
3736 RAMBlock *block = NULL;
3737 uint8_t ch;
3738 int len;
3740 addr = qemu_get_be64(f);
3743 * If there is a QEMUFile error, we should stop here, as "addr"
3744 * may be invalid
3746 ret = qemu_file_get_error(f);
3747 if (ret) {
3748 break;
3751 flags = addr & ~TARGET_PAGE_MASK;
3752 addr &= TARGET_PAGE_MASK;
3754 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3755 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3756 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3757 block = ram_block_from_stream(f, flags);
3758 if (!block) {
3759 ret = -EINVAL;
3760 break;
3764 * Relying on used_length is racy and can result in false positives.
3765 * We might place pages beyond used_length in case RAM was shrunk
3766 * while in postcopy, which is fine - trying to place via
3767 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3769 if (!block->host || addr >= block->postcopy_length) {
3770 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3771 ret = -EINVAL;
3772 break;
3774 target_pages++;
3775 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3777 * Postcopy requires that we place whole host pages atomically;
3778 * these may be huge pages for RAMBlocks that are backed by
3779 * hugetlbfs.
3780 * To make it atomic, the data is read into a temporary page
3781 * that's moved into place later.
3782 * The migration protocol uses, possibly smaller, target-pages
3783 * however the source ensures it always sends all the components
3784 * of a host page in one chunk.
3786 page_buffer = postcopy_host_page +
3787 host_page_offset_from_ram_block_offset(block, addr);
3788 /* If all TP are zero then we can optimise the place */
3789 if (target_pages == 1) {
3790 host_page = host_page_from_ram_block_offset(block, addr);
3791 } else if (host_page != host_page_from_ram_block_offset(block,
3792 addr)) {
3793 /* not the 1st TP within the HP */
3794 error_report("Non-same host page %p/%p", host_page,
3795 host_page_from_ram_block_offset(block, addr));
3796 ret = -EINVAL;
3797 break;
3801 * If it's the last part of a host page then we place the host
3802 * page
3804 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3805 place_needed = true;
3807 place_source = postcopy_host_page;
3810 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3811 case RAM_SAVE_FLAG_ZERO:
3812 ch = qemu_get_byte(f);
3814 * We can skip setting page_buffer when
3815 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3817 if (ch || !matches_target_page_size) {
3818 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3820 if (ch) {
3821 all_zero = false;
3823 break;
3825 case RAM_SAVE_FLAG_PAGE:
3826 all_zero = false;
3827 if (!matches_target_page_size) {
3828 /* For huge pages, we always use temporary buffer */
3829 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3830 } else {
3832 * For small pages that match the target page size, we
3833 * avoid the qemu_file copy. Instead we directly use
3834 * the buffer of QEMUFile to place the page. Note: we
3835 * cannot do any QEMUFile operation before using that
3836 * buffer to make sure the buffer is valid when
3837 * placing the page.
3839 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3840 TARGET_PAGE_SIZE);
3842 break;
3843 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3844 all_zero = false;
3845 len = qemu_get_be32(f);
3846 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3847 error_report("Invalid compressed data length: %d", len);
3848 ret = -EINVAL;
3849 break;
3851 decompress_data_with_multi_threads(f, page_buffer, len);
3852 break;
3854 case RAM_SAVE_FLAG_EOS:
3855 /* normal exit */
3856 multifd_recv_sync_main();
3857 break;
3858 default:
3859 error_report("Unknown combination of migration flags: 0x%x"
3860 " (postcopy mode)", flags);
3861 ret = -EINVAL;
3862 break;
3865 /* Got the whole host page, wait for decompress before placing. */
3866 if (place_needed) {
3867 ret |= wait_for_decompress_done();
3870 /* Detect for any possible file errors */
3871 if (!ret && qemu_file_get_error(f)) {
3872 ret = qemu_file_get_error(f);
3875 if (!ret && place_needed) {
3876 if (all_zero) {
3877 ret = postcopy_place_page_zero(mis, host_page, block);
3878 } else {
3879 ret = postcopy_place_page(mis, host_page, place_source,
3880 block);
3882 place_needed = false;
3883 target_pages = 0;
3884 /* Assume we have a zero page until we detect something different */
3885 all_zero = true;
3889 return ret;
3892 static bool postcopy_is_advised(void)
3894 PostcopyState ps = postcopy_state_get();
3895 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3898 static bool postcopy_is_running(void)
3900 PostcopyState ps = postcopy_state_get();
3901 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3905 * Flush the content of the RAM cache into the SVM's memory.
3906 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
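/*
 * After syncing the dirty bitmap, the loop below walks each block with
 * colo_bitmap_find_dirty(), which returns a run of 'num' contiguous dirty
 * pages; those bits are cleared and the pages are copied from colo_cache
 * back into the block's host memory in a single memcpy.
 */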
3908 void colo_flush_ram_cache(void)
3910 RAMBlock *block = NULL;
3911 void *dst_host;
3912 void *src_host;
3913 unsigned long offset = 0;
3915 memory_global_dirty_log_sync();
3916 WITH_RCU_READ_LOCK_GUARD() {
3917 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3918 ramblock_sync_dirty_bitmap(ram_state, block);
3922 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3923 WITH_RCU_READ_LOCK_GUARD() {
3924 block = QLIST_FIRST_RCU(&ram_list.blocks);
3926 while (block) {
3927 unsigned long num = 0;
3929 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3930 if (!offset_in_ramblock(block,
3931 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3932 offset = 0;
3933 num = 0;
3934 block = QLIST_NEXT_RCU(block, next);
3935 } else {
3936 unsigned long i = 0;
3938 for (i = 0; i < num; i++) {
3939 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3941 dst_host = block->host
3942 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3943 src_host = block->colo_cache
3944 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3945 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3946 offset += num;
3950 trace_colo_flush_ram_cache_end();
3954 * ram_load_precopy: load pages in precopy case
3956 * Returns 0 for success or -errno in case of error
3958 * Called in precopy mode by ram_load().
3959 * rcu_read_lock is taken prior to this being called.
3961 * @f: QEMUFile to receive the data from
3963 static int ram_load_precopy(QEMUFile *f)
3965 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3966 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3967 bool postcopy_advised = postcopy_is_advised();
3968 if (!migrate_use_compression()) {
3969 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3972 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3973 ram_addr_t addr, total_ram_bytes;
3974 void *host = NULL, *host_bak = NULL;
3975 uint8_t ch;
3978 * Yield periodically to let the main loop run, but an iteration of
3979 * the main loop is expensive, so only do it every so many iterations
3981 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3982 aio_co_schedule(qemu_get_current_aio_context(),
3983 qemu_coroutine_self());
3984 qemu_coroutine_yield();
3986 i++;
3988 addr = qemu_get_be64(f);
3989 flags = addr & ~TARGET_PAGE_MASK;
3990 addr &= TARGET_PAGE_MASK;
3992 if (flags & invalid_flags) {
3993 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3994 error_report("Received an unexpected compressed page");
3997 ret = -EINVAL;
3998 break;
4001 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4002 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4003 RAMBlock *block = ram_block_from_stream(f, flags);
4005 host = host_from_ram_block_offset(block, addr);
4007 * After entering the COLO stage, we should not load pages
4008 * into the SVM's memory directly; we put them into colo_cache first.
4009 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4010 * Previously, we copied all this memory in the COLO preparation stage,
4011 * during which the VM had to be stopped, a time-consuming process.
4012 * Here we optimize it with a trick: back up every page during the
4013 * migration process while COLO is enabled. This affects the
4014 * speed of the migration, but it clearly reduces the downtime of
4015 * backing up all of the SVM's memory in the COLO preparation stage.
4017 if (migration_incoming_colo_enabled()) {
4018 if (migration_incoming_in_colo_state()) {
4019 /* In COLO stage, put all pages into cache temporarily */
4020 host = colo_cache_from_block_offset(block, addr, true);
4021 } else {
4023 * In the migration stage but before the COLO stage,
4024 * put all pages into both the cache and the SVM's memory.
4026 host_bak = colo_cache_from_block_offset(block, addr, false);
4029 if (!host) {
4030 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4031 ret = -EINVAL;
4032 break;
4034 if (!migration_incoming_in_colo_state()) {
4035 ramblock_recv_bitmap_set(block, host);
4038 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4041 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4042 case RAM_SAVE_FLAG_MEM_SIZE:
4043 /* Synchronize RAM block list */
4044 total_ram_bytes = addr;
4045 while (!ret && total_ram_bytes) {
4046 RAMBlock *block;
4047 char id[256];
4048 ram_addr_t length;
4050 len = qemu_get_byte(f);
4051 qemu_get_buffer(f, (uint8_t *)id, len);
4052 id[len] = 0;
4053 length = qemu_get_be64(f);
4055 block = qemu_ram_block_by_name(id);
4056 if (block && !qemu_ram_is_migratable(block)) {
4057 error_report("block %s should not be migrated !", id);
4058 ret = -EINVAL;
4059 } else if (block) {
4060 if (length != block->used_length) {
4061 Error *local_err = NULL;
4063 ret = qemu_ram_resize(block, length,
4064 &local_err);
4065 if (local_err) {
4066 error_report_err(local_err);
4069 /* For postcopy we need to check hugepage sizes match */
4070 if (postcopy_advised && migrate_postcopy_ram() &&
4071 block->page_size != qemu_host_page_size) {
4072 uint64_t remote_page_size = qemu_get_be64(f);
4073 if (remote_page_size != block->page_size) {
4074 error_report("Mismatched RAM page size %s "
4075 "(local) %zd != %" PRId64,
4076 id, block->page_size,
4077 remote_page_size);
4078 ret = -EINVAL;
4081 if (migrate_ignore_shared()) {
4082 hwaddr addr = qemu_get_be64(f);
4083 if (ramblock_is_ignored(block) &&
4084 block->mr->addr != addr) {
4085 error_report("Mismatched GPAs for block %s "
4086 "%" PRId64 "!= %" PRId64,
4087 id, (uint64_t)addr,
4088 (uint64_t)block->mr->addr);
4089 ret = -EINVAL;
4092 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4093 block->idstr);
4094 } else {
4095 error_report("Unknown ramblock \"%s\", cannot "
4096 "accept migration", id);
4097 ret = -EINVAL;
4100 total_ram_bytes -= length;
4102 break;
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

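/*
 * The .load_state handler for the "ram" section: dispatch to the postcopy
 * or precopy load path depending on whether postcopy is currently running.
 */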
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclaims in this code become numerous, it will be
     * necessary to reduce the granularity of this critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

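/*
 * The .has_postcopy handler: postcopy is refused if any non-ignored RAM
 * block is backed by persistent memory (pmem); otherwise it follows the
 * postcopy-ram capability.
 */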
static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps have been synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

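/*
 * Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose received bitmap has been reloaded.
 */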
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match that of our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

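/*
 * The .resume_prepare handler: before a paused postcopy migration resumes,
 * pull the received bitmaps back from the destination and let
 * ram_state_resume_prepare() rebuild the dirty-page accounting.
 */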
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

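/* Live migration callbacks for RAM, registered in ram_mig_init(). */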
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

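/*
 * RAMBlockNotifier callback: a RAM block was resized. Cancel an in-progress
 * precopy on the source, and keep the incoming postcopy state consistent
 * with the new size.
 */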
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

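/* Register the RAM migration handlers and the RAM block resize notifier. */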
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}