migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/main-loop.h"
  34 #include "xbzrle.h"
  35 #include "ram.h"
  36 #include "migration.h"
  37 #include "migration/register.h"
  38 #include "migration/misc.h"
  39 #include "qemu-file.h"
  40 #include "postcopy-ram.h"
  41 #include "page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "qapi/error.h"
  44 #include "qapi/qapi-types-migration.h"
  45 #include "qapi/qapi-events-migration.h"
  46 #include "qapi/qmp/qerror.h"
  47 #include "trace.h"
  48 #include "exec/ram_addr.h"
  49 #include "exec/target_page.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52 #include "block.h"
  53 #include "sysemu/cpu-throttle.h"
  54 #include "savevm.h"
  55 #include "qemu/iov.h"
  56 #include "multifd.h"
  57 #include "sysemu/runstate.h"
  58
  59 #if defined(__linux__)
  60 #include "qemu/userfaultfd.h"
  61 #endif /* defined(__linux__) */
  62
  63 /***********************************************************/
  64 /* ram save/restore */
  65
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was renamed.
 */
  71
/*
 * Flag bits of the RAM migration stream.  As documented on
 * save_page_header(), flags are carried in the lower bits of the page
 * offset word that precedes each page.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  81
  82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  83 {
  84     return buffer_is_zero(p, size);
  85 }
  86
/* Global XBZRLE statistics (cache misses, pages, bytes, overflows, rates) */
XBZRLECacheStats xbzrle_counters;

/*
 * Holds the XBZRLE page cache together with the scratch buffers used
 * while encoding/decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of the encoder) */
    uint8_t *encoded_buf;
    /* buffer holding a private copy of the page being encoded */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* a target page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 104
 105 static void XBZRLE_cache_lock(void)
 106 {
 107     if (migrate_use_xbzrle()) {
 108         qemu_mutex_lock(&XBZRLE.lock);
 109     }
 110 }
 111
 112 static void XBZRLE_cache_unlock(void)
 113 {
 114     if (migrate_use_xbzrle()) {
 115         qemu_mutex_unlock(&XBZRLE.lock);
 116     }
 117 }
 118
 119 /**
 120  * xbzrle_cache_resize: resize the xbzrle cache
 121  *
 122  * This function is called from migrate_params_apply in main
 123  * thread, possibly while a migration is in progress.  A running
 124  * migration may be using the cache and might finish during this call,
 125  * hence changes to the cache are protected by XBZRLE.lock().
 126  *
 127  * Returns 0 for success or -1 for error
 128  *
 129  * @new_size: new cache size
 130  * @errp: set *errp if the check failed, with reason
 131  */
 132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 133 {
 134     PageCache *new_cache;
 135     int64_t ret = 0;
 136
 137     /* Check for truncation */
 138     if (new_size != (size_t)new_size) {
 139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 140                    "exceeding address space");
 141         return -1;
 142     }
 143
 144     if (new_size == migrate_xbzrle_cache_size()) {
 145         /* nothing to do */
 146         return 0;
 147     }
 148
 149     XBZRLE_cache_lock();
 150
 151     if (XBZRLE.cache != NULL) {
 152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 153         if (!new_cache) {
 154             ret = -1;
 155             goto out;
 156         }
 157
 158         cache_fini(XBZRLE.cache);
 159         XBZRLE.cache = new_cache;
 160     }
 161 out:
 162     XBZRLE_cache_unlock();
 163     return ret;
 164 }
 165
 166 bool ramblock_is_ignored(RAMBlock *block)
 167 {
 168     return !qemu_ram_is_migratable(block) ||
 169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 170 }
 171
/*
 * The generic iterator is removed from here on; the visible users in
 * this file walk RAM blocks with RAMBLOCK_FOREACH_NOT_IGNORED instead.
 * NOTE(review): presumably done so that ignored blocks cannot be
 * iterated by accident - confirm against the header defining the macro.
 */
#undef RAMBLOCK_FOREACH
 173
 174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 175 {
 176     RAMBlock *block;
 177     int ret = 0;
 178
 179     RCU_READ_LOCK_GUARD();
 180
 181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 182         ret = func(block, opaque);
 183         if (ret) {
 184             break;
 185         }
 186     }
 187     return ret;
 188 }
 189
 190 static void ramblock_recv_map_init(void)
 191 {
 192     RAMBlock *rb;
 193
 194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 195         assert(!rb->receivedmap);
 196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 197     }
 198 }
 199
 200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 201 {
 202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 203                     rb->receivedmap);
 204 }
 205
 206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 207 {
 208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 209 }
 210
 211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 212 {
 213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 217                                     size_t nr)
 218 {
 219     bitmap_set_atomic(rb->receivedmap,
 220                       ramblock_recv_bitmap_offset(host_addr, rb),
 221                       nr);
 222 }
 223
/*
 * 64-bit end marker written right after the bitmap payload by
 * ramblock_recv_bitmap_send().
 */
#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 225
 226 /*
 227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 228  *
 229  * Returns >0 if success with sent bytes, or <0 if error.
 230  */
 231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 232                                   const char *block_name)
 233 {
 234     RAMBlock *block = qemu_ram_block_by_name(block_name);
 235     unsigned long *le_bitmap, nbits;
 236     uint64_t size;
 237
 238     if (!block) {
 239         error_report("%s: invalid block name: %s", __func__, block_name);
 240         return -1;
 241     }
 242
 243     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 244
 245     /*
 246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 247      * machines we may need 4 more bytes for padding (see below
 248      * comment). So extend it a bit before hand.
 249      */
 250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 251
 252     /*
 253      * Always use little endian when sending the bitmap. This is
 254      * required that when source and destination VMs are not using the
 255      * same endianness. (Note: big endian won't work.)
 256      */
 257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 258
 259     /* Size of the bitmap, in bytes */
 260     size = DIV_ROUND_UP(nbits, 8);
 261
 262     /*
 263      * size is always aligned to 8 bytes for 64bit machines, but it
 264      * may not be true for 32bit machines. We need this padding to
 265      * make sure the migration can survive even between 32bit and
 266      * 64bit machines.
 267      */
 268     size = ROUND_UP(size, 8);
 269
 270     qemu_put_be64(file, size);
 271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 272     /*
 273      * Mark as an end, in case the middle part is screwed up due to
 274      * some "mysterious" reason.
 275      */
 276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 277     qemu_fflush(file);
 278
 279     g_free(le_bitmap);
 280
 281     if (qemu_file_get_error(file)) {
 282         return qemu_file_get_error(file);
 283     }
 284
 285     return size + sizeof(size);
 286 }
 287
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Requests are linked into RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the byte offset of the request inside rb */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the request in bytes */
    hwaddr    len;

    /* queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 299
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data (see save_page_header()) */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* Last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;

    /* The following variables are used for bitmap sync */
    /* Last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* Bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* Number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;

    /* Compression statistics since the beginning of the period */
    /* Number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* Amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* Amount of compressed pages */
    uint64_t compress_pages_prev;

    /* Total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* Total handled target pages since start */
    uint64_t target_page_count;
    /* Number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 355
/* Current RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 359
 360 void precopy_infrastructure_init(void)
 361 {
 362     notifier_with_return_list_init(&precopy_notifier_list);
 363 }
 364
 365 void precopy_add_notifier(NotifierWithReturn *n)
 366 {
 367     notifier_with_return_list_add(&precopy_notifier_list, n);
 368 }
 369
 370 void precopy_remove_notifier(NotifierWithReturn *n)
 371 {
 372     notifier_with_return_remove(n);
 373 }
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 uint64_t ram_bytes_remaining(void)
 385 {
 386     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 387                        0;
 388 }
 389
/* Global RAM migration statistics (transferred bytes, page counts, ...) */
MigrationStats ram_counters;

/* Used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

/* Global statistics of the multi-thread compression */
CompressionStats compression_counters;
 404
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* tells the worker to leave its loop */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* dummy QEMUFile used as a buffer for the compressed data */
    QEMUFile *file;
    /* mutex/cond guard the hand-over of block/offset to the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress; block is reset to NULL when the worker takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 420
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are outside the visible part
 * of the file; field notes below are inferred from names - confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 432
/* Arrays with one entry per compression thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used together with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration; the returned bool is stored as zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 452
 453 static void *do_data_compress(void *opaque)
 454 {
 455     CompressParam *param = opaque;
 456     RAMBlock *block;
 457     ram_addr_t offset;
 458     bool zero_page;
 459
 460     qemu_mutex_lock(&param->mutex);
 461     while (!param->quit) {
 462         if (param->block) {
 463             block = param->block;
 464             offset = param->offset;
 465             param->block = NULL;
 466             qemu_mutex_unlock(&param->mutex);
 467
 468             zero_page = do_compress_ram_page(param->file, &param->stream,
 469                                              block, offset, param->originbuf);
 470
 471             qemu_mutex_lock(&comp_done_lock);
 472             param->done = true;
 473             param->zero_page = zero_page;
 474             qemu_cond_signal(&comp_done_cond);
 475             qemu_mutex_unlock(&comp_done_lock);
 476
 477             qemu_mutex_lock(&param->mutex);
 478         } else {
 479             qemu_cond_wait(&param->cond, &param->mutex);
 480         }
 481     }
 482     qemu_mutex_unlock(&param->mutex);
 483
 484     return NULL;
 485 }
 486
 487 static void compress_threads_save_cleanup(void)
 488 {
 489     int i, thread_count;
 490
 491     if (!migrate_use_compression() || !comp_param) {
 492         return;
 493     }
 494
 495     thread_count = migrate_compress_threads();
 496     for (i = 0; i < thread_count; i++) {
 497         /*
 498          * we use it as a indicator which shows if the thread is
 499          * properly init'd or not
 500          */
 501         if (!comp_param[i].file) {
 502             break;
 503         }
 504
 505         qemu_mutex_lock(&comp_param[i].mutex);
 506         comp_param[i].quit = true;
 507         qemu_cond_signal(&comp_param[i].cond);
 508         qemu_mutex_unlock(&comp_param[i].mutex);
 509
 510         qemu_thread_join(compress_threads + i);
 511         qemu_mutex_destroy(&comp_param[i].mutex);
 512         qemu_cond_destroy(&comp_param[i].cond);
 513         deflateEnd(&comp_param[i].stream);
 514         g_free(comp_param[i].originbuf);
 515         qemu_fclose(comp_param[i].file);
 516         comp_param[i].file = NULL;
 517     }
 518     qemu_mutex_destroy(&comp_done_lock);
 519     qemu_cond_destroy(&comp_done_cond);
 520     g_free(compress_threads);
 521     g_free(comp_param);
 522     compress_threads = NULL;
 523     comp_param = NULL;
 524 }
 525
 526 static int compress_threads_save_setup(void)
 527 {
 528     int i, thread_count;
 529
 530     if (!migrate_use_compression()) {
 531         return 0;
 532     }
 533     thread_count = migrate_compress_threads();
 534     compress_threads = g_new0(QemuThread, thread_count);
 535     comp_param = g_new0(CompressParam, thread_count);
 536     qemu_cond_init(&comp_done_cond);
 537     qemu_mutex_init(&comp_done_lock);
 538     for (i = 0; i < thread_count; i++) {
 539         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 540         if (!comp_param[i].originbuf) {
 541             goto exit;
 542         }
 543
 544         if (deflateInit(&comp_param[i].stream,
 545                         migrate_compress_level()) != Z_OK) {
 546             g_free(comp_param[i].originbuf);
 547             goto exit;
 548         }
 549
 550         /* comp_param[i].file is just used as a dummy buffer to save data,
 551          * set its ops to empty.
 552          */
 553         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 554         comp_param[i].done = true;
 555         comp_param[i].quit = false;
 556         qemu_mutex_init(&comp_param[i].mutex);
 557         qemu_cond_init(&comp_param[i].cond);
 558         qemu_thread_create(compress_threads + i, "compress",
 559                            do_data_compress, comp_param + i,
 560                            QEMU_THREAD_JOINABLE);
 561     }
 562     return 0;
 563
 564 exit:
 565     compress_threads_save_cleanup();
 566     return -1;
 567 }
 568
 569 /**
 570  * save_page_header: write page header to wire
 571  *
 572  * If this is the 1st block, it also writes the block identification
 573  *
 574  * Returns the number of bytes written
 575  *
 576  * @f: QEMUFile where to send the data
 577  * @block: block that contains the page we want to send
 578  * @offset: offset inside the block for the page
 579  *          in the lower bits, it contains flags
 580  */
 581 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 582                                ram_addr_t offset)
 583 {
 584     size_t size, len;
 585
 586     if (block == rs->last_sent_block) {
 587         offset |= RAM_SAVE_FLAG_CONTINUE;
 588     }
 589     qemu_put_be64(f, offset);
 590     size = 8;
 591
 592     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 593         len = strlen(block->idstr);
 594         qemu_put_byte(f, len);
 595         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 596         size += 1 + len;
 597         rs->last_sent_block = block;
 598     }
 599     return size;
 600 }
 601
 602 /**
 603  * mig_throttle_guest_down: throotle down the guest
 604  *
 605  * Reduce amount of guest cpu execution to hopefully slow down memory
 606  * writes. If guest dirty memory rate is reduced below the rate at
 607  * which we can transfer pages to the destination then we should be
 608  * able to complete migration. Some workloads dirty memory way too
 609  * fast and will not effectively converge, even with auto-converge.
 610  */
 611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 612                                     uint64_t bytes_dirty_threshold)
 613 {
 614     MigrationState *s = migrate_get_current();
 615     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 616     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 617     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 618     int pct_max = s->parameters.max_cpu_throttle;
 619
 620     uint64_t throttle_now = cpu_throttle_get_percentage();
 621     uint64_t cpu_now, cpu_ideal, throttle_inc;
 622
 623     /* We have not started throttling yet. Let's start it. */
 624     if (!cpu_throttle_active()) {
 625         cpu_throttle_set(pct_initial);
 626     } else {
 627         /* Throttling already on, just increase the rate */
 628         if (!pct_tailslow) {
 629             throttle_inc = pct_increment;
 630         } else {
 631             /* Compute the ideal CPU percentage used by Guest, which may
 632              * make the dirty rate match the dirty rate threshold. */
 633             cpu_now = 100 - throttle_now;
 634             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 635                         bytes_dirty_period);
 636             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 637         }
 638         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 639     }
 640 }
 641
 642 /**
 643  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 644  *
 645  * @rs: current RAM state
 646  * @current_addr: address for the zero page
 647  *
 648  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 649  * The important thing is that a stale (not-yet-0'd) page be replaced
 650  * by the new data.
 651  * As a bonus, if the page wasn't in the cache it gets added so that
 652  * when a small write is made into the 0'd page it gets XBZRLE sent.
 653  */
 654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 655 {
 656     if (!rs->xbzrle_enabled) {
 657         return;
 658     }
 659
 660     /* We don't care if this fails to allocate a new cache page
 661      * as long as it updated an old one */
 662     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 663                  ram_counters.dirty_sync_count);
 664 }
 665
/* Byte written by save_xbzrle_page() right after the page header */
#define ENCODING_FLAG_XBZRLE 0x1
 667
 668 /**
 669  * save_xbzrle_page: compress and send current page
 670  *
 671  * Returns: 1 means that we wrote the page
 672  *          0 means that page is identical to the one already sent
 673  *          -1 means that xbzrle would be longer than normal
 674  *
 675  * @rs: current RAM state
 676  * @current_data: pointer to the address of the page contents
 677  * @current_addr: addr of the page
 678  * @block: block that contains the page we want to send
 679  * @offset: offset inside the block for the page
 680  * @last_stage: if we are at the completion stage
 681  */
 682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 683                             ram_addr_t current_addr, RAMBlock *block,
 684                             ram_addr_t offset, bool last_stage)
 685 {
 686     int encoded_len = 0, bytes_xbzrle;
 687     uint8_t *prev_cached_page;
 688
 689     if (!cache_is_cached(XBZRLE.cache, current_addr,
 690                          ram_counters.dirty_sync_count)) {
 691         xbzrle_counters.cache_miss++;
 692         if (!last_stage) {
 693             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 694                              ram_counters.dirty_sync_count) == -1) {
 695                 return -1;
 696             } else {
 697                 /* update *current_data when the page has been
 698                    inserted into cache */
 699                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 700             }
 701         }
 702         return -1;
 703     }
 704
 705     /*
 706      * Reaching here means the page has hit the xbzrle cache, no matter what
 707      * encoding result it is (normal encoding, overflow or skipping the page),
 708      * count the page as encoded. This is used to calculate the encoding rate.
 709      *
 710      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 711      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 712      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 713      * skipped page included. In this way, the encoding rate can tell if the
 714      * guest page is good for xbzrle encoding.
 715      */
 716     xbzrle_counters.pages++;
 717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 718
 719     /* save current buffer into memory */
 720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 721
 722     /* XBZRLE encoding (if there is no overflow) */
 723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 725                                        TARGET_PAGE_SIZE);
 726
 727     /*
 728      * Update the cache contents, so that it corresponds to the data
 729      * sent, in all cases except where we skip the page.
 730      */
 731     if (!last_stage && encoded_len != 0) {
 732         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 733         /*
 734          * In the case where we couldn't compress, ensure that the caller
 735          * sends the data from the cache, since the guest might have
 736          * changed the RAM since we copied it.
 737          */
 738         *current_data = prev_cached_page;
 739     }
 740
 741     if (encoded_len == 0) {
 742         trace_save_xbzrle_page_skipping();
 743         return 0;
 744     } else if (encoded_len == -1) {
 745         trace_save_xbzrle_page_overflow();
 746         xbzrle_counters.overflow++;
 747         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 748         return -1;
 749     }
 750
 751     /* Send XBZRLE based compressed page */
 752     bytes_xbzrle = save_page_header(rs, rs->f, block,
 753                                     offset | RAM_SAVE_FLAG_XBZRLE);
 754     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 755     qemu_put_be16(rs->f, encoded_len);
 756     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 757     bytes_xbzrle += encoded_len + 1 + 2;
 758     /*
 759      * Like compressed_size (please see update_compress_thread_counts),
 760      * the xbzrle encoded bytes don't count the 8 byte header with
 761      * RAM_SAVE_FLAG_CONTINUE.
 762      */
 763     xbzrle_counters.bytes += bytes_xbzrle - 8;
 764     ram_counters.transferred += bytes_xbzrle;
 765
 766     return 1;
 767 }
 768
 769 /**
 770  * migration_bitmap_find_dirty: find the next dirty page from start
 771  *
 772  * Returns the page offset within memory region of the start of a dirty page
 773  *
 774  * @rs: current RAM state
 775  * @rb: RAMBlock where to search for dirty pages
 776  * @start: page where we start the search
 777  */
 778 static inline
 779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 780                                           unsigned long start)
 781 {
 782     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 783     unsigned long *bitmap = rb->bmap;
 784
 785     if (ramblock_is_ignored(rb)) {
 786         return size;
 787     }
 788
 789     return find_next_bit(bitmap, size, start);
 790 }
 791
 792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 793                                                 RAMBlock *rb,
 794                                                 unsigned long page)
 795 {
 796     bool ret;
 797
 798     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
 799
 800     /*
 801      * Clear dirty bitmap if needed.  This _must_ be called before we
 802      * send any of the page in the chunk because we need to make sure
 803      * we can capture further page content changes when we sync dirty
 804      * log the next time.  So as long as we are going to send any of
 805      * the page in the chunk we clear the remote dirty bitmap for all.
 806      * Clearing it earlier won't be a problem, but too late will.
 807      */
 808     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 809         uint8_t shift = rb->clear_bmap_shift;
 810         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 811         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 812
 813         /*
 814          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 815          * can make things easier sometimes since then start address
 816          * of the small chunk will always be 64 pages aligned so the
 817          * bitmap will always be aligned to unsigned long.  We should
 818          * even be able to remove this restriction but I'm simply
 819          * keeping it.
 820          */
 821         assert(shift >= 6);
 822         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 823         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 824     }
 825
 826     ret = test_and_clear_bit(page, rb->bmap);
 827
 828     if (ret) {
 829         rs->migration_dirty_pages--;
 830     }
 831
 832     return ret;
 833 }
 834
 835 /* Called with RCU critical section */
 836 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 837 {
 838     uint64_t new_dirty_pages =
 839         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 840
 841     rs->migration_dirty_pages += new_dirty_pages;
 842     rs->num_dirty_pages_period += new_dirty_pages;
 843 }
 844
 845 /**
 846  * ram_pagesize_summary: calculate all the pagesizes of a VM
 847  *
 848  * Returns a summary bitmap of the page sizes of all RAMBlocks
 849  *
 850  * For VMs with just normal pages this is equivalent to the host page
 851  * size. If it's got some huge pages then it's the OR of all the
 852  * different page sizes.
 853  */
 854 uint64_t ram_pagesize_summary(void)
 855 {
 856     RAMBlock *block;
 857     uint64_t summary = 0;
 858
 859     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 860         summary |= block->page_size;
 861     }
 862
 863     return summary;
 864 }
 865
 866 uint64_t ram_get_total_transferred_pages(void)
 867 {
 868     return  ram_counters.normal + ram_counters.duplicate +
 869                 compression_counters.pages + xbzrle_counters.pages;
 870 }
 871
 872 static void migration_update_rates(RAMState *rs, int64_t end_time)
 873 {
 874     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 875     double compressed_size;
 876
 877     /* calculate period counters */
 878     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 879                 / (end_time - rs->time_last_bitmap_sync);
 880
 881     if (!page_count) {
 882         return;
 883     }
 884
 885     if (migrate_use_xbzrle()) {
 886         double encoded_size, unencoded_size;
 887
 888         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 889             rs->xbzrle_cache_miss_prev) / page_count;
 890         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 891         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 892                          TARGET_PAGE_SIZE;
 893         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 894         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 895             xbzrle_counters.encoding_rate = 0;
 896         } else {
 897             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 898         }
 899         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 900         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 901     }
 902
 903     if (migrate_use_compression()) {
 904         compression_counters.busy_rate = (double)(compression_counters.busy -
 905             rs->compress_thread_busy_prev) / page_count;
 906         rs->compress_thread_busy_prev = compression_counters.busy;
 907
 908         compressed_size = compression_counters.compressed_size -
 909                           rs->compressed_size_prev;
 910         if (compressed_size) {
 911             double uncompressed_size = (compression_counters.pages -
 912                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 913
 914             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 915             compression_counters.compression_rate =
 916                                         uncompressed_size / compressed_size;
 917
 918             rs->compress_pages_prev = compression_counters.pages;
 919             rs->compressed_size_prev = compression_counters.compressed_size;
 920         }
 921     }
 922 }
 923
 924 static void migration_trigger_throttle(RAMState *rs)
 925 {
 926     MigrationState *s = migrate_get_current();
 927     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 928
 929     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 930     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 931     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 932
 933     /* During block migration the auto-converge logic incorrectly detects
 934      * that ram migration makes no progress. Avoid this by disabling the
 935      * throttling logic during the bulk phase of block migration. */
 936     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 937         /* The following detection logic can be refined later. For now:
 938            Check to see if the ratio between dirtied bytes and the approx.
 939            amount of bytes that just got transferred since the last time
 940            we were in this routine reaches the threshold. If that happens
 941            twice, start or increase throttling. */
 942
 943         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 944             (++rs->dirty_rate_high_cnt >= 2)) {
 945             trace_migration_throttle();
 946             rs->dirty_rate_high_cnt = 0;
 947             mig_throttle_guest_down(bytes_dirty_period,
 948                                     bytes_dirty_threshold);
 949         }
 950     }
 951 }
 952
 953 static void migration_bitmap_sync(RAMState *rs)
 954 {
 955     RAMBlock *block;
 956     int64_t end_time;
 957
 958     ram_counters.dirty_sync_count++;
 959
 960     if (!rs->time_last_bitmap_sync) {
 961         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 962     }
 963
 964     trace_migration_bitmap_sync_start();
 965     memory_global_dirty_log_sync();
 966
 967     qemu_mutex_lock(&rs->bitmap_mutex);
 968     WITH_RCU_READ_LOCK_GUARD() {
 969         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 970             ramblock_sync_dirty_bitmap(rs, block);
 971         }
 972         ram_counters.remaining = ram_bytes_remaining();
 973     }
 974     qemu_mutex_unlock(&rs->bitmap_mutex);
 975
 976     memory_global_after_dirty_log_sync();
 977     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 978
 979     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 980
 981     /* more than 1 second = 1000 millisecons */
 982     if (end_time > rs->time_last_bitmap_sync + 1000) {
 983         migration_trigger_throttle(rs);
 984
 985         migration_update_rates(rs, end_time);
 986
 987         rs->target_page_count_prev = rs->target_page_count;
 988
 989         /* reset period counters */
 990         rs->time_last_bitmap_sync = end_time;
 991         rs->num_dirty_pages_period = 0;
 992         rs->bytes_xfer_prev = ram_counters.transferred;
 993     }
 994     if (migrate_use_events()) {
 995         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
 996     }
 997 }
 998
 999 static void migration_bitmap_sync_precopy(RAMState *rs)
1000 {
1001     Error *local_err = NULL;
1002
1003     /*
1004      * The current notifier usage is just an optimization to migration, so we
1005      * don't stop the normal migration process in the error case.
1006      */
1007     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1008         error_report_err(local_err);
1009         local_err = NULL;
1010     }
1011
1012     migration_bitmap_sync(rs);
1013
1014     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1015         error_report_err(local_err);
1016     }
1017 }
1018
1019 /**
1020  * save_zero_page_to_file: send the zero page to the file
1021  *
1022  * Returns the size of data written to the file, 0 means the page is not
1023  * a zero page
1024  *
1025  * @rs: current RAM state
1026  * @file: the file where the data is saved
1027  * @block: block that contains the page we want to send
1028  * @offset: offset inside the block for the page
1029  */
1030 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1031                                   RAMBlock *block, ram_addr_t offset)
1032 {
1033     uint8_t *p = block->host + offset;
1034     int len = 0;
1035
1036     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1037         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1038         qemu_put_byte(file, 0);
1039         len += 1;
1040     }
1041     return len;
1042 }
1043
1044 /**
1045  * save_zero_page: send the zero page to the stream
1046  *
1047  * Returns the number of pages written.
1048  *
1049  * @rs: current RAM state
1050  * @block: block that contains the page we want to send
1051  * @offset: offset inside the block for the page
1052  */
1053 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1054 {
1055     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1056
1057     if (len) {
1058         ram_counters.duplicate++;
1059         ram_counters.transferred += len;
1060         return 1;
1061     }
1062     return -1;
1063 }
1064
1065 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1066 {
1067     if (!migrate_release_ram() || !migration_in_postcopy()) {
1068         return;
1069     }
1070
1071     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1072 }
1073
1074 /*
1075  * @pages: the number of pages written by the control path,
1076  *        < 0 - error
1077  *        > 0 - number of pages written
1078  *
1079  * Return true if the pages has been saved, otherwise false is returned.
1080  */
1081 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1082                               int *pages)
1083 {
1084     uint64_t bytes_xmit = 0;
1085     int ret;
1086
1087     *pages = -1;
1088     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1089                                 &bytes_xmit);
1090     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1091         return false;
1092     }
1093
1094     if (bytes_xmit) {
1095         ram_counters.transferred += bytes_xmit;
1096         *pages = 1;
1097     }
1098
1099     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1100         return true;
1101     }
1102
1103     if (bytes_xmit > 0) {
1104         ram_counters.normal++;
1105     } else if (bytes_xmit == 0) {
1106         ram_counters.duplicate++;
1107     }
1108
1109     return true;
1110 }
1111
1112 /*
1113  * directly send the page to the stream
1114  *
1115  * Returns the number of pages written.
1116  *
1117  * @rs: current RAM state
1118  * @block: block that contains the page we want to send
1119  * @offset: offset inside the block for the page
1120  * @buf: the page to be sent
1121  * @async: send to page asyncly
1122  */
1123 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1124                             uint8_t *buf, bool async)
1125 {
1126     ram_counters.transferred += save_page_header(rs, rs->f, block,
1127                                                  offset | RAM_SAVE_FLAG_PAGE);
1128     if (async) {
1129         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1130                               migrate_release_ram() &
1131                               migration_in_postcopy());
1132     } else {
1133         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1134     }
1135     ram_counters.transferred += TARGET_PAGE_SIZE;
1136     ram_counters.normal++;
1137     return 1;
1138 }
1139
1140 /**
1141  * ram_save_page: send the given page to the stream
1142  *
1143  * Returns the number of pages written.
1144  *          < 0 - error
1145  *          >=0 - Number of pages written - this might legally be 0
1146  *                if xbzrle noticed the page was the same.
1147  *
1148  * @rs: current RAM state
1149  * @block: block that contains the page we want to send
1150  * @offset: offset inside the block for the page
1151  * @last_stage: if we are at the completion stage
1152  */
1153 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1154 {
1155     int pages = -1;
1156     uint8_t *p;
1157     bool send_async = true;
1158     RAMBlock *block = pss->block;
1159     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1160     ram_addr_t current_addr = block->offset + offset;
1161
1162     p = block->host + offset;
1163     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1164
1165     XBZRLE_cache_lock();
1166     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1167         pages = save_xbzrle_page(rs, &p, current_addr, block,
1168                                  offset, last_stage);
1169         if (!last_stage) {
1170             /* Can't send this cached data async, since the cache page
1171              * might get updated before it gets to the wire
1172              */
1173             send_async = false;
1174         }
1175     }
1176
1177     /* XBZRLE overflow or normal page */
1178     if (pages == -1) {
1179         pages = save_normal_page(rs, block, offset, p, send_async);
1180     }
1181
1182     XBZRLE_cache_unlock();
1183
1184     return pages;
1185 }
1186
1187 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1188                                  ram_addr_t offset)
1189 {
1190     if (multifd_queue_page(rs->f, block, offset) < 0) {
1191         return -1;
1192     }
1193     ram_counters.normal++;
1194
1195     return 1;
1196 }
1197
1198 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1199                                  ram_addr_t offset, uint8_t *source_buf)
1200 {
1201     RAMState *rs = ram_state;
1202     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1203     bool zero_page = false;
1204     int ret;
1205
1206     if (save_zero_page_to_file(rs, f, block, offset)) {
1207         zero_page = true;
1208         goto exit;
1209     }
1210
1211     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1212
1213     /*
1214      * copy it to a internal buffer to avoid it being modified by VM
1215      * so that we can catch up the error during compression and
1216      * decompression
1217      */
1218     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1219     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1220     if (ret < 0) {
1221         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1222         error_report("compressed data failed!");
1223         return false;
1224     }
1225
1226 exit:
1227     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1228     return zero_page;
1229 }
1230
1231 static void
1232 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1233 {
1234     ram_counters.transferred += bytes_xmit;
1235
1236     if (param->zero_page) {
1237         ram_counters.duplicate++;
1238         return;
1239     }
1240
1241     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1242     compression_counters.compressed_size += bytes_xmit - 8;
1243     compression_counters.pages++;
1244 }
1245
1246 static bool save_page_use_compression(RAMState *rs);
1247
1248 static void flush_compressed_data(RAMState *rs)
1249 {
1250     int idx, len, thread_count;
1251
1252     if (!save_page_use_compression(rs)) {
1253         return;
1254     }
1255     thread_count = migrate_compress_threads();
1256
1257     qemu_mutex_lock(&comp_done_lock);
1258     for (idx = 0; idx < thread_count; idx++) {
1259         while (!comp_param[idx].done) {
1260             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1261         }
1262     }
1263     qemu_mutex_unlock(&comp_done_lock);
1264
1265     for (idx = 0; idx < thread_count; idx++) {
1266         qemu_mutex_lock(&comp_param[idx].mutex);
1267         if (!comp_param[idx].quit) {
1268             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1269             /*
1270              * it's safe to fetch zero_page without holding comp_done_lock
1271              * as there is no further request submitted to the thread,
1272              * i.e, the thread should be waiting for a request at this point.
1273              */
1274             update_compress_thread_counts(&comp_param[idx], len);
1275         }
1276         qemu_mutex_unlock(&comp_param[idx].mutex);
1277     }
1278 }
1279
1280 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1281                                        ram_addr_t offset)
1282 {
1283     param->block = block;
1284     param->offset = offset;
1285 }
1286
1287 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1288                                            ram_addr_t offset)
1289 {
1290     int idx, thread_count, bytes_xmit = -1, pages = -1;
1291     bool wait = migrate_compress_wait_thread();
1292
1293     thread_count = migrate_compress_threads();
1294     qemu_mutex_lock(&comp_done_lock);
1295 retry:
1296     for (idx = 0; idx < thread_count; idx++) {
1297         if (comp_param[idx].done) {
1298             comp_param[idx].done = false;
1299             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1300             qemu_mutex_lock(&comp_param[idx].mutex);
1301             set_compress_params(&comp_param[idx], block, offset);
1302             qemu_cond_signal(&comp_param[idx].cond);
1303             qemu_mutex_unlock(&comp_param[idx].mutex);
1304             pages = 1;
1305             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1306             break;
1307         }
1308     }
1309
1310     /*
1311      * wait for the free thread if the user specifies 'compress-wait-thread',
1312      * otherwise we will post the page out in the main thread as normal page.
1313      */
1314     if (pages < 0 && wait) {
1315         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1316         goto retry;
1317     }
1318     qemu_mutex_unlock(&comp_done_lock);
1319
1320     return pages;
1321 }
1322
1323 /**
1324  * find_dirty_block: find the next dirty page and update any state
1325  * associated with the search process.
1326  *
1327  * Returns true if a page is found
1328  *
1329  * @rs: current RAM state
1330  * @pss: data about the state of the current dirty page scan
1331  * @again: set to false if the search has scanned the whole of RAM
1332  */
1333 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1334 {
1335     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1336     if (pss->complete_round && pss->block == rs->last_seen_block &&
1337         pss->page >= rs->last_page) {
1338         /*
1339          * We've been once around the RAM and haven't found anything.
1340          * Give up.
1341          */
1342         *again = false;
1343         return false;
1344     }
1345     if (!offset_in_ramblock(pss->block,
1346                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1347         /* Didn't find anything in this RAM Block */
1348         pss->page = 0;
1349         pss->block = QLIST_NEXT_RCU(pss->block, next);
1350         if (!pss->block) {
1351             /*
1352              * If memory migration starts over, we will meet a dirtied page
1353              * which may still exists in compression threads's ring, so we
1354              * should flush the compressed data to make sure the new page
1355              * is not overwritten by the old one in the destination.
1356              *
1357              * Also If xbzrle is on, stop using the data compression at this
1358              * point. In theory, xbzrle can do better than compression.
1359              */
1360             flush_compressed_data(rs);
1361
1362             /* Hit the end of the list */
1363             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1364             /* Flag that we've looped */
1365             pss->complete_round = true;
1366             /* After the first round, enable XBZRLE. */
1367             if (migrate_use_xbzrle()) {
1368                 rs->xbzrle_enabled = true;
1369             }
1370         }
1371         /* Didn't find anything this time, but try again on the new block */
1372         *again = true;
1373         return false;
1374     } else {
1375         /* Can go around again, but... */
1376         *again = true;
1377         /* We've found something so probably don't need to */
1378         return true;
1379     }
1380 }
1381
1382 /**
1383  * unqueue_page: gets a page of the queue
1384  *
1385  * Helper for 'get_queued_page' - gets a page off the queue
1386  *
1387  * Returns the block of the page (or NULL if none available)
1388  *
1389  * @rs: current RAM state
1390  * @offset: used to return the offset within the RAMBlock
1391  */
1392 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1393 {
1394     RAMBlock *block = NULL;
1395
1396     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1397         return NULL;
1398     }
1399
1400     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1401     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1402         struct RAMSrcPageRequest *entry =
1403                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1404         block = entry->rb;
1405         *offset = entry->offset;
1406
1407         if (entry->len > TARGET_PAGE_SIZE) {
1408             entry->len -= TARGET_PAGE_SIZE;
1409             entry->offset += TARGET_PAGE_SIZE;
1410         } else {
1411             memory_region_unref(block->mr);
1412             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1413             g_free(entry);
1414             migration_consume_urgent_request();
1415         }
1416     }
1417
1418     return block;
1419 }
1420
1421 #if defined(__linux__)
1422 /**
1423  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1424  *   is found, return RAM block pointer and page offset
1425  *
1426  * Returns pointer to the RAMBlock containing faulting page,
1427  *   NULL if no write faults are pending
1428  *
1429  * @rs: current RAM state
1430  * @offset: page offset from the beginning of the block
1431  */
1432 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1433 {
1434     struct uffd_msg uffd_msg;
1435     void *page_address;
1436     RAMBlock *block;
1437     int res;
1438
1439     if (!migrate_background_snapshot()) {
1440         return NULL;
1441     }
1442
1443     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1444     if (res <= 0) {
1445         return NULL;
1446     }
1447
1448     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1449     block = qemu_ram_block_from_host(page_address, false, offset);
1450     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1451     return block;
1452 }
1453
1454 /**
1455  * ram_save_release_protection: release UFFD write protection after
1456  *   a range of pages has been saved
1457  *
1458  * @rs: current RAM state
1459  * @pss: page-search-status structure
1460  * @start_page: index of the first page in the range relative to pss->block
1461  *
1462  * Returns 0 on success, negative value in case of an error
1463 */
1464 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1465         unsigned long start_page)
1466 {
1467     int res = 0;
1468
1469     /* Check if page is from UFFD-managed region. */
1470     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1471         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1472         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1473
1474         /* Flush async buffers before un-protect. */
1475         qemu_fflush(rs->f);
1476         /* Un-protect memory range. */
1477         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1478                 false, false);
1479     }
1480
1481     return res;
1482 }
1483
1484 /* ram_write_tracking_available: check if kernel supports required UFFD features
1485  *
1486  * Returns true if supports, false otherwise
1487  */
1488 bool ram_write_tracking_available(void)
1489 {
1490     uint64_t uffd_features;
1491     int res;
1492
1493     res = uffd_query_features(&uffd_features);
1494     return (res == 0 &&
1495             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1496 }
1497
1498 /* ram_write_tracking_compatible: check if guest configuration is
1499  *   compatible with 'write-tracking'
1500  *
1501  * Returns true if compatible, false otherwise
1502  */
1503 bool ram_write_tracking_compatible(void)
1504 {
1505     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1506     int uffd_fd;
1507     RAMBlock *block;
1508     bool ret = false;
1509
1510     /* Open UFFD file descriptor */
1511     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1512     if (uffd_fd < 0) {
1513         return false;
1514     }
1515
1516     RCU_READ_LOCK_GUARD();
1517
1518     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1519         uint64_t uffd_ioctls;
1520
1521         /* Nothing to do with read-only and MMIO-writable regions */
1522         if (block->mr->readonly || block->mr->rom_device) {
1523             continue;
1524         }
1525         /* Try to register block memory via UFFD-IO to track writes */
1526         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1527                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1528             goto out;
1529         }
1530         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1531             goto out;
1532         }
1533     }
1534     ret = true;
1535
1536 out:
1537     uffd_close_fd(uffd_fd);
1538     return ret;
1539 }
1540
1541 /*
1542  * ram_block_populate_pages: populate memory in the RAM block by reading
1543  *   an integer from the beginning of each page.
1544  *
1545  * Since it's solely used for userfault_fd WP feature, here we just
1546  *   hardcode page size to qemu_real_host_page_size.
1547  *
1548  * @block: RAM block to populate
1549  */
1550 static void ram_block_populate_pages(RAMBlock *block)
1551 {
1552     char *ptr = (char *) block->host;
1553
1554     for (ram_addr_t offset = 0; offset < block->used_length;
1555             offset += qemu_real_host_page_size) {
1556         char tmp = *(ptr + offset);
1557
1558         /* Don't optimize the read out */
1559         asm volatile("" : "+r" (tmp));
1560     }
1561 }
1562
1563 /*
1564  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1565  */
1566 void ram_write_tracking_prepare(void)
1567 {
1568     RAMBlock *block;
1569
1570     RCU_READ_LOCK_GUARD();
1571
1572     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1573         /* Nothing to do with read-only and MMIO-writable regions */
1574         if (block->mr->readonly || block->mr->rom_device) {
1575             continue;
1576         }
1577
1578         /*
1579          * Populate pages of the RAM block before enabling userfault_fd
1580          * write protection.
1581          *
1582          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1583          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1584          * pages with pte_none() entries in page table.
1585          */
1586         ram_block_populate_pages(block);
1587     }
1588 }
1589
1590 /*
1591  * ram_write_tracking_start: start UFFD-WP memory tracking
1592  *
1593  * Returns 0 for success or negative value in case of error
1594  */
1595 int ram_write_tracking_start(void)
1596 {
1597     int uffd_fd;
1598     RAMState *rs = ram_state;
1599     RAMBlock *block;
1600
1601     /* Open UFFD file descriptor */
1602     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1603     if (uffd_fd < 0) {
1604         return uffd_fd;
1605     }
1606     rs->uffdio_fd = uffd_fd;
1607
1608     RCU_READ_LOCK_GUARD();
1609
1610     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1611         /* Nothing to do with read-only and MMIO-writable regions */
1612         if (block->mr->readonly || block->mr->rom_device) {
1613             continue;
1614         }
1615
1616         /* Register block memory with UFFD to track writes */
1617         if (uffd_register_memory(rs->uffdio_fd, block->host,
1618                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1619             goto fail;
1620         }
1621         /* Apply UFFD write protection to the block memory range */
1622         if (uffd_change_protection(rs->uffdio_fd, block->host,
1623                 block->max_length, true, false)) {
1624             goto fail;
1625         }
1626         block->flags |= RAM_UF_WRITEPROTECT;
1627         memory_region_ref(block->mr);
1628
1629         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1630                 block->host, block->max_length);
1631     }
1632
1633     return 0;
1634
1635 fail:
1636     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1637
1638     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1640             continue;
1641         }
1642         /*
1643          * In case some memory block failed to be write-protected
1644          * remove protection and unregister all succeeded RAM blocks
1645          */
1646         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1647                 false, false);
1648         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1649         /* Cleanup flags and remove reference */
1650         block->flags &= ~RAM_UF_WRITEPROTECT;
1651         memory_region_unref(block->mr);
1652     }
1653
1654     uffd_close_fd(uffd_fd);
1655     rs->uffdio_fd = -1;
1656     return -1;
1657 }
1658
1659 /**
1660  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1661  */
1662 void ram_write_tracking_stop(void)
1663 {
1664     RAMState *rs = ram_state;
1665     RAMBlock *block;
1666
1667     RCU_READ_LOCK_GUARD();
1668
1669     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1670         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1671             continue;
1672         }
1673         /* Remove protection and unregister all affected RAM blocks */
1674         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675                 false, false);
1676         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1677
1678         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1679                 block->host, block->max_length);
1680
1681         /* Cleanup flags and remove reference */
1682         block->flags &= ~RAM_UF_WRITEPROTECT;
1683         memory_region_unref(block->mr);
1684     }
1685
1686     /* Finally close UFFD file descriptor */
1687     uffd_close_fd(rs->uffdio_fd);
1688     rs->uffdio_fd = -1;
1689 }
1690
1691 #else
1692 /* No target OS support, stubs just fail or ignore */
1693
1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1695 {
1696     (void) rs;
1697     (void) offset;
1698
1699     return NULL;
1700 }
1701
1702 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1703         unsigned long start_page)
1704 {
1705     (void) rs;
1706     (void) pss;
1707     (void) start_page;
1708
1709     return 0;
1710 }
1711
/*
 * ram_write_tracking_available: stub for targets without OS support
 *
 * Always returns false.
 */
bool ram_write_tracking_available(void)
{
    const bool available = false;

    return available;
}
1716
/*
 * ram_write_tracking_compatible: stub for targets without OS support
 *
 * Must not be called: trips an assertion. Returns false if assertions are
 * compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);

    return false;
}
1722
/*
 * ram_write_tracking_start: stub for targets without OS support
 *
 * Must not be called: trips an assertion. Returns -1 if assertions are
 * compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);

    return -1;
}
1728
/*
 * ram_write_tracking_stop: stub for targets without OS support
 *
 * Must not be called: trips an assertion.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1733 #endif /* defined(__linux__) */
1734
1735 /**
1736  * get_queued_page: unqueue a page from the postcopy requests
1737  *
1738  * Skips pages that are already sent (!dirty)
1739  *
1740  * Returns true if a queued page is found
1741  *
1742  * @rs: current RAM state
1743  * @pss: data about the state of the current dirty page scan
1744  */
1745 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1746 {
1747     RAMBlock  *block;
1748     ram_addr_t offset;
1749     bool dirty;
1750
1751     do {
1752         block = unqueue_page(rs, &offset);
1753         /*
1754          * We're sending this page, and since it's postcopy nothing else
1755          * will dirty it, and we must make sure it doesn't get sent again
1756          * even if this queue request was received after the background
1757          * search already sent it.
1758          */
1759         if (block) {
1760             unsigned long page;
1761
1762             page = offset >> TARGET_PAGE_BITS;
1763             dirty = test_bit(page, block->bmap);
1764             if (!dirty) {
1765                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1766                                                 page);
1767             } else {
1768                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1769             }
1770         }
1771
1772     } while (block && !dirty);
1773
1774     if (!block) {
1775         /*
1776          * Poll write faults too if background snapshot is enabled; that's
1777          * when we have vcpus got blocked by the write protected pages.
1778          */
1779         block = poll_fault_page(rs, &offset);
1780     }
1781
1782     if (block) {
1783         /*
1784          * We want the background search to continue from the queued page
1785          * since the guest is likely to want other pages near to the page
1786          * it just requested.
1787          */
1788         pss->block = block;
1789         pss->page = offset >> TARGET_PAGE_BITS;
1790
1791         /*
1792          * This unqueued page would break the "one round" check, even is
1793          * really rare.
1794          */
1795         pss->complete_round = false;
1796     }
1797
1798     return !!block;
1799 }
1800
1801 /**
1802  * migration_page_queue_free: drop any remaining pages in the ram
1803  * request queue
1804  *
1805  * It should be empty at the end anyway, but in error cases there may
1806  * be some left.  in case that there is any page left, we drop it.
1807  *
1808  */
1809 static void migration_page_queue_free(RAMState *rs)
1810 {
1811     struct RAMSrcPageRequest *mspr, *next_mspr;
1812     /* This queue generally should be empty - but in the case of a failed
1813      * migration might have some droppings in.
1814      */
1815     RCU_READ_LOCK_GUARD();
1816     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1817         memory_region_unref(mspr->rb->mr);
1818         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1819         g_free(mspr);
1820     }
1821 }
1822
1823 /**
1824  * ram_save_queue_pages: queue the page for transmission
1825  *
1826  * A request from postcopy destination for example.
1827  *
1828  * Returns zero on success or negative on error
1829  *
1830  * @rbname: Name of the RAMBLock of the request. NULL means the
1831  *          same that last one.
1832  * @start: starting address from the start of the RAMBlock
1833  * @len: length (in bytes) to send
1834  */
1835 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1836 {
1837     RAMBlock *ramblock;
1838     RAMState *rs = ram_state;
1839
1840     ram_counters.postcopy_requests++;
1841     RCU_READ_LOCK_GUARD();
1842
1843     if (!rbname) {
1844         /* Reuse last RAMBlock */
1845         ramblock = rs->last_req_rb;
1846
1847         if (!ramblock) {
1848             /*
1849              * Shouldn't happen, we can't reuse the last RAMBlock if
1850              * it's the 1st request.
1851              */
1852             error_report("ram_save_queue_pages no previous block");
1853             return -1;
1854         }
1855     } else {
1856         ramblock = qemu_ram_block_by_name(rbname);
1857
1858         if (!ramblock) {
1859             /* We shouldn't be asked for a non-existent RAMBlock */
1860             error_report("ram_save_queue_pages no block '%s'", rbname);
1861             return -1;
1862         }
1863         rs->last_req_rb = ramblock;
1864     }
1865     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1866     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1867         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1868                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1869                      __func__, start, len, ramblock->used_length);
1870         return -1;
1871     }
1872
1873     struct RAMSrcPageRequest *new_entry =
1874         g_malloc0(sizeof(struct RAMSrcPageRequest));
1875     new_entry->rb = ramblock;
1876     new_entry->offset = start;
1877     new_entry->len = len;
1878
1879     memory_region_ref(ramblock->mr);
1880     qemu_mutex_lock(&rs->src_page_req_mutex);
1881     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1882     migration_make_urgent_request();
1883     qemu_mutex_unlock(&rs->src_page_req_mutex);
1884
1885     return 0;
1886 }
1887
1888 static bool save_page_use_compression(RAMState *rs)
1889 {
1890     if (!migrate_use_compression()) {
1891         return false;
1892     }
1893
1894     /*
1895      * If xbzrle is enabled (e.g., after first round of migration), stop
1896      * using the data compression. In theory, xbzrle can do better than
1897      * compression.
1898      */
1899     if (rs->xbzrle_enabled) {
1900         return false;
1901     }
1902
1903     return true;
1904 }
1905
1906 /*
1907  * try to compress the page before posting it out, return true if the page
1908  * has been properly handled by compression, otherwise needs other
1909  * paths to handle it
1910  */
1911 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1912 {
1913     if (!save_page_use_compression(rs)) {
1914         return false;
1915     }
1916
1917     /*
1918      * When starting the process of a new block, the first page of
1919      * the block should be sent out before other pages in the same
1920      * block, and all the pages in last block should have been sent
1921      * out, keeping this order is important, because the 'cont' flag
1922      * is used to avoid resending the block name.
1923      *
1924      * We post the fist page as normal page as compression will take
1925      * much CPU resource.
1926      */
1927     if (block != rs->last_sent_block) {
1928         flush_compressed_data(rs);
1929         return false;
1930     }
1931
1932     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1933         return true;
1934     }
1935
1936     compression_counters.busy++;
1937     return false;
1938 }
1939
1940 /**
1941  * ram_save_target_page: save one target page
1942  *
1943  * Returns the number of pages written
1944  *
1945  * @rs: current RAM state
1946  * @pss: data about the page we want to send
1947  * @last_stage: if we are at the completion stage
1948  */
1949 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1950                                 bool last_stage)
1951 {
1952     RAMBlock *block = pss->block;
1953     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1954     int res;
1955
1956     if (control_save_page(rs, block, offset, &res)) {
1957         return res;
1958     }
1959
1960     if (save_compress_page(rs, block, offset)) {
1961         return 1;
1962     }
1963
1964     res = save_zero_page(rs, block, offset);
1965     if (res > 0) {
1966         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1967          * page would be stale
1968          */
1969         if (!save_page_use_compression(rs)) {
1970             XBZRLE_cache_lock();
1971             xbzrle_cache_zero_page(rs, block->offset + offset);
1972             XBZRLE_cache_unlock();
1973         }
1974         ram_release_pages(block->idstr, offset, res);
1975         return res;
1976     }
1977
1978     /*
1979      * Do not use multifd for:
1980      * 1. Compression as the first page in the new block should be posted out
1981      *    before sending the compressed page
1982      * 2. In postcopy as one whole host page should be placed
1983      */
1984     if (!save_page_use_compression(rs) && migrate_use_multifd()
1985         && !migration_in_postcopy()) {
1986         return ram_save_multifd_page(rs, block, offset);
1987     }
1988
1989     return ram_save_page(rs, pss, last_stage);
1990 }
1991
1992 /**
1993  * ram_save_host_page: save a whole host page
1994  *
1995  * Starting at *offset send pages up to the end of the current host
1996  * page. It's valid for the initial offset to point into the middle of
1997  * a host page in which case the remainder of the hostpage is sent.
1998  * Only dirty target pages are sent. Note that the host page size may
1999  * be a huge page for this block.
2000  * The saving stops at the boundary of the used_length of the block
2001  * if the RAMBlock isn't a multiple of the host page size.
2002  *
2003  * Returns the number of pages written or negative on error
2004  *
2005  * @rs: current RAM state
2006  * @ms: current migration state
2007  * @pss: data about the page we want to send
2008  * @last_stage: if we are at the completion stage
2009  */
2010 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2011                               bool last_stage)
2012 {
2013     int tmppages, pages = 0;
2014     size_t pagesize_bits =
2015         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2016     unsigned long hostpage_boundary =
2017         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2018     unsigned long start_page = pss->page;
2019     int res;
2020
2021     if (ramblock_is_ignored(pss->block)) {
2022         error_report("block %s should not be migrated !", pss->block->idstr);
2023         return 0;
2024     }
2025
2026     do {
2027         /* Check the pages is dirty and if it is send it */
2028         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2029             tmppages = ram_save_target_page(rs, pss, last_stage);
2030             if (tmppages < 0) {
2031                 return tmppages;
2032             }
2033
2034             pages += tmppages;
2035             /*
2036              * Allow rate limiting to happen in the middle of huge pages if
2037              * something is sent in the current iteration.
2038              */
2039             if (pagesize_bits > 1 && tmppages > 0) {
2040                 migration_rate_limit();
2041             }
2042         }
2043         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2044     } while ((pss->page < hostpage_boundary) &&
2045              offset_in_ramblock(pss->block,
2046                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2047     /* The offset we leave with is the min boundary of host page and block */
2048     pss->page = MIN(pss->page, hostpage_boundary) - 1;
2049
2050     res = ram_save_release_protection(rs, pss, start_page);
2051     return (res < 0 ? res : pages);
2052 }
2053
2054 /**
2055  * ram_find_and_save_block: finds a dirty page and sends it to f
2056  *
2057  * Called within an RCU critical section.
2058  *
2059  * Returns the number of pages written where zero means no dirty pages,
2060  * or negative on error
2061  *
2062  * @rs: current RAM state
2063  * @last_stage: if we are at the completion stage
2064  *
2065  * On systems where host-page-size > target-page-size it will send all the
2066  * pages in a host page that are dirty.
2067  */
2068
2069 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2070 {
2071     PageSearchStatus pss;
2072     int pages = 0;
2073     bool again, found;
2074
2075     /* No dirty page as there is zero RAM */
2076     if (!ram_bytes_total()) {
2077         return pages;
2078     }
2079
2080     pss.block = rs->last_seen_block;
2081     pss.page = rs->last_page;
2082     pss.complete_round = false;
2083
2084     if (!pss.block) {
2085         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2086     }
2087
2088     do {
2089         again = true;
2090         found = get_queued_page(rs, &pss);
2091
2092         if (!found) {
2093             /* priority queue empty, so just search for something dirty */
2094             found = find_dirty_block(rs, &pss, &again);
2095         }
2096
2097         if (found) {
2098             pages = ram_save_host_page(rs, &pss, last_stage);
2099         }
2100     } while (!pages && again);
2101
2102     rs->last_seen_block = pss.block;
2103     rs->last_page = pss.page;
2104
2105     return pages;
2106 }
2107
2108 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2109 {
2110     uint64_t pages = size / TARGET_PAGE_SIZE;
2111
2112     if (zero) {
2113         ram_counters.duplicate += pages;
2114     } else {
2115         ram_counters.normal += pages;
2116         ram_counters.transferred += size;
2117         qemu_update_position(f, size);
2118     }
2119 }
2120
2121 static uint64_t ram_bytes_total_common(bool count_ignored)
2122 {
2123     RAMBlock *block;
2124     uint64_t total = 0;
2125
2126     RCU_READ_LOCK_GUARD();
2127
2128     if (count_ignored) {
2129         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2130             total += block->used_length;
2131         }
2132     } else {
2133         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2134             total += block->used_length;
2135         }
2136     }
2137     return total;
2138 }
2139
/* Total used length, in bytes, of the RAM blocks that are not ignored. */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2144
2145 static void xbzrle_load_setup(void)
2146 {
2147     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2148 }
2149
2150 static void xbzrle_load_cleanup(void)
2151 {
2152     g_free(XBZRLE.decoded_buf);
2153     XBZRLE.decoded_buf = NULL;
2154 }
2155
2156 static void ram_state_cleanup(RAMState **rsp)
2157 {
2158     if (*rsp) {
2159         migration_page_queue_free(*rsp);
2160         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2161         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2162         g_free(*rsp);
2163         *rsp = NULL;
2164     }
2165 }
2166
2167 static void xbzrle_cleanup(void)
2168 {
2169     XBZRLE_cache_lock();
2170     if (XBZRLE.cache) {
2171         cache_fini(XBZRLE.cache);
2172         g_free(XBZRLE.encoded_buf);
2173         g_free(XBZRLE.current_buf);
2174         g_free(XBZRLE.zero_target_page);
2175         XBZRLE.cache = NULL;
2176         XBZRLE.encoded_buf = NULL;
2177         XBZRLE.current_buf = NULL;
2178         XBZRLE.zero_target_page = NULL;
2179     }
2180     XBZRLE_cache_unlock();
2181 }
2182
2183 static void ram_save_cleanup(void *opaque)
2184 {
2185     RAMState **rsp = opaque;
2186     RAMBlock *block;
2187
2188     /* We don't use dirty log with background snapshots */
2189     if (!migrate_background_snapshot()) {
2190         /* caller have hold iothread lock or is in a bh, so there is
2191          * no writing race against the migration bitmap
2192          */
2193         memory_global_dirty_log_stop();
2194     }
2195
2196     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2197         g_free(block->clear_bmap);
2198         block->clear_bmap = NULL;
2199         g_free(block->bmap);
2200         block->bmap = NULL;
2201     }
2202
2203     xbzrle_cleanup();
2204     compress_threads_save_cleanup();
2205     ram_state_cleanup(rsp);
2206 }
2207
2208 static void ram_state_reset(RAMState *rs)
2209 {
2210     rs->last_seen_block = NULL;
2211     rs->last_sent_block = NULL;
2212     rs->last_page = 0;
2213     rs->last_version = ram_list.version;
2214     rs->xbzrle_enabled = false;
2215 }
2216
2217 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2218
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr for debugging
 *
 * The bitmap is printed in lines of up to 128 bits, with '1' for a set
 * bit and '.' for a clear one, each line prefixed with the index of its
 * first bit.
 *
 * @todump: bitmap to print.  There is no default bitmap to fall back
 *          on, so a NULL pointer is reported and nothing is dumped.
 * @expected: the value you expect the bitmap mostly to be full of;
 *            lines that are all this value are not printed.
 * @pages: number of bits of @todump to look at
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* Guard against dereferencing a missing bitmap in test_bit() below. */
    if (!todump) {
        fprintf(stderr, "%s: no bitmap to dump\n", __func__);
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        /* Only print lines that contain something unexpected */
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2252
2253 /* **** functions for postcopy ***** */
2254
2255 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2256 {
2257     struct RAMBlock *block;
2258
2259     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2260         unsigned long *bitmap = block->bmap;
2261         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2262         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2263
2264         while (run_start < range) {
2265             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2266             ram_discard_range(block->idstr,
2267                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2268                               ((ram_addr_t)(run_end - run_start))
2269                                 << TARGET_PAGE_BITS);
2270             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2271         }
2272     }
2273 }
2274
2275 /**
2276  * postcopy_send_discard_bm_ram: discard a RAMBlock
2277  *
2278  * Returns zero on success
2279  *
2280  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2281  *
2282  * @ms: current migration state
2283  * @block: RAMBlock to discard
2284  */
2285 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2286 {
2287     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2288     unsigned long current;
2289     unsigned long *bitmap = block->bmap;
2290
2291     for (current = 0; current < end; ) {
2292         unsigned long one = find_next_bit(bitmap, end, current);
2293         unsigned long zero, discard_length;
2294
2295         if (one >= end) {
2296             break;
2297         }
2298
2299         zero = find_next_zero_bit(bitmap, end, one + 1);
2300
2301         if (zero >= end) {
2302             discard_length = end - one;
2303         } else {
2304             discard_length = zero - one;
2305         }
2306         postcopy_discard_send_range(ms, one, discard_length);
2307         current = one + discard_length;
2308     }
2309
2310     return 0;
2311 }
2312
2313 /**
2314  * postcopy_each_ram_send_discard: discard all RAMBlocks
2315  *
2316  * Returns 0 for success or negative for error
2317  *
2318  * Utility for the outgoing postcopy code.
2319  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2320  *   passing it bitmap indexes and name.
2321  * (qemu_ram_foreach_block ends up passing unscaled lengths
2322  *  which would mean postcopy code would have to deal with target page)
2323  *
2324  * @ms: current migration state
2325  */
2326 static int postcopy_each_ram_send_discard(MigrationState *ms)
2327 {
2328     struct RAMBlock *block;
2329     int ret;
2330
2331     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2332         postcopy_discard_send_init(ms, block->idstr);
2333
2334         /*
2335          * Postcopy sends chunks of bitmap over the wire, but it
2336          * just needs indexes at this point, avoids it having
2337          * target page specific code.
2338          */
2339         ret = postcopy_send_discard_bm_ram(ms, block);
2340         postcopy_discard_send_finish(ms);
2341         if (ret) {
2342             return ret;
2343         }
2344     }
2345
2346     return 0;
2347 }
2348
2349 /**
2350  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2351  *
2352  * Helper for postcopy_chunk_hostpages; it's called twice to
2353  * canonicalize the two bitmaps, that are similar, but one is
2354  * inverted.
2355  *
2356  * Postcopy requires that all target pages in a hostpage are dirty or
2357  * clean, not a mix.  This function canonicalizes the bitmaps.
2358  *
2359  * @ms: current migration state
2360  * @block: block that contains the page we want to canonicalize
2361  */
2362 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2363 {
2364     RAMState *rs = ram_state;
2365     unsigned long *bitmap = block->bmap;
2366     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2367     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2368     unsigned long run_start;
2369
2370     if (block->page_size == TARGET_PAGE_SIZE) {
2371         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2372         return;
2373     }
2374
2375     /* Find a dirty page */
2376     run_start = find_next_bit(bitmap, pages, 0);
2377
2378     while (run_start < pages) {
2379
2380         /*
2381          * If the start of this run of pages is in the middle of a host
2382          * page, then we need to fixup this host page.
2383          */
2384         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2385             /* Find the end of this run */
2386             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2387             /*
2388              * If the end isn't at the start of a host page, then the
2389              * run doesn't finish at the end of a host page
2390              * and we need to discard.
2391              */
2392         }
2393
2394         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2395             unsigned long page;
2396             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2397                                                              host_ratio);
2398             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2399
2400             /* Clean up the bitmap */
2401             for (page = fixup_start_addr;
2402                  page < fixup_start_addr + host_ratio; page++) {
2403                 /*
2404                  * Remark them as dirty, updating the count for any pages
2405                  * that weren't previously dirty.
2406                  */
2407                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2408             }
2409         }
2410
2411         /* Find the next dirty page for the next iteration */
2412         run_start = find_next_bit(bitmap, pages, run_start);
2413     }
2414 }
2415
2416 /**
2417  * postcopy_chunk_hostpages: discard any partially sent host page
2418  *
2419  * Utility for the outgoing postcopy code.
2420  *
2421  * Discard any partially sent host-page size chunks, mark any partially
2422  * dirty host-page size chunks as all dirty.  In this case the host-page
2423  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2424  *
2425  * Returns zero on success
2426  *
2427  * @ms: current migration state
2428  * @block: block we want to work with
2429  */
2430 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2431 {
2432     postcopy_discard_send_init(ms, block->idstr);
2433
2434     /*
2435      * Ensure that all partially dirty host pages are made fully dirty.
2436      */
2437     postcopy_chunk_hostpages_pass(ms, block);
2438
2439     postcopy_discard_send_finish(ms);
2440     return 0;
2441 }
2442
2443 /**
2444  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2445  *
2446  * Returns zero on success
2447  *
2448  * Transmit the set of pages to be discarded after precopy to the target
2449  * these are pages that:
2450  *     a) Have been previously transmitted but are now dirty again
2451  *     b) Pages that have never been transmitted, this ensures that
2452  *        any pages on the destination that have been mapped by background
2453  *        tasks get discarded (transparent huge pages is the specific concern)
2454  * Hopefully this is pretty sparse
2455  *
2456  * @ms: current migration state
2457  */
2458 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2459 {
2460     RAMState *rs = ram_state;
2461     RAMBlock *block;
2462     int ret;
2463
2464     RCU_READ_LOCK_GUARD();
2465
2466     /* This should be our last sync, the src is now paused */
2467     migration_bitmap_sync(rs);
2468
2469     /* Easiest way to make sure we don't resume in the middle of a host-page */
2470     rs->last_seen_block = NULL;
2471     rs->last_sent_block = NULL;
2472     rs->last_page = 0;
2473
2474     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2475         /* Deal with TPS != HPS and huge pages */
2476         ret = postcopy_chunk_hostpages(ms, block);
2477         if (ret) {
2478             return ret;
2479         }
2480
2481 #ifdef DEBUG_POSTCOPY
2482         ram_debug_dump_bitmap(block->bmap, true,
2483                               block->used_length >> TARGET_PAGE_BITS);
2484 #endif
2485     }
2486     trace_ram_postcopy_send_discard_bitmap();
2487
2488     return postcopy_each_ram_send_discard(ms);
2489 }
2490
2491 /**
2492  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2493  *
2494  * Returns zero on success
2495  *
2496  * @rbname: name of the RAMBlock of the request. NULL means the
2497  *          same that last one.
2498  * @start: RAMBlock starting page
2499  * @length: RAMBlock size
2500  */
2501 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2502 {
2503     trace_ram_discard_range(rbname, start, length);
2504
2505     RCU_READ_LOCK_GUARD();
2506     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2507
2508     if (!rb) {
2509         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2510         return -1;
2511     }
2512
2513     /*
2514      * On source VM, we don't need to update the received bitmap since
2515      * we don't even have one.
2516      */
2517     if (rb->receivedmap) {
2518         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2519                      length >> qemu_target_page_bits());
2520     }
2521
2522     return ram_block_discard_range(rb, start, length);
2523 }
2524
2525 /*
2526  * For every allocation, we will try not to crash the VM if the
2527  * allocation failed.
2528  */
2529 static int xbzrle_init(void)
2530 {
2531     Error *local_err = NULL;
2532
2533     if (!migrate_use_xbzrle()) {
2534         return 0;
2535     }
2536
2537     XBZRLE_cache_lock();
2538
2539     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2540     if (!XBZRLE.zero_target_page) {
2541         error_report("%s: Error allocating zero page", __func__);
2542         goto err_out;
2543     }
2544
2545     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2546                               TARGET_PAGE_SIZE, &local_err);
2547     if (!XBZRLE.cache) {
2548         error_report_err(local_err);
2549         goto free_zero_page;
2550     }
2551
2552     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2553     if (!XBZRLE.encoded_buf) {
2554         error_report("%s: Error allocating encoded_buf", __func__);
2555         goto free_cache;
2556     }
2557
2558     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2559     if (!XBZRLE.current_buf) {
2560         error_report("%s: Error allocating current_buf", __func__);
2561         goto free_encoded_buf;
2562     }
2563
2564     /* We are all good */
2565     XBZRLE_cache_unlock();
2566     return 0;
2567
2568 free_encoded_buf:
2569     g_free(XBZRLE.encoded_buf);
2570     XBZRLE.encoded_buf = NULL;
2571 free_cache:
2572     cache_fini(XBZRLE.cache);
2573     XBZRLE.cache = NULL;
2574 free_zero_page:
2575     g_free(XBZRLE.zero_target_page);
2576     XBZRLE.zero_target_page = NULL;
2577 err_out:
2578     XBZRLE_cache_unlock();
2579     return -ENOMEM;
2580 }
2581
2582 static int ram_state_init(RAMState **rsp)
2583 {
2584     *rsp = g_try_new0(RAMState, 1);
2585
2586     if (!*rsp) {
2587         error_report("%s: Init ramstate fail", __func__);
2588         return -1;
2589     }
2590
2591     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2592     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2593     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2594
2595     /*
2596      * Count the total number of pages used by ram blocks not including any
2597      * gaps due to alignment or unplugs.
2598      * This must match with the initial values of dirty bitmap.
2599      */
2600     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2601     ram_state_reset(*rsp);
2602
2603     return 0;
2604 }
2605
2606 static void ram_list_init_bitmaps(void)
2607 {
2608     MigrationState *ms = migrate_get_current();
2609     RAMBlock *block;
2610     unsigned long pages;
2611     uint8_t shift;
2612
2613     /* Skip setting bitmap if there is no RAM */
2614     if (ram_bytes_total()) {
2615         shift = ms->clear_bitmap_shift;
2616         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2617             error_report("clear_bitmap_shift (%u) too big, using "
2618                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2619             shift = CLEAR_BITMAP_SHIFT_MAX;
2620         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2621             error_report("clear_bitmap_shift (%u) too small, using "
2622                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2623             shift = CLEAR_BITMAP_SHIFT_MIN;
2624         }
2625
2626         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2627             pages = block->max_length >> TARGET_PAGE_BITS;
2628             /*
2629              * The initial dirty bitmap for migration must be set with all
2630              * ones to make sure we'll migrate every guest RAM page to
2631              * destination.
2632              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2633              * new migration after a failed migration, ram_list.
2634              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2635              * guest memory.
2636              */
2637             block->bmap = bitmap_new(pages);
2638             bitmap_set(block->bmap, 0, pages);
2639             block->clear_bmap_shift = shift;
2640             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2641         }
2642     }
2643 }
2644
2645 static void ram_init_bitmaps(RAMState *rs)
2646 {
2647     /* For memory_global_dirty_log_start below.  */
2648     qemu_mutex_lock_iothread();
2649     qemu_mutex_lock_ramlist();
2650
2651     WITH_RCU_READ_LOCK_GUARD() {
2652         ram_list_init_bitmaps();
2653         /* We don't use dirty log with background snapshots */
2654         if (!migrate_background_snapshot()) {
2655             memory_global_dirty_log_start();
2656             migration_bitmap_sync_precopy(rs);
2657         }
2658     }
2659     qemu_mutex_unlock_ramlist();
2660     qemu_mutex_unlock_iothread();
2661 }
2662
2663 static int ram_init_all(RAMState **rsp)
2664 {
2665     if (ram_state_init(rsp)) {
2666         return -1;
2667     }
2668
2669     if (xbzrle_init()) {
2670         ram_state_cleanup(rsp);
2671         return -1;
2672     }
2673
2674     ram_init_bitmaps(*rsp);
2675
2676     return 0;
2677 }
2678
2679 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2680 {
2681     RAMBlock *block;
2682     uint64_t pages = 0;
2683
2684     /*
2685      * Postcopy is not using xbzrle/compression, so no need for that.
2686      * Also, since source are already halted, we don't need to care
2687      * about dirty page logging as well.
2688      */
2689
2690     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2691         pages += bitmap_count_one(block->bmap,
2692                                   block->used_length >> TARGET_PAGE_BITS);
2693     }
2694
2695     /* This may not be aligned with current bitmaps. Recalculate. */
2696     rs->migration_dirty_pages = pages;
2697
2698     ram_state_reset(rs);
2699
2700     /* Update RAMState cache of output QEMUFile */
2701     rs->f = out;
2702
2703     trace_ram_state_resume_prepare(pages);
2704 }
2705
2706 /*
2707  * This function clears bits of the free pages reported by the caller from the
2708  * migration dirty bitmap. @addr is the host address corresponding to the
2709  * start of the continuous guest free pages, and @len is the total bytes of
2710  * those pages.
2711  */
2712 void qemu_guest_free_page_hint(void *addr, size_t len)
2713 {
2714     RAMBlock *block;
2715     ram_addr_t offset;
2716     size_t used_len, start, npages;
2717     MigrationState *s = migrate_get_current();
2718
2719     /* This function is currently expected to be used during live migration */
2720     if (!migration_is_setup_or_active(s->state)) {
2721         return;
2722     }
2723
2724     for (; len > 0; len -= used_len, addr += used_len) {
2725         block = qemu_ram_block_from_host(addr, false, &offset);
2726         if (unlikely(!block || offset >= block->used_length)) {
2727             /*
2728              * The implementation might not support RAMBlock resize during
2729              * live migration, but it could happen in theory with future
2730              * updates. So we add a check here to capture that case.
2731              */
2732             error_report_once("%s unexpected error", __func__);
2733             return;
2734         }
2735
2736         if (len <= block->used_length - offset) {
2737             used_len = len;
2738         } else {
2739             used_len = block->used_length - offset;
2740         }
2741
2742         start = offset >> TARGET_PAGE_BITS;
2743         npages = used_len >> TARGET_PAGE_BITS;
2744
2745         qemu_mutex_lock(&ram_state->bitmap_mutex);
2746         ram_state->migration_dirty_pages -=
2747                       bitmap_count_one_with_offset(block->bmap, start, npages);
2748         bitmap_clear(block->bmap, start, npages);
2749         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2750     }
2751 }
2752
2753 /*
2754  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2755  * long-running RCU critical section.  When rcu-reclaims in the code
2756  * start to become numerous it will be necessary to reduce the
2757  * granularity of these critical sections.
2758  */
2759
2760 /**
2761  * ram_save_setup: Setup RAM for migration
2762  *
2763  * Returns zero to indicate success and negative for error
2764  *
2765  * @f: QEMUFile where to send the data
2766  * @opaque: RAMState pointer
2767  */
2768 static int ram_save_setup(QEMUFile *f, void *opaque)
2769 {
2770     RAMState **rsp = opaque;
2771     RAMBlock *block;
2772
2773     if (compress_threads_save_setup()) {
2774         return -1;
2775     }
2776
2777     /* migration has already setup the bitmap, reuse it. */
2778     if (!migration_in_colo_state()) {
2779         if (ram_init_all(rsp) != 0) {
2780             compress_threads_save_cleanup();
2781             return -1;
2782         }
2783     }
2784     (*rsp)->f = f;
2785
2786     WITH_RCU_READ_LOCK_GUARD() {
2787         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2788
2789         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2790             qemu_put_byte(f, strlen(block->idstr));
2791             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2792             qemu_put_be64(f, block->used_length);
2793             if (migrate_postcopy_ram() && block->page_size !=
2794                                           qemu_host_page_size) {
2795                 qemu_put_be64(f, block->page_size);
2796             }
2797             if (migrate_ignore_shared()) {
2798                 qemu_put_be64(f, block->mr->addr);
2799             }
2800         }
2801     }
2802
2803     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2804     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2805
2806     multifd_send_sync_main(f);
2807     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2808     qemu_fflush(f);
2809
2810     return 0;
2811 }
2812
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until one of the following happens: the rate limit is
 * reached while no source page requests are queued, the file reports an
 * error, no page is left to send, or more than MAX_WAIT milliseconds have
 * been spent in the loop.  Nothing is sent while the bulk phase of block
 * migration is active.
 *
 * Returns a negative value on error.  Otherwise returns 1 if
 * ram_find_and_save_block() reported that there were no more pages to
 * send during this call, and 0 if it did not.
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    /* Set to 1 once ram_find_and_save_block() finds nothing left to send */
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        /* The block list changed since the last call: restart the scan */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep going past the rate limit for as long as there are queued
         * source page requests.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time converted from nanoseconds to milliseconds */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8 bytes of the EOS marker written above */
        ram_counters.transferred += 8;

        /* From here on ret carries the file error state, if any */
        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
2919
2920 /**
2921  * ram_save_complete: function called to send the remaining amount of ram
2922  *
2923  * Returns zero to indicate success or negative on error
2924  *
2925  * Called with iothread lock
2926  *
2927  * @f: QEMUFile where to send the data
2928  * @opaque: RAMState pointer
2929  */
2930 static int ram_save_complete(QEMUFile *f, void *opaque)
2931 {
2932     RAMState **temp = opaque;
2933     RAMState *rs = *temp;
2934     int ret = 0;
2935
2936     WITH_RCU_READ_LOCK_GUARD() {
2937         if (!migration_in_postcopy()) {
2938             migration_bitmap_sync_precopy(rs);
2939         }
2940
2941         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2942
2943         /* try transferring iterative blocks of memory */
2944
2945         /* flush all remaining blocks regardless of rate limiting */
2946         while (true) {
2947             int pages;
2948
2949             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2950             /* no more blocks to sent */
2951             if (pages == 0) {
2952                 break;
2953             }
2954             if (pages < 0) {
2955                 ret = pages;
2956                 break;
2957             }
2958         }
2959
2960         flush_compressed_data(rs);
2961         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2962     }
2963
2964     if (ret >= 0) {
2965         multifd_send_sync_main(rs->f);
2966         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2967         qemu_fflush(f);
2968     }
2969
2970     return ret;
2971 }
2972
2973 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2974                              uint64_t *res_precopy_only,
2975                              uint64_t *res_compatible,
2976                              uint64_t *res_postcopy_only)
2977 {
2978     RAMState **temp = opaque;
2979     RAMState *rs = *temp;
2980     uint64_t remaining_size;
2981
2982     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2983
2984     if (!migration_in_postcopy() &&
2985         remaining_size < max_size) {
2986         qemu_mutex_lock_iothread();
2987         WITH_RCU_READ_LOCK_GUARD() {
2988             migration_bitmap_sync_precopy(rs);
2989         }
2990         qemu_mutex_unlock_iothread();
2991         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2992     }
2993
2994     if (migrate_postcopy_ram()) {
2995         /* We can do postcopy, and all the data is postcopiable */
2996         *res_compatible += remaining_size;
2997     } else {
2998         *res_precopy_only += remaining_size;
2999     }
3000 }
3001
3002 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3003 {
3004     unsigned int xh_len;
3005     int xh_flags;
3006     uint8_t *loaded_data;
3007
3008     /* extract RLE header */
3009     xh_flags = qemu_get_byte(f);
3010     xh_len = qemu_get_be16(f);
3011
3012     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3013         error_report("Failed to load XBZRLE page - wrong compression!");
3014         return -1;
3015     }
3016
3017     if (xh_len > TARGET_PAGE_SIZE) {
3018         error_report("Failed to load XBZRLE page - len overflow!");
3019         return -1;
3020     }
3021     loaded_data = XBZRLE.decoded_buf;
3022     /* load data and decode */
3023     /* it can change loaded_data to point to an internal buffer */
3024     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3025
3026     /* decode RLE */
3027     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3028                              TARGET_PAGE_SIZE) == -1) {
3029         error_report("Failed to load XBZRLE page - decode error!");
3030         return -1;
3031     }
3032
3033     return 0;
3034 }
3035
3036 /**
3037  * ram_block_from_stream: read a RAMBlock id from the migration stream
3038  *
3039  * Must be called from within a rcu critical section.
3040  *
3041  * Returns a pointer from within the RCU-protected ram_list.
3042  *
3043  * @f: QEMUFile where to read the data from
3044  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3045  */
3046 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3047 {
3048     static RAMBlock *block;
3049     char id[256];
3050     uint8_t len;
3051
3052     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3053         if (!block) {
3054             error_report("Ack, bad migration stream!");
3055             return NULL;
3056         }
3057         return block;
3058     }
3059
3060     len = qemu_get_byte(f);
3061     qemu_get_buffer(f, (uint8_t *)id, len);
3062     id[len] = 0;
3063
3064     block = qemu_ram_block_by_name(id);
3065     if (!block) {
3066         error_report("Can't find block %s", id);
3067         return NULL;
3068     }
3069
3070     if (ramblock_is_ignored(block)) {
3071         error_report("block %s should not be migrated !", id);
3072         return NULL;
3073     }
3074
3075     return block;
3076 }
3077
3078 static inline void *host_from_ram_block_offset(RAMBlock *block,
3079                                                ram_addr_t offset)
3080 {
3081     if (!offset_in_ramblock(block, offset)) {
3082         return NULL;
3083     }
3084
3085     return block->host + offset;
3086 }
3087
3088 static void *host_page_from_ram_block_offset(RAMBlock *block,
3089                                              ram_addr_t offset)
3090 {
3091     /* Note: Explicitly no check against offset_in_ramblock(). */
3092     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3093                                    block->page_size);
3094 }
3095
3096 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3097                                                          ram_addr_t offset)
3098 {
3099     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3100 }
3101
3102 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3103                              ram_addr_t offset, bool record_bitmap)
3104 {
3105     if (!offset_in_ramblock(block, offset)) {
3106         return NULL;
3107     }
3108     if (!block->colo_cache) {
3109         error_report("%s: colo_cache is NULL in block :%s",
3110                      __func__, block->idstr);
3111         return NULL;
3112     }
3113
3114     /*
3115     * During colo checkpoint, we need bitmap of these migrated pages.
3116     * It help us to decide which pages in ram cache should be flushed
3117     * into VM's RAM later.
3118     */
3119     if (record_bitmap &&
3120         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3121         ram_state->migration_dirty_pages++;
3122     }
3123     return block->colo_cache + offset;
3124 }
3125
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  The memory is left untouched when
 * @ch is zero and the range already contains only zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if the range is already all zeroes */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3142
3143 /* return the size after decompression, or negative value on error */
3144 static int
3145 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3146                      const uint8_t *source, size_t source_len)
3147 {
3148     int err;
3149
3150     err = inflateReset(stream);
3151     if (err != Z_OK) {
3152         return -1;
3153     }
3154
3155     stream->avail_in = source_len;
3156     stream->next_in = (uint8_t *)source;
3157     stream->avail_out = dest_len;
3158     stream->next_out = dest;
3159
3160     err = inflate(stream, Z_NO_FLUSH);
3161     if (err != Z_STREAM_END) {
3162         return -1;
3163     }
3164
3165     return stream->total_out;
3166 }
3167
/*
 * Thread function of a decompression worker.
 *
 * @opaque is the worker's DecompressParam.  The worker sleeps on
 * param->cond until either param->des is set (a job is available) or
 * param->quit is set.  For each job it inflates param->len bytes from
 * param->compbuf into the destination, bounded by TARGET_PAGE_SIZE, then
 * sets param->done under decomp_done_lock and signals decomp_done_cond.
 *
 * Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the job and clear the slot before dropping the lock */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* A failure is only reported if decompress_error_check is set */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Tell waiters on decomp_done_cond that this worker is idle */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take our own lock before re-checking quit/des */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3206
3207 static int wait_for_decompress_done(void)
3208 {
3209     int idx, thread_count;
3210
3211     if (!migrate_use_compression()) {
3212         return 0;
3213     }
3214
3215     thread_count = migrate_decompress_threads();
3216     qemu_mutex_lock(&decomp_done_lock);
3217     for (idx = 0; idx < thread_count; idx++) {
3218         while (!decomp_param[idx].done) {
3219             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3220         }
3221     }
3222     qemu_mutex_unlock(&decomp_done_lock);
3223     return qemu_file_get_error(decomp_file);
3224 }
3225
3226 static void compress_threads_load_cleanup(void)
3227 {
3228     int i, thread_count;
3229
3230     if (!migrate_use_compression()) {
3231         return;
3232     }
3233     thread_count = migrate_decompress_threads();
3234     for (i = 0; i < thread_count; i++) {
3235         /*
3236          * we use it as a indicator which shows if the thread is
3237          * properly init'd or not
3238          */
3239         if (!decomp_param[i].compbuf) {
3240             break;
3241         }
3242
3243         qemu_mutex_lock(&decomp_param[i].mutex);
3244         decomp_param[i].quit = true;
3245         qemu_cond_signal(&decomp_param[i].cond);
3246         qemu_mutex_unlock(&decomp_param[i].mutex);
3247     }
3248     for (i = 0; i < thread_count; i++) {
3249         if (!decomp_param[i].compbuf) {
3250             break;
3251         }
3252
3253         qemu_thread_join(decompress_threads + i);
3254         qemu_mutex_destroy(&decomp_param[i].mutex);
3255         qemu_cond_destroy(&decomp_param[i].cond);
3256         inflateEnd(&decomp_param[i].stream);
3257         g_free(decomp_param[i].compbuf);
3258         decomp_param[i].compbuf = NULL;
3259     }
3260     g_free(decompress_threads);
3261     g_free(decomp_param);
3262     decompress_threads = NULL;
3263     decomp_param = NULL;
3264     decomp_file = NULL;
3265 }
3266
3267 static int compress_threads_load_setup(QEMUFile *f)
3268 {
3269     int i, thread_count;
3270
3271     if (!migrate_use_compression()) {
3272         return 0;
3273     }
3274
3275     thread_count = migrate_decompress_threads();
3276     decompress_threads = g_new0(QemuThread, thread_count);
3277     decomp_param = g_new0(DecompressParam, thread_count);
3278     qemu_mutex_init(&decomp_done_lock);
3279     qemu_cond_init(&decomp_done_cond);
3280     decomp_file = f;
3281     for (i = 0; i < thread_count; i++) {
3282         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3283             goto exit;
3284         }
3285
3286         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3287         qemu_mutex_init(&decomp_param[i].mutex);
3288         qemu_cond_init(&decomp_param[i].cond);
3289         decomp_param[i].done = true;
3290         decomp_param[i].quit = false;
3291         qemu_thread_create(decompress_threads + i, "decompress",
3292                            do_data_decompress, decomp_param + i,
3293                            QEMU_THREAD_JOINABLE);
3294     }
3295     return 0;
3296 exit:
3297     compress_threads_load_cleanup();
3298     return -1;
3299 }
3300
3301 static void decompress_data_with_multi_threads(QEMUFile *f,
3302                                                void *host, int len)
3303 {
3304     int idx, thread_count;
3305
3306     thread_count = migrate_decompress_threads();
3307     QEMU_LOCK_GUARD(&decomp_done_lock);
3308     while (true) {
3309         for (idx = 0; idx < thread_count; idx++) {
3310             if (decomp_param[idx].done) {
3311                 decomp_param[idx].done = false;
3312                 qemu_mutex_lock(&decomp_param[idx].mutex);
3313                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3314                 decomp_param[idx].des = host;
3315                 decomp_param[idx].len = len;
3316                 qemu_cond_signal(&decomp_param[idx].cond);
3317                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3318                 break;
3319             }
3320         }
3321         if (idx < thread_count) {
3322             break;
3323         } else {
3324             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3325         }
3326     }
3327 }
3328
/* Initialise the global ram_state for COLO by calling ram_state_init(). */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}
3333
3334 /*
3335  * colo cache: this is for secondary VM, we cache the whole
3336  * memory of the secondary VM, it is need to hold the global lock
3337  * to call this helper.
3338  */
3339 int colo_init_ram_cache(void)
3340 {
3341     RAMBlock *block;
3342
3343     WITH_RCU_READ_LOCK_GUARD() {
3344         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3345             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3346                                                     NULL,
3347                                                     false);
3348             if (!block->colo_cache) {
3349                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3350                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3351                              block->used_length);
3352                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3353                     if (block->colo_cache) {
3354                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3355                         block->colo_cache = NULL;
3356                     }
3357                 }
3358                 return -errno;
3359             }
3360         }
3361     }
3362
3363     /*
3364     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3365     * with to decide which page in cache should be flushed into SVM's RAM. Here
3366     * we use the same name 'ram_bitmap' as for migration.
3367     */
3368     if (ram_bytes_total()) {
3369         RAMBlock *block;
3370
3371         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3372             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3373             block->bmap = bitmap_new(pages);
3374         }
3375     }
3376
3377     colo_init_ram_state();
3378     return 0;
3379 }
3380
/*
 * Start dirty logging on the COLO incoming side.
 *
 * Takes the iothread lock and the ramlist lock, syncs the global dirty
 * log into every non-ignored RAMBlock's bitmap and then zeroes those
 * bitmaps, starts global dirty logging and resets the dirty page counter
 * of ram_state to zero.
 *
 * TODO: duplicated with ram_init_bitmaps
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start();
    }
    /* The bitmaps were just zeroed, so no page counts as dirty */
    ram_state->migration_dirty_pages = 0;
    /* Release the locks in the reverse order of acquisition */
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3402
3403 /* It is need to hold the global lock to call this helper */
3404 void colo_release_ram_cache(void)
3405 {
3406     RAMBlock *block;
3407
3408     memory_global_dirty_log_stop();
3409     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3410         g_free(block->bmap);
3411         block->bmap = NULL;
3412     }
3413
3414     WITH_RCU_READ_LOCK_GUARD() {
3415         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416             if (block->colo_cache) {
3417                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3418                 block->colo_cache = NULL;
3419             }
3420         }
3421     }
3422     ram_state_cleanup(&ram_state);
3423 }
3424
3425 /**
3426  * ram_load_setup: Setup RAM for migration incoming side
3427  *
3428  * Returns zero to indicate success and negative for error
3429  *
3430  * @f: QEMUFile where to receive the data
3431  * @opaque: RAMState pointer
3432  */
3433 static int ram_load_setup(QEMUFile *f, void *opaque)
3434 {
3435     if (compress_threads_load_setup(f)) {
3436         return -1;
3437     }
3438
3439     xbzrle_load_setup();
3440     ramblock_recv_map_init();
3441
3442     return 0;
3443 }
3444
3445 static int ram_load_cleanup(void *opaque)
3446 {
3447     RAMBlock *rb;
3448
3449     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3450         qemu_ram_block_writeback(rb);
3451     }
3452
3453     xbzrle_load_cleanup();
3454     compress_threads_load_cleanup();
3455
3456     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3457         g_free(rb->receivedmap);
3458         rb->receivedmap = NULL;
3459     }
3460
3461     return 0;
3462 }
3463
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work; this function only forwards
 * @mis to it and returns its result.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
3479
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Reads page records from @f until a record carrying RAM_SAVE_FLAG_EOS is
 * seen or an error occurs.  The target pages of a host page are collected
 * (in the temporary page for everything but plain target-page-sized data)
 * and the host page is placed once all of its target pages have been
 * received.
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to read the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    /* Set once the last target page of the current host page was read */
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    /* Host page that the target pages received so far belong to */
    void *host_page = NULL;
    bool all_zero = true;
    /* Number of target pages received for the current host page */
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* The flags are carried in the bits below the target page mask */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (target_pages == 1) {
                host_page = host_page_from_ram_block_offset(block, addr);
            } else if (host_page != host_page_from_ram_block_offset(block,
                                                                    addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page %p/%p", host_page,
                             host_page_from_ram_block_offset(block, addr));
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (all_zero) {
                ret = postcopy_place_page_zero(mis, host_page, block);
            } else {
                ret = postcopy_place_page(mis, host_page, place_source,
                                          block);
            }
            /* Start collecting target pages for the next host page */
            place_needed = false;
            target_pages = 0;
            /* Assume we have a zero page until we detect something different */
            all_zero = true;
        }
    }

    return ret;
}
3661
3662 static bool postcopy_is_advised(void)
3663 {
3664     PostcopyState ps = postcopy_state_get();
3665     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3666 }
3667
3668 static bool postcopy_is_running(void)
3669 {
3670     PostcopyState ps = postcopy_state_get();
3671     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3672 }
3673
3674 /*
3675  * Flush content of RAM cache into SVM's memory.
3676  * Only flush the pages that be dirtied by PVM or SVM or both.
3677  */
3678 void colo_flush_ram_cache(void)
3679 {
3680     RAMBlock *block = NULL;
3681     void *dst_host;
3682     void *src_host;
3683     unsigned long offset = 0;
3684
3685     memory_global_dirty_log_sync();
3686     WITH_RCU_READ_LOCK_GUARD() {
3687         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3688             ramblock_sync_dirty_bitmap(ram_state, block);
3689         }
3690     }
3691
3692     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3693     WITH_RCU_READ_LOCK_GUARD() {
3694         block = QLIST_FIRST_RCU(&ram_list.blocks);
3695
3696         while (block) {
3697             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3698
3699             if (!offset_in_ramblock(block,
3700                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3701                 offset = 0;
3702                 block = QLIST_NEXT_RCU(block, next);
3703             } else {
3704                 migration_bitmap_clear_dirty(ram_state, block, offset);
3705                 dst_host = block->host
3706                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3707                 src_host = block->colo_cache
3708                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3709                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3710             }
3711         }
3712     }
3713     trace_colo_flush_ram_cache_end();
3714 }
3715
3716 /**
3717  * ram_load_precopy: load pages in precopy case
3718  *
3719  * Returns 0 for success or -errno in case of error
3720  *
3721  * Called in precopy mode by ram_load().
3722  * rcu_read_lock is taken prior to this being called.
3723  *
3724  * @f: QEMUFile where to send the data
3725  */
3726 static int ram_load_precopy(QEMUFile *f)
3727 {
3728     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3729     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3730     bool postcopy_advised = postcopy_is_advised();
3731     if (!migrate_use_compression()) {
3732         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3733     }
3734
3735     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3736         ram_addr_t addr, total_ram_bytes;
3737         void *host = NULL, *host_bak = NULL;
3738         uint8_t ch;
3739
3740         /*
3741          * Yield periodically to let main loop run, but an iteration of
3742          * the main loop is expensive, so do it each some iterations
3743          */
3744         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3745             aio_co_schedule(qemu_get_current_aio_context(),
3746                             qemu_coroutine_self());
3747             qemu_coroutine_yield();
3748         }
3749         i++;
3750
3751         addr = qemu_get_be64(f);
3752         flags = addr & ~TARGET_PAGE_MASK;
3753         addr &= TARGET_PAGE_MASK;
3754
3755         if (flags & invalid_flags) {
3756             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3757                 error_report("Received an unexpected compressed page");
3758             }
3759
3760             ret = -EINVAL;
3761             break;
3762         }
3763
3764         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3765                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3766             RAMBlock *block = ram_block_from_stream(f, flags);
3767
3768             host = host_from_ram_block_offset(block, addr);
3769             /*
3770              * After going into COLO stage, we should not load the page
3771              * into SVM's memory directly, we put them into colo_cache firstly.
3772              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3773              * Previously, we copied all these memory in preparing stage of COLO
3774              * while we need to stop VM, which is a time-consuming process.
3775              * Here we optimize it by a trick, back-up every page while in
3776              * migration process while COLO is enabled, though it affects the
3777              * speed of the migration, but it obviously reduce the downtime of
3778              * back-up all SVM'S memory in COLO preparing stage.
3779              */
3780             if (migration_incoming_colo_enabled()) {
3781                 if (migration_incoming_in_colo_state()) {
3782                     /* In COLO stage, put all pages into cache temporarily */
3783                     host = colo_cache_from_block_offset(block, addr, true);
3784                 } else {
3785                    /*
3786                     * In migration stage but before COLO stage,
3787                     * Put all pages into both cache and SVM's memory.
3788                     */
3789                     host_bak = colo_cache_from_block_offset(block, addr, false);
3790                 }
3791             }
3792             if (!host) {
3793                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3794                 ret = -EINVAL;
3795                 break;
3796             }
3797             if (!migration_incoming_in_colo_state()) {
3798                 ramblock_recv_bitmap_set(block, host);
3799             }
3800
3801             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3802         }
3803
3804         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3805         case RAM_SAVE_FLAG_MEM_SIZE:
3806             /* Synchronize RAM block list */
3807             total_ram_bytes = addr;
3808             while (!ret && total_ram_bytes) {
3809                 RAMBlock *block;
3810                 char id[256];
3811                 ram_addr_t length;
3812
3813                 len = qemu_get_byte(f);
3814                 qemu_get_buffer(f, (uint8_t *)id, len);
3815                 id[len] = 0;
3816                 length = qemu_get_be64(f);
3817
3818                 block = qemu_ram_block_by_name(id);
3819                 if (block && !qemu_ram_is_migratable(block)) {
3820                     error_report("block %s should not be migrated !", id);
3821                     ret = -EINVAL;
3822                 } else if (block) {
3823                     if (length != block->used_length) {
3824                         Error *local_err = NULL;
3825
3826                         ret = qemu_ram_resize(block, length,
3827                                               &local_err);
3828                         if (local_err) {
3829                             error_report_err(local_err);
3830                         }
3831                     }
3832                     /* For postcopy we need to check hugepage sizes match */
3833                     if (postcopy_advised && migrate_postcopy_ram() &&
3834                         block->page_size != qemu_host_page_size) {
3835                         uint64_t remote_page_size = qemu_get_be64(f);
3836                         if (remote_page_size != block->page_size) {
3837                             error_report("Mismatched RAM page size %s "
3838                                          "(local) %zd != %" PRId64,
3839                                          id, block->page_size,
3840                                          remote_page_size);
3841                             ret = -EINVAL;
3842                         }
3843                     }
3844                     if (migrate_ignore_shared()) {
3845                         hwaddr addr = qemu_get_be64(f);
3846                         if (ramblock_is_ignored(block) &&
3847                             block->mr->addr != addr) {
3848                             error_report("Mismatched GPAs for block %s "
3849                                          "%" PRId64 "!= %" PRId64,
3850                                          id, (uint64_t)addr,
3851                                          (uint64_t)block->mr->addr);
3852                             ret = -EINVAL;
3853                         }
3854                     }
3855                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3856                                           block->idstr);
3857                 } else {
3858                     error_report("Unknown ramblock \"%s\", cannot "
3859                                  "accept migration", id);
3860                     ret = -EINVAL;
3861                 }
3862
3863                 total_ram_bytes -= length;
3864             }
3865             break;
3866
3867         case RAM_SAVE_FLAG_ZERO:
3868             ch = qemu_get_byte(f);
3869             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3870             break;
3871
3872         case RAM_SAVE_FLAG_PAGE:
3873             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3874             break;
3875
3876         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3877             len = qemu_get_be32(f);
3878             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3879                 error_report("Invalid compressed data length: %d", len);
3880                 ret = -EINVAL;
3881                 break;
3882             }
3883             decompress_data_with_multi_threads(f, host, len);
3884             break;
3885
3886         case RAM_SAVE_FLAG_XBZRLE:
3887             if (load_xbzrle(f, addr, host) < 0) {
3888                 error_report("Failed to decompress XBZRLE page at "
3889                              RAM_ADDR_FMT, addr);
3890                 ret = -EINVAL;
3891                 break;
3892             }
3893             break;
3894         case RAM_SAVE_FLAG_EOS:
3895             /* normal exit */
3896             multifd_recv_sync_main();
3897             break;
3898         default:
3899             if (flags & RAM_SAVE_FLAG_HOOK) {
3900                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3901             } else {
3902                 error_report("Unknown combination of migration flags: 0x%x",
3903                              flags);
3904                 ret = -EINVAL;
3905             }
3906         }
3907         if (!ret) {
3908             ret = qemu_file_get_error(f);
3909         }
3910         if (!ret && host_bak) {
3911             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3912         }
3913     }
3914
3915     ret |= wait_for_decompress_done();
3916     return ret;
3917 }
3918
3919 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3920 {
3921     int ret = 0;
3922     static uint64_t seq_iter;
3923     /*
3924      * If system is running in postcopy mode, page inserts to host memory must
3925      * be atomic
3926      */
3927     bool postcopy_running = postcopy_is_running();
3928
3929     seq_iter++;
3930
3931     if (version_id != 4) {
3932         return -EINVAL;
3933     }
3934
3935     /*
3936      * This RCU critical section can be very long running.
3937      * When RCU reclaims in the code start to become numerous,
3938      * it will be necessary to reduce the granularity of this
3939      * critical section.
3940      */
3941     WITH_RCU_READ_LOCK_GUARD() {
3942         if (postcopy_running) {
3943             ret = ram_load_postcopy(f);
3944         } else {
3945             ret = ram_load_precopy(f);
3946         }
3947     }
3948     trace_ram_load_complete(ret, seq_iter);
3949
3950     return ret;
3951 }
3952
3953 static bool ram_has_postcopy(void *opaque)
3954 {
3955     RAMBlock *rb;
3956     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3957         if (ramblock_is_pmem(rb)) {
3958             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3959                          "is not supported now!", rb->idstr, rb->host);
3960             return false;
3961         }
3962     }
3963
3964     return migrate_postcopy_ram();
3965 }
3966
3967 /* Sync all the dirty bitmap with destination VM.  */
3968 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3969 {
3970     RAMBlock *block;
3971     QEMUFile *file = s->to_dst_file;
3972     int ramblock_count = 0;
3973
3974     trace_ram_dirty_bitmap_sync_start();
3975
3976     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3977         qemu_savevm_send_recv_bitmap(file, block->idstr);
3978         trace_ram_dirty_bitmap_request(block->idstr);
3979         ramblock_count++;
3980     }
3981
3982     trace_ram_dirty_bitmap_sync_wait();
3983
3984     /* Wait until all the ramblocks' dirty bitmap synced */
3985     while (ramblock_count--) {
3986         qemu_sem_wait(&s->rp_state.rp_sem);
3987     }
3988
3989     trace_ram_dirty_bitmap_sync_complete();
3990
3991     return 0;
3992 }
3993
/*
 * Post rp_sem once.  ram_dirty_bitmap_sync_all() waits on this semaphore
 * once for every ramblock bitmap it requested.
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
3998
3999 /*
4000  * Read the received bitmap, revert it as the initial dirty bitmap.
4001  * This is only used when the postcopy migration is paused but wants
4002  * to resume from a middle point.
4003  */
4004 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4005 {
4006     int ret = -EINVAL;
4007     QEMUFile *file = s->rp_state.from_dst_file;
4008     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4009     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4010     uint64_t size, end_mark;
4011
4012     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4013
4014     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4015         error_report("%s: incorrect state %s", __func__,
4016                      MigrationStatus_str(s->state));
4017         return -EINVAL;
4018     }
4019
4020     /*
4021      * Note: see comments in ramblock_recv_bitmap_send() on why we
4022      * need the endianness conversion, and the paddings.
4023      */
4024     local_size = ROUND_UP(local_size, 8);
4025
4026     /* Add paddings */
4027     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4028
4029     size = qemu_get_be64(file);
4030
4031     /* The size of the bitmap should match with our ramblock */
4032     if (size != local_size) {
4033         error_report("%s: ramblock '%s' bitmap size mismatch "
4034                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4035                      block->idstr, size, local_size);
4036         ret = -EINVAL;
4037         goto out;
4038     }
4039
4040     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4041     end_mark = qemu_get_be64(file);
4042
4043     ret = qemu_file_get_error(file);
4044     if (ret || size != local_size) {
4045         error_report("%s: read bitmap failed for ramblock '%s': %d"
4046                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4047                      __func__, block->idstr, ret, local_size, size);
4048         ret = -EIO;
4049         goto out;
4050     }
4051
4052     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4053         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4054                      __func__, block->idstr, end_mark);
4055         ret = -EINVAL;
4056         goto out;
4057     }
4058
4059     /*
4060      * Endianness conversion. We are during postcopy (though paused).
4061      * The dirty bitmap won't change. We can directly modify it.
4062      */
4063     bitmap_from_le(block->bmap, le_bitmap, nbits);
4064
4065     /*
4066      * What we received is "received bitmap". Revert it as the initial
4067      * dirty bitmap for this ramblock.
4068      */
4069     bitmap_complement(block->bmap, block->bmap, nbits);
4070
4071     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4072
4073     /*
4074      * We succeeded to sync bitmap for current ramblock. If this is
4075      * the last one to sync, we need to notify the main send thread.
4076      */
4077     ram_dirty_bitmap_reload_notify(s);
4078
4079     ret = 0;
4080 out:
4081     g_free(le_bitmap);
4082     return ret;
4083 }
4084
4085 static int ram_resume_prepare(MigrationState *s, void *opaque)
4086 {
4087     RAMState *rs = *(RAMState **)opaque;
4088     int ret;
4089
4090     ret = ram_dirty_bitmap_sync_all(s, rs);
4091     if (ret) {
4092         return ret;
4093     }
4094
4095     ram_state_resume_prepare(rs, s->to_dst_file);
4096
4097     return 0;
4098 }
4099
4100 static SaveVMHandlers savevm_ram_handlers = {
4101     .save_setup = ram_save_setup,
4102     .save_live_iterate = ram_save_iterate,
4103     .save_live_complete_postcopy = ram_save_complete,
4104     .save_live_complete_precopy = ram_save_complete,
4105     .has_postcopy = ram_has_postcopy,
4106     .save_live_pending = ram_save_pending,
4107     .load_state = ram_load,
4108     .save_cleanup = ram_save_cleanup,
4109     .load_setup = ram_load_setup,
4110     .load_cleanup = ram_load_cleanup,
4111     .resume_prepare = ram_resume_prepare,
4112 };
4113
4114 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4115                                       size_t old_size, size_t new_size)
4116 {
4117     PostcopyState ps = postcopy_state_get();
4118     ram_addr_t offset;
4119     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4120     Error *err = NULL;
4121
4122     if (ramblock_is_ignored(rb)) {
4123         return;
4124     }
4125
4126     if (!migration_is_idle()) {
4127         /*
4128          * Precopy code on the source cannot deal with the size of RAM blocks
4129          * changing at random points in time - especially after sending the
4130          * RAM block sizes in the migration stream, they must no longer change.
4131          * Abort and indicate a proper reason.
4132          */
4133         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4134         migrate_set_error(migrate_get_current(), err);
4135         error_free(err);
4136         migration_cancel();
4137     }
4138
4139     switch (ps) {
4140     case POSTCOPY_INCOMING_ADVISE:
4141         /*
4142          * Update what ram_postcopy_incoming_init()->init_range() does at the
4143          * time postcopy was advised. Syncing RAM blocks with the source will
4144          * result in RAM resizes.
4145          */
4146         if (old_size < new_size) {
4147             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4148                 error_report("RAM block '%s' discard of resized RAM failed",
4149                              rb->idstr);
4150             }
4151         }
4152         rb->postcopy_length = new_size;
4153         break;
4154     case POSTCOPY_INCOMING_NONE:
4155     case POSTCOPY_INCOMING_RUNNING:
4156     case POSTCOPY_INCOMING_END:
4157         /*
4158          * Once our guest is running, postcopy does no longer care about
4159          * resizes. When growing, the new memory was not available on the
4160          * source, no handler needed.
4161          */
4162         break;
4163     default:
4164         error_report("RAM block '%s' resized during postcopy state: %d",
4165                      rb->idstr, ps);
4166         exit(-1);
4167     }
4168 }
4169
/* RAM block notifier used by migration; only the resize callback is set. */
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
4173
/*
 * Set up RAM migration: initialize the XBZRLE lock, register the "ram"
 * live savevm handlers with ram_state as their opaque, and add the RAM
 * block resize notifier.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    /*
     * NOTE(review): the 0 and 4 are presumably the instance id and the
     * section version; ram_load() rejects any version_id other than 4.
     */
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}