migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/main-loop.h"
  34 #include "xbzrle.h"
  35 #include "ram.h"
  36 #include "migration.h"
  37 #include "migration/register.h"
  38 #include "migration/misc.h"
  39 #include "qemu-file.h"
  40 #include "postcopy-ram.h"
  41 #include "page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "qapi/error.h"
  44 #include "qapi/qapi-types-migration.h"
  45 #include "qapi/qapi-events-migration.h"
  46 #include "qapi/qmp/qerror.h"
  47 #include "trace.h"
  48 #include "exec/ram_addr.h"
  49 #include "exec/target_page.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52 #include "block.h"
  53 #include "sysemu/cpu-throttle.h"
  54 #include "savevm.h"
  55 #include "qemu/iov.h"
  56 #include "multifd.h"
  57 #include "sysemu/runstate.h"
  58
  59 #if defined(__linux__)
  60 #include "qemu/userfaultfd.h"
  61 #endif /* defined(__linux__) */
  62
  63 /***********************************************************/
  64 /* ram save/restore */
  65
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was renamed.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  81
  82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  83 {
  84     return buffer_is_zero(p, size);
  85 }
  86
/* XBZRLE statistics; updated by save_xbzrle_page() and migration_update_rates() */
XBZRLECacheStats xbzrle_counters;

/*
 * Global XBZRLE state: the page cache plus the scratch buffers used
 * while encoding/decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 104
 105 static void XBZRLE_cache_lock(void)
 106 {
 107     if (migrate_use_xbzrle()) {
 108         qemu_mutex_lock(&XBZRLE.lock);
 109     }
 110 }
 111
 112 static void XBZRLE_cache_unlock(void)
 113 {
 114     if (migrate_use_xbzrle()) {
 115         qemu_mutex_unlock(&XBZRLE.lock);
 116     }
 117 }
 118
 119 /**
 120  * xbzrle_cache_resize: resize the xbzrle cache
 121  *
 122  * This function is called from migrate_params_apply in main
 123  * thread, possibly while a migration is in progress.  A running
 124  * migration may be using the cache and might finish during this call,
 125  * hence changes to the cache are protected by XBZRLE.lock().
 126  *
 127  * Returns 0 for success or -1 for error
 128  *
 129  * @new_size: new cache size
 130  * @errp: set *errp if the check failed, with reason
 131  */
 132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 133 {
 134     PageCache *new_cache;
 135     int64_t ret = 0;
 136
 137     /* Check for truncation */
 138     if (new_size != (size_t)new_size) {
 139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 140                    "exceeding address space");
 141         return -1;
 142     }
 143
 144     if (new_size == migrate_xbzrle_cache_size()) {
 145         /* nothing to do */
 146         return 0;
 147     }
 148
 149     XBZRLE_cache_lock();
 150
 151     if (XBZRLE.cache != NULL) {
 152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 153         if (!new_cache) {
 154             ret = -1;
 155             goto out;
 156         }
 157
 158         cache_fini(XBZRLE.cache);
 159         XBZRLE.cache = new_cache;
 160     }
 161 out:
 162     XBZRLE_cache_unlock();
 163     return ret;
 164 }
 165
 166 bool ramblock_is_ignored(RAMBlock *block)
 167 {
 168     return !qemu_ram_is_migratable(block) ||
 169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 170 }
 171
 172 #undef RAMBLOCK_FOREACH
 173
 174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 175 {
 176     RAMBlock *block;
 177     int ret = 0;
 178
 179     RCU_READ_LOCK_GUARD();
 180
 181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 182         ret = func(block, opaque);
 183         if (ret) {
 184             break;
 185         }
 186     }
 187     return ret;
 188 }
 189
 190 static void ramblock_recv_map_init(void)
 191 {
 192     RAMBlock *rb;
 193
 194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 195         assert(!rb->receivedmap);
 196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 197     }
 198 }
 199
 200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 201 {
 202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 203                     rb->receivedmap);
 204 }
 205
 206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 207 {
 208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 209 }
 210
 211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 212 {
 213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 217                                     size_t nr)
 218 {
 219     bitmap_set_atomic(rb->receivedmap,
 220                       ramblock_recv_bitmap_offset(host_addr, rb),
 221                       nr);
 222 }
 223
 224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 225
 226 /*
 227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 228  *
 229  * Returns >0 if success with sent bytes, or <0 if error.
 230  */
 231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 232                                   const char *block_name)
 233 {
 234     RAMBlock *block = qemu_ram_block_by_name(block_name);
 235     unsigned long *le_bitmap, nbits;
 236     uint64_t size;
 237
 238     if (!block) {
 239         error_report("%s: invalid block name: %s", __func__, block_name);
 240         return -1;
 241     }
 242
 243     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 244
 245     /*
 246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 247      * machines we may need 4 more bytes for padding (see below
 248      * comment). So extend it a bit before hand.
 249      */
 250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 251
 252     /*
 253      * Always use little endian when sending the bitmap. This is
 254      * required that when source and destination VMs are not using the
 255      * same endianness. (Note: big endian won't work.)
 256      */
 257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 258
 259     /* Size of the bitmap, in bytes */
 260     size = DIV_ROUND_UP(nbits, 8);
 261
 262     /*
 263      * size is always aligned to 8 bytes for 64bit machines, but it
 264      * may not be true for 32bit machines. We need this padding to
 265      * make sure the migration can survive even between 32bit and
 266      * 64bit machines.
 267      */
 268     size = ROUND_UP(size, 8);
 269
 270     qemu_put_be64(file, size);
 271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 272     /*
 273      * Mark as an end, in case the middle part is screwed up due to
 274      * some "mysterious" reason.
 275      */
 276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 277     qemu_fflush(file);
 278
 279     g_free(le_bitmap);
 280
 281     if (qemu_file_get_error(file)) {
 282         return qemu_file_get_error(file);
 283     }
 284
 285     return size + sizeof(size);
 286 }
 287
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* presumably the start offset of the request within @rb -- TODO confirm */
    hwaddr    offset;
    /* presumably the length of the request in bytes -- TODO confirm */
    hwaddr    len;

    /* linkage for RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 299
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 355
/* Current RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 359
/* Initialise the precopy notifier list. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 364
/* Add @n to the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 369
/* Remove @n from the notifier list it was added to. */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 uint64_t ram_bytes_remaining(void)
 385 {
 386     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 387                        0;
 388 }
 389
/* Global RAM migration statistics */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

/* Global statistics for multi-thread compression */
CompressionStats compression_counters;
 404
/* Per-thread state handed to do_data_compress() */
struct CompressParam {
    /* set by the worker (under comp_done_lock) once a page is compressed */
    bool done;
    /* set (under mutex) to ask the worker thread to exit */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* dummy QEMUFile used as a buffer for the compressed data */
    QEMUFile *file;
    /* mutex/cond pair used to hand work (block, offset) to the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* pending work: non-NULL block means a page is waiting */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 420
/*
 * Per-thread decompression state.
 * NOTE(review): the users of this struct are outside this view; the
 * field roles below are presumed from their names and from the
 * parallel CompressParam -- confirm against the decompress code.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 432
/*
 * Arrays with one entry per compression thread, allocated in
 * compress_threads_save_setup() and freed in
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/*
 * Decompression-side state.
 * NOTE(review): not used in the visible part of the file; presumably
 * the counterparts of the compression globals above.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration; called from do_data_compress() below. */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 452
/*
 * Compression worker thread body.
 *
 * @opaque: the CompressParam of this thread
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL
 * the (block, offset) pair is taken, the page is compressed with
 * do_compress_ram_page(), and completion is published by setting
 * param->done / param->zero_page under comp_done_lock and signalling
 * comp_done_cond.  Otherwise the thread sleeps on param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the request and mark the slot empty before unlocking */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result under comp_done_lock and signal comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing to do: wait until signalled on param->cond */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 486
 487 static void compress_threads_save_cleanup(void)
 488 {
 489     int i, thread_count;
 490
 491     if (!migrate_use_compression() || !comp_param) {
 492         return;
 493     }
 494
 495     thread_count = migrate_compress_threads();
 496     for (i = 0; i < thread_count; i++) {
 497         /*
 498          * we use it as a indicator which shows if the thread is
 499          * properly init'd or not
 500          */
 501         if (!comp_param[i].file) {
 502             break;
 503         }
 504
 505         qemu_mutex_lock(&comp_param[i].mutex);
 506         comp_param[i].quit = true;
 507         qemu_cond_signal(&comp_param[i].cond);
 508         qemu_mutex_unlock(&comp_param[i].mutex);
 509
 510         qemu_thread_join(compress_threads + i);
 511         qemu_mutex_destroy(&comp_param[i].mutex);
 512         qemu_cond_destroy(&comp_param[i].cond);
 513         deflateEnd(&comp_param[i].stream);
 514         g_free(comp_param[i].originbuf);
 515         qemu_fclose(comp_param[i].file);
 516         comp_param[i].file = NULL;
 517     }
 518     qemu_mutex_destroy(&comp_done_lock);
 519     qemu_cond_destroy(&comp_done_cond);
 520     g_free(compress_threads);
 521     g_free(comp_param);
 522     compress_threads = NULL;
 523     comp_param = NULL;
 524 }
 525
 526 static int compress_threads_save_setup(void)
 527 {
 528     int i, thread_count;
 529
 530     if (!migrate_use_compression()) {
 531         return 0;
 532     }
 533     thread_count = migrate_compress_threads();
 534     compress_threads = g_new0(QemuThread, thread_count);
 535     comp_param = g_new0(CompressParam, thread_count);
 536     qemu_cond_init(&comp_done_cond);
 537     qemu_mutex_init(&comp_done_lock);
 538     for (i = 0; i < thread_count; i++) {
 539         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 540         if (!comp_param[i].originbuf) {
 541             goto exit;
 542         }
 543
 544         if (deflateInit(&comp_param[i].stream,
 545                         migrate_compress_level()) != Z_OK) {
 546             g_free(comp_param[i].originbuf);
 547             goto exit;
 548         }
 549
 550         /* comp_param[i].file is just used as a dummy buffer to save data,
 551          * set its ops to empty.
 552          */
 553         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 554         comp_param[i].done = true;
 555         comp_param[i].quit = false;
 556         qemu_mutex_init(&comp_param[i].mutex);
 557         qemu_cond_init(&comp_param[i].cond);
 558         qemu_thread_create(compress_threads + i, "compress",
 559                            do_data_compress, comp_param + i,
 560                            QEMU_THREAD_JOINABLE);
 561     }
 562     return 0;
 563
 564 exit:
 565     compress_threads_save_cleanup();
 566     return -1;
 567 }
 568
 569 /**
 570  * save_page_header: write page header to wire
 571  *
 572  * If this is the 1st block, it also writes the block identification
 573  *
 574  * Returns the number of bytes written
 575  *
 576  * @f: QEMUFile where to send the data
 577  * @block: block that contains the page we want to send
 578  * @offset: offset inside the block for the page
 579  *          in the lower bits, it contains flags
 580  */
 581 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 582                                ram_addr_t offset)
 583 {
 584     size_t size, len;
 585
 586     if (block == rs->last_sent_block) {
 587         offset |= RAM_SAVE_FLAG_CONTINUE;
 588     }
 589     qemu_put_be64(f, offset);
 590     size = 8;
 591
 592     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 593         len = strlen(block->idstr);
 594         qemu_put_byte(f, len);
 595         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 596         size += 1 + len;
 597         rs->last_sent_block = block;
 598     }
 599     return size;
 600 }
 601
 602 /**
 603  * mig_throttle_guest_down: throotle down the guest
 604  *
 605  * Reduce amount of guest cpu execution to hopefully slow down memory
 606  * writes. If guest dirty memory rate is reduced below the rate at
 607  * which we can transfer pages to the destination then we should be
 608  * able to complete migration. Some workloads dirty memory way too
 609  * fast and will not effectively converge, even with auto-converge.
 610  */
 611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 612                                     uint64_t bytes_dirty_threshold)
 613 {
 614     MigrationState *s = migrate_get_current();
 615     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 616     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 617     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 618     int pct_max = s->parameters.max_cpu_throttle;
 619
 620     uint64_t throttle_now = cpu_throttle_get_percentage();
 621     uint64_t cpu_now, cpu_ideal, throttle_inc;
 622
 623     /* We have not started throttling yet. Let's start it. */
 624     if (!cpu_throttle_active()) {
 625         cpu_throttle_set(pct_initial);
 626     } else {
 627         /* Throttling already on, just increase the rate */
 628         if (!pct_tailslow) {
 629             throttle_inc = pct_increment;
 630         } else {
 631             /* Compute the ideal CPU percentage used by Guest, which may
 632              * make the dirty rate match the dirty rate threshold. */
 633             cpu_now = 100 - throttle_now;
 634             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 635                         bytes_dirty_period);
 636             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 637         }
 638         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 639     }
 640 }
 641
 642 /**
 643  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 644  *
 645  * @rs: current RAM state
 646  * @current_addr: address for the zero page
 647  *
 648  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 649  * The important thing is that a stale (not-yet-0'd) page be replaced
 650  * by the new data.
 651  * As a bonus, if the page wasn't in the cache it gets added so that
 652  * when a small write is made into the 0'd page it gets XBZRLE sent.
 653  */
 654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 655 {
 656     if (!rs->xbzrle_enabled) {
 657         return;
 658     }
 659
 660     /* We don't care if this fails to allocate a new cache page
 661      * as long as it updated an old one */
 662     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 663                  ram_counters.dirty_sync_count);
 664 }
 665
/* Encoding byte written after the page header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal, or that
 *             the page was not in the cache (cache miss); nothing has
 *             been written in either case
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents; may be
 *                redirected to the cached copy of the page
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to encode against; optionally cache the page */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Page unchanged relative to the cached copy: nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoding overflowed: account a full page and let the caller send it */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* + 1 byte encoding flag + 2 bytes encoded length */
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
 768
 769 /**
 770  * migration_bitmap_find_dirty: find the next dirty page from start
 771  *
 772  * Returns the page offset within memory region of the start of a dirty page
 773  *
 774  * @rs: current RAM state
 775  * @rb: RAMBlock where to search for dirty pages
 776  * @start: page where we start the search
 777  */
 778 static inline
 779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 780                                           unsigned long start)
 781 {
 782     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 783     unsigned long *bitmap = rb->bmap;
 784
 785     if (ramblock_is_ignored(rb)) {
 786         return size;
 787     }
 788
 789     return find_next_bit(bitmap, size, start);
 790 }
 791
 792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 793                                                 RAMBlock *rb,
 794                                                 unsigned long page)
 795 {
 796     bool ret;
 797
 798     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
 799
 800     /*
 801      * Clear dirty bitmap if needed.  This _must_ be called before we
 802      * send any of the page in the chunk because we need to make sure
 803      * we can capture further page content changes when we sync dirty
 804      * log the next time.  So as long as we are going to send any of
 805      * the page in the chunk we clear the remote dirty bitmap for all.
 806      * Clearing it earlier won't be a problem, but too late will.
 807      */
 808     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 809         uint8_t shift = rb->clear_bmap_shift;
 810         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 811         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 812
 813         /*
 814          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 815          * can make things easier sometimes since then start address
 816          * of the small chunk will always be 64 pages aligned so the
 817          * bitmap will always be aligned to unsigned long.  We should
 818          * even be able to remove this restriction but I'm simply
 819          * keeping it.
 820          */
 821         assert(shift >= 6);
 822         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 823         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 824     }
 825
 826     ret = test_and_clear_bit(page, rb->bmap);
 827
 828     if (ret) {
 829         rs->migration_dirty_pages--;
 830     }
 831
 832     return ret;
 833 }
 834
 835 /* Called with RCU critical section */
 836 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 837 {
 838     uint64_t new_dirty_pages =
 839         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 840
 841     rs->migration_dirty_pages += new_dirty_pages;
 842     rs->num_dirty_pages_period += new_dirty_pages;
 843 }
 844
 845 /**
 846  * ram_pagesize_summary: calculate all the pagesizes of a VM
 847  *
 848  * Returns a summary bitmap of the page sizes of all RAMBlocks
 849  *
 850  * For VMs with just normal pages this is equivalent to the host page
 851  * size. If it's got some huge pages then it's the OR of all the
 852  * different page sizes.
 853  */
 854 uint64_t ram_pagesize_summary(void)
 855 {
 856     RAMBlock *block;
 857     uint64_t summary = 0;
 858
 859     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 860         summary |= block->page_size;
 861     }
 862
 863     return summary;
 864 }
 865
 866 uint64_t ram_get_total_transferred_pages(void)
 867 {
 868     return  ram_counters.normal + ram_counters.duplicate +
 869                 compression_counters.pages + xbzrle_counters.pages;
 870 }
 871
 872 static void migration_update_rates(RAMState *rs, int64_t end_time)
 873 {
 874     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 875     double compressed_size;
 876
 877     /* calculate period counters */
 878     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 879                 / (end_time - rs->time_last_bitmap_sync);
 880
 881     if (!page_count) {
 882         return;
 883     }
 884
 885     if (migrate_use_xbzrle()) {
 886         double encoded_size, unencoded_size;
 887
 888         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 889             rs->xbzrle_cache_miss_prev) / page_count;
 890         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 891         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 892                          TARGET_PAGE_SIZE;
 893         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 894         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 895             xbzrle_counters.encoding_rate = 0;
 896         } else {
 897             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 898         }
 899         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 900         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 901     }
 902
 903     if (migrate_use_compression()) {
 904         compression_counters.busy_rate = (double)(compression_counters.busy -
 905             rs->compress_thread_busy_prev) / page_count;
 906         rs->compress_thread_busy_prev = compression_counters.busy;
 907
 908         compressed_size = compression_counters.compressed_size -
 909                           rs->compressed_size_prev;
 910         if (compressed_size) {
 911             double uncompressed_size = (compression_counters.pages -
 912                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 913
 914             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 915             compression_counters.compression_rate =
 916                                         uncompressed_size / compressed_size;
 917
 918             rs->compress_pages_prev = compression_counters.pages;
 919             rs->compressed_size_prev = compression_counters.compressed_size;
 920         }
 921     }
 922 }
 923
 924 static void migration_trigger_throttle(RAMState *rs)
 925 {
 926     MigrationState *s = migrate_get_current();
 927     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 928
 929     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 930     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 931     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 932
 933     /* During block migration the auto-converge logic incorrectly detects
 934      * that ram migration makes no progress. Avoid this by disabling the
 935      * throttling logic during the bulk phase of block migration. */
 936     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 937         /* The following detection logic can be refined later. For now:
 938            Check to see if the ratio between dirtied bytes and the approx.
 939            amount of bytes that just got transferred since the last time
 940            we were in this routine reaches the threshold. If that happens
 941            twice, start or increase throttling. */
 942
 943         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 944             (++rs->dirty_rate_high_cnt >= 2)) {
 945             trace_migration_throttle();
 946             rs->dirty_rate_high_cnt = 0;
 947             mig_throttle_guest_down(bytes_dirty_period,
 948                                     bytes_dirty_threshold);
 949         }
 950     }
 951 }
 952
 953 static void migration_bitmap_sync(RAMState *rs)
 954 {
 955     RAMBlock *block;
 956     int64_t end_time;
 957
 958     ram_counters.dirty_sync_count++;
 959
 960     if (!rs->time_last_bitmap_sync) {
 961         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 962     }
 963
 964     trace_migration_bitmap_sync_start();
 965     memory_global_dirty_log_sync();
 966
 967     qemu_mutex_lock(&rs->bitmap_mutex);
 968     WITH_RCU_READ_LOCK_GUARD() {
 969         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 970             ramblock_sync_dirty_bitmap(rs, block);
 971         }
 972         ram_counters.remaining = ram_bytes_remaining();
 973     }
 974     qemu_mutex_unlock(&rs->bitmap_mutex);
 975
 976     memory_global_after_dirty_log_sync();
 977     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 978
 979     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 980
 981     /* more than 1 second = 1000 millisecons */
 982     if (end_time > rs->time_last_bitmap_sync + 1000) {
 983         migration_trigger_throttle(rs);
 984
 985         migration_update_rates(rs, end_time);
 986
 987         rs->target_page_count_prev = rs->target_page_count;
 988
 989         /* reset period counters */
 990         rs->time_last_bitmap_sync = end_time;
 991         rs->num_dirty_pages_period = 0;
 992         rs->bytes_xfer_prev = ram_counters.transferred;
 993     }
 994     if (migrate_use_events()) {
 995         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
 996     }
 997 }
 998
 999 static void migration_bitmap_sync_precopy(RAMState *rs)
1000 {
1001     Error *local_err = NULL;
1002
1003     /*
1004      * The current notifier usage is just an optimization to migration, so we
1005      * don't stop the normal migration process in the error case.
1006      */
1007     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1008         error_report_err(local_err);
1009         local_err = NULL;
1010     }
1011
1012     migration_bitmap_sync(rs);
1013
1014     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1015         error_report_err(local_err);
1016     }
1017 }
1018
1019 /**
1020  * save_zero_page_to_file: send the zero page to the file
1021  *
1022  * Returns the size of data written to the file, 0 means the page is not
1023  * a zero page
1024  *
1025  * @rs: current RAM state
1026  * @file: the file where the data is saved
1027  * @block: block that contains the page we want to send
1028  * @offset: offset inside the block for the page
1029  */
1030 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1031                                   RAMBlock *block, ram_addr_t offset)
1032 {
1033     uint8_t *p = block->host + offset;
1034     int len = 0;
1035
1036     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1037         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1038         qemu_put_byte(file, 0);
1039         len += 1;
1040     }
1041     return len;
1042 }
1043
1044 /**
1045  * save_zero_page: send the zero page to the stream
1046  *
1047  * Returns the number of pages written.
1048  *
1049  * @rs: current RAM state
1050  * @block: block that contains the page we want to send
1051  * @offset: offset inside the block for the page
1052  */
1053 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1054 {
1055     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1056
1057     if (len) {
1058         ram_counters.duplicate++;
1059         ram_counters.transferred += len;
1060         return 1;
1061     }
1062     return -1;
1063 }
1064
1065 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1066 {
1067     if (!migrate_release_ram() || !migration_in_postcopy()) {
1068         return;
1069     }
1070
1071     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1072 }
1073
1074 /*
1075  * @pages: the number of pages written by the control path,
1076  *        < 0 - error
1077  *        > 0 - number of pages written
1078  *
1079  * Return true if the pages has been saved, otherwise false is returned.
1080  */
1081 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1082                               int *pages)
1083 {
1084     uint64_t bytes_xmit = 0;
1085     int ret;
1086
1087     *pages = -1;
1088     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1089                                 &bytes_xmit);
1090     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1091         return false;
1092     }
1093
1094     if (bytes_xmit) {
1095         ram_counters.transferred += bytes_xmit;
1096         *pages = 1;
1097     }
1098
1099     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1100         return true;
1101     }
1102
1103     if (bytes_xmit > 0) {
1104         ram_counters.normal++;
1105     } else if (bytes_xmit == 0) {
1106         ram_counters.duplicate++;
1107     }
1108
1109     return true;
1110 }
1111
1112 /*
1113  * directly send the page to the stream
1114  *
1115  * Returns the number of pages written.
1116  *
1117  * @rs: current RAM state
1118  * @block: block that contains the page we want to send
1119  * @offset: offset inside the block for the page
1120  * @buf: the page to be sent
1121  * @async: send to page asyncly
1122  */
1123 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1124                             uint8_t *buf, bool async)
1125 {
1126     ram_counters.transferred += save_page_header(rs, rs->f, block,
1127                                                  offset | RAM_SAVE_FLAG_PAGE);
1128     if (async) {
1129         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1130                               migrate_release_ram() &
1131                               migration_in_postcopy());
1132     } else {
1133         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1134     }
1135     ram_counters.transferred += TARGET_PAGE_SIZE;
1136     ram_counters.normal++;
1137     return 1;
1138 }
1139
1140 /**
1141  * ram_save_page: send the given page to the stream
1142  *
1143  * Returns the number of pages written.
1144  *          < 0 - error
1145  *          >=0 - Number of pages written - this might legally be 0
1146  *                if xbzrle noticed the page was the same.
1147  *
1148  * @rs: current RAM state
1149  * @block: block that contains the page we want to send
1150  * @offset: offset inside the block for the page
1151  * @last_stage: if we are at the completion stage
1152  */
1153 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1154 {
1155     int pages = -1;
1156     uint8_t *p;
1157     bool send_async = true;
1158     RAMBlock *block = pss->block;
1159     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1160     ram_addr_t current_addr = block->offset + offset;
1161
1162     p = block->host + offset;
1163     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1164
1165     XBZRLE_cache_lock();
1166     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1167         pages = save_xbzrle_page(rs, &p, current_addr, block,
1168                                  offset, last_stage);
1169         if (!last_stage) {
1170             /* Can't send this cached data async, since the cache page
1171              * might get updated before it gets to the wire
1172              */
1173             send_async = false;
1174         }
1175     }
1176
1177     /* XBZRLE overflow or normal page */
1178     if (pages == -1) {
1179         pages = save_normal_page(rs, block, offset, p, send_async);
1180     }
1181
1182     XBZRLE_cache_unlock();
1183
1184     return pages;
1185 }
1186
1187 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1188                                  ram_addr_t offset)
1189 {
1190     if (multifd_queue_page(rs->f, block, offset) < 0) {
1191         return -1;
1192     }
1193     ram_counters.normal++;
1194
1195     return 1;
1196 }
1197
1198 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1199                                  ram_addr_t offset, uint8_t *source_buf)
1200 {
1201     RAMState *rs = ram_state;
1202     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1203     bool zero_page = false;
1204     int ret;
1205
1206     if (save_zero_page_to_file(rs, f, block, offset)) {
1207         zero_page = true;
1208         goto exit;
1209     }
1210
1211     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1212
1213     /*
1214      * copy it to a internal buffer to avoid it being modified by VM
1215      * so that we can catch up the error during compression and
1216      * decompression
1217      */
1218     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1219     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1220     if (ret < 0) {
1221         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1222         error_report("compressed data failed!");
1223         return false;
1224     }
1225
1226 exit:
1227     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1228     return zero_page;
1229 }
1230
1231 static void
1232 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1233 {
1234     ram_counters.transferred += bytes_xmit;
1235
1236     if (param->zero_page) {
1237         ram_counters.duplicate++;
1238         return;
1239     }
1240
1241     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1242     compression_counters.compressed_size += bytes_xmit - 8;
1243     compression_counters.pages++;
1244 }
1245
1246 static bool save_page_use_compression(RAMState *rs);
1247
1248 static void flush_compressed_data(RAMState *rs)
1249 {
1250     int idx, len, thread_count;
1251
1252     if (!save_page_use_compression(rs)) {
1253         return;
1254     }
1255     thread_count = migrate_compress_threads();
1256
1257     qemu_mutex_lock(&comp_done_lock);
1258     for (idx = 0; idx < thread_count; idx++) {
1259         while (!comp_param[idx].done) {
1260             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1261         }
1262     }
1263     qemu_mutex_unlock(&comp_done_lock);
1264
1265     for (idx = 0; idx < thread_count; idx++) {
1266         qemu_mutex_lock(&comp_param[idx].mutex);
1267         if (!comp_param[idx].quit) {
1268             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1269             /*
1270              * it's safe to fetch zero_page without holding comp_done_lock
1271              * as there is no further request submitted to the thread,
1272              * i.e, the thread should be waiting for a request at this point.
1273              */
1274             update_compress_thread_counts(&comp_param[idx], len);
1275         }
1276         qemu_mutex_unlock(&comp_param[idx].mutex);
1277     }
1278 }
1279
1280 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1281                                        ram_addr_t offset)
1282 {
1283     param->block = block;
1284     param->offset = offset;
1285 }
1286
1287 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1288                                            ram_addr_t offset)
1289 {
1290     int idx, thread_count, bytes_xmit = -1, pages = -1;
1291     bool wait = migrate_compress_wait_thread();
1292
1293     thread_count = migrate_compress_threads();
1294     qemu_mutex_lock(&comp_done_lock);
1295 retry:
1296     for (idx = 0; idx < thread_count; idx++) {
1297         if (comp_param[idx].done) {
1298             comp_param[idx].done = false;
1299             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1300             qemu_mutex_lock(&comp_param[idx].mutex);
1301             set_compress_params(&comp_param[idx], block, offset);
1302             qemu_cond_signal(&comp_param[idx].cond);
1303             qemu_mutex_unlock(&comp_param[idx].mutex);
1304             pages = 1;
1305             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1306             break;
1307         }
1308     }
1309
1310     /*
1311      * wait for the free thread if the user specifies 'compress-wait-thread',
1312      * otherwise we will post the page out in the main thread as normal page.
1313      */
1314     if (pages < 0 && wait) {
1315         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1316         goto retry;
1317     }
1318     qemu_mutex_unlock(&comp_done_lock);
1319
1320     return pages;
1321 }
1322
1323 /**
1324  * find_dirty_block: find the next dirty page and update any state
1325  * associated with the search process.
1326  *
1327  * Returns true if a page is found
1328  *
1329  * @rs: current RAM state
1330  * @pss: data about the state of the current dirty page scan
1331  * @again: set to false if the search has scanned the whole of RAM
1332  */
1333 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1334 {
1335     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1336     if (pss->complete_round && pss->block == rs->last_seen_block &&
1337         pss->page >= rs->last_page) {
1338         /*
1339          * We've been once around the RAM and haven't found anything.
1340          * Give up.
1341          */
1342         *again = false;
1343         return false;
1344     }
1345     if (!offset_in_ramblock(pss->block,
1346                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1347         /* Didn't find anything in this RAM Block */
1348         pss->page = 0;
1349         pss->block = QLIST_NEXT_RCU(pss->block, next);
1350         if (!pss->block) {
1351             /*
1352              * If memory migration starts over, we will meet a dirtied page
1353              * which may still exists in compression threads's ring, so we
1354              * should flush the compressed data to make sure the new page
1355              * is not overwritten by the old one in the destination.
1356              *
1357              * Also If xbzrle is on, stop using the data compression at this
1358              * point. In theory, xbzrle can do better than compression.
1359              */
1360             flush_compressed_data(rs);
1361
1362             /* Hit the end of the list */
1363             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1364             /* Flag that we've looped */
1365             pss->complete_round = true;
1366             /* After the first round, enable XBZRLE. */
1367             if (migrate_use_xbzrle()) {
1368                 rs->xbzrle_enabled = true;
1369             }
1370         }
1371         /* Didn't find anything this time, but try again on the new block */
1372         *again = true;
1373         return false;
1374     } else {
1375         /* Can go around again, but... */
1376         *again = true;
1377         /* We've found something so probably don't need to */
1378         return true;
1379     }
1380 }
1381
1382 /**
1383  * unqueue_page: gets a page of the queue
1384  *
1385  * Helper for 'get_queued_page' - gets a page off the queue
1386  *
1387  * Returns the block of the page (or NULL if none available)
1388  *
1389  * @rs: current RAM state
1390  * @offset: used to return the offset within the RAMBlock
1391  */
1392 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1393 {
1394     RAMBlock *block = NULL;
1395
1396     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1397         return NULL;
1398     }
1399
1400     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1401     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1402         struct RAMSrcPageRequest *entry =
1403                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1404         block = entry->rb;
1405         *offset = entry->offset;
1406
1407         if (entry->len > TARGET_PAGE_SIZE) {
1408             entry->len -= TARGET_PAGE_SIZE;
1409             entry->offset += TARGET_PAGE_SIZE;
1410         } else {
1411             memory_region_unref(block->mr);
1412             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1413             g_free(entry);
1414             migration_consume_urgent_request();
1415         }
1416     }
1417
1418     return block;
1419 }
1420
1421 #if defined(__linux__)
1422 /**
1423  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1424  *   is found, return RAM block pointer and page offset
1425  *
1426  * Returns pointer to the RAMBlock containing faulting page,
1427  *   NULL if no write faults are pending
1428  *
1429  * @rs: current RAM state
1430  * @offset: page offset from the beginning of the block
1431  */
1432 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1433 {
1434     struct uffd_msg uffd_msg;
1435     void *page_address;
1436     RAMBlock *block;
1437     int res;
1438
1439     if (!migrate_background_snapshot()) {
1440         return NULL;
1441     }
1442
1443     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1444     if (res <= 0) {
1445         return NULL;
1446     }
1447
1448     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1449     block = qemu_ram_block_from_host(page_address, false, offset);
1450     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1451     return block;
1452 }
1453
1454 /**
1455  * ram_save_release_protection: release UFFD write protection after
1456  *   a range of pages has been saved
1457  *
1458  * @rs: current RAM state
1459  * @pss: page-search-status structure
1460  * @start_page: index of the first page in the range relative to pss->block
1461  *
1462  * Returns 0 on success, negative value in case of an error
1463 */
1464 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1465         unsigned long start_page)
1466 {
1467     int res = 0;
1468
1469     /* Check if page is from UFFD-managed region. */
1470     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1471         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1472         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1473
1474         /* Flush async buffers before un-protect. */
1475         qemu_fflush(rs->f);
1476         /* Un-protect memory range. */
1477         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1478                 false, false);
1479     }
1480
1481     return res;
1482 }
1483
1484 /* ram_write_tracking_available: check if kernel supports required UFFD features
1485  *
1486  * Returns true if supports, false otherwise
1487  */
1488 bool ram_write_tracking_available(void)
1489 {
1490     uint64_t uffd_features;
1491     int res;
1492
1493     res = uffd_query_features(&uffd_features);
1494     return (res == 0 &&
1495             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1496 }
1497
1498 /* ram_write_tracking_compatible: check if guest configuration is
1499  *   compatible with 'write-tracking'
1500  *
1501  * Returns true if compatible, false otherwise
1502  */
1503 bool ram_write_tracking_compatible(void)
1504 {
1505     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1506     int uffd_fd;
1507     RAMBlock *block;
1508     bool ret = false;
1509
1510     /* Open UFFD file descriptor */
1511     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1512     if (uffd_fd < 0) {
1513         return false;
1514     }
1515
1516     RCU_READ_LOCK_GUARD();
1517
1518     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1519         uint64_t uffd_ioctls;
1520
1521         /* Nothing to do with read-only and MMIO-writable regions */
1522         if (block->mr->readonly || block->mr->rom_device) {
1523             continue;
1524         }
1525         /* Try to register block memory via UFFD-IO to track writes */
1526         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1527                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1528             goto out;
1529         }
1530         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1531             goto out;
1532         }
1533     }
1534     ret = true;
1535
1536 out:
1537     uffd_close_fd(uffd_fd);
1538     return ret;
1539 }
1540
1541 /*
1542  * ram_block_populate_pages: populate memory in the RAM block by reading
1543  *   an integer from the beginning of each page.
1544  *
1545  * Since it's solely used for userfault_fd WP feature, here we just
1546  *   hardcode page size to qemu_real_host_page_size.
1547  *
1548  * @block: RAM block to populate
1549  */
1550 static void ram_block_populate_pages(RAMBlock *block)
1551 {
1552     char *ptr = (char *) block->host;
1553
1554     for (ram_addr_t offset = 0; offset < block->used_length;
1555             offset += qemu_real_host_page_size) {
1556         char tmp = *(ptr + offset);
1557
1558         /* Don't optimize the read out */
1559         asm volatile("" : "+r" (tmp));
1560     }
1561 }
1562
1563 /*
1564  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1565  */
1566 void ram_write_tracking_prepare(void)
1567 {
1568     RAMBlock *block;
1569
1570     RCU_READ_LOCK_GUARD();
1571
1572     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1573         /* Nothing to do with read-only and MMIO-writable regions */
1574         if (block->mr->readonly || block->mr->rom_device) {
1575             continue;
1576         }
1577
1578         /*
1579          * Populate pages of the RAM block before enabling userfault_fd
1580          * write protection.
1581          *
1582          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1583          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1584          * pages with pte_none() entries in page table.
1585          */
1586         ram_block_populate_pages(block);
1587     }
1588 }
1589
1590 /*
1591  * ram_write_tracking_start: start UFFD-WP memory tracking
1592  *
1593  * Returns 0 for success or negative value in case of error
1594  */
1595 int ram_write_tracking_start(void)
1596 {
1597     int uffd_fd;
1598     RAMState *rs = ram_state;
1599     RAMBlock *block;
1600
1601     /* Open UFFD file descriptor */
1602     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1603     if (uffd_fd < 0) {
1604         return uffd_fd;
1605     }
1606     rs->uffdio_fd = uffd_fd;
1607
1608     RCU_READ_LOCK_GUARD();
1609
1610     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1611         /* Nothing to do with read-only and MMIO-writable regions */
1612         if (block->mr->readonly || block->mr->rom_device) {
1613             continue;
1614         }
1615
1616         /* Register block memory with UFFD to track writes */
1617         if (uffd_register_memory(rs->uffdio_fd, block->host,
1618                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1619             goto fail;
1620         }
1621         /* Apply UFFD write protection to the block memory range */
1622         if (uffd_change_protection(rs->uffdio_fd, block->host,
1623                 block->max_length, true, false)) {
1624             goto fail;
1625         }
1626         block->flags |= RAM_UF_WRITEPROTECT;
1627         memory_region_ref(block->mr);
1628
1629         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1630                 block->host, block->max_length);
1631     }
1632
1633     return 0;
1634
1635 fail:
1636     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1637
1638     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1640             continue;
1641         }
1642         /*
1643          * In case some memory block failed to be write-protected
1644          * remove protection and unregister all succeeded RAM blocks
1645          */
1646         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1647                 false, false);
1648         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1649         /* Cleanup flags and remove reference */
1650         block->flags &= ~RAM_UF_WRITEPROTECT;
1651         memory_region_unref(block->mr);
1652     }
1653
1654     uffd_close_fd(uffd_fd);
1655     rs->uffdio_fd = -1;
1656     return -1;
1657 }
1658
1659 /**
1660  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1661  */
1662 void ram_write_tracking_stop(void)
1663 {
1664     RAMState *rs = ram_state;
1665     RAMBlock *block;
1666
1667     RCU_READ_LOCK_GUARD();
1668
1669     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1670         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1671             continue;
1672         }
1673         /* Remove protection and unregister all affected RAM blocks */
1674         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675                 false, false);
1676         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1677
1678         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1679                 block->host, block->max_length);
1680
1681         /* Cleanup flags and remove reference */
1682         block->flags &= ~RAM_UF_WRITEPROTECT;
1683         memory_region_unref(block->mr);
1684     }
1685
1686     /* Finally close UFFD file descriptor */
1687     uffd_close_fd(rs->uffdio_fd);
1688     rs->uffdio_fd = -1;
1689 }
1690
1691 #else
1692 /* No target OS support, stubs just fail or ignore */
1693
1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1695 {
1696     (void) rs;
1697     (void) offset;
1698
1699     return NULL;
1700 }
1701
1702 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1703         unsigned long start_page)
1704 {
1705     (void) rs;
1706     (void) pss;
1707     (void) start_page;
1708
1709     return 0;
1710 }
1711
/* Non-Linux stub: write tracking is never reported as available. */
bool ram_write_tracking_available(void)
{
    return false;
}
1716
/*
 * Non-Linux stub: not expected to be called, so it asserts.  The return
 * value only matters if the assertion is compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1722
/*
 * Non-Linux stub: not expected to be called, so it asserts.  Reports
 * failure (-1) should the assertion be compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1728
/* Non-Linux stub: not expected to be called, so it asserts. */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1733 #endif /* defined(__linux__) */
1734
1735 /**
1736  * get_queued_page: unqueue a page from the postcopy requests
1737  *
1738  * Skips pages that are already sent (!dirty)
1739  *
1740  * Returns true if a queued page is found
1741  *
1742  * @rs: current RAM state
1743  * @pss: data about the state of the current dirty page scan
1744  */
1745 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1746 {
1747     RAMBlock  *block;
1748     ram_addr_t offset;
1749     bool dirty;
1750
1751     do {
1752         block = unqueue_page(rs, &offset);
1753         /*
1754          * We're sending this page, and since it's postcopy nothing else
1755          * will dirty it, and we must make sure it doesn't get sent again
1756          * even if this queue request was received after the background
1757          * search already sent it.
1758          */
1759         if (block) {
1760             unsigned long page;
1761
1762             page = offset >> TARGET_PAGE_BITS;
1763             dirty = test_bit(page, block->bmap);
1764             if (!dirty) {
1765                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1766                                                 page);
1767             } else {
1768                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1769             }
1770         }
1771
1772     } while (block && !dirty);
1773
1774     if (!block) {
1775         /*
1776          * Poll write faults too if background snapshot is enabled; that's
1777          * when we have vcpus got blocked by the write protected pages.
1778          */
1779         block = poll_fault_page(rs, &offset);
1780     }
1781
1782     if (block) {
1783         /*
1784          * We want the background search to continue from the queued page
1785          * since the guest is likely to want other pages near to the page
1786          * it just requested.
1787          */
1788         pss->block = block;
1789         pss->page = offset >> TARGET_PAGE_BITS;
1790
1791         /*
1792          * This unqueued page would break the "one round" check, even is
1793          * really rare.
1794          */
1795         pss->complete_round = false;
1796     }
1797
1798     return !!block;
1799 }
1800
1801 /**
1802  * migration_page_queue_free: drop any remaining pages in the ram
1803  * request queue
1804  *
1805  * It should be empty at the end anyway, but in error cases there may
1806  * be some left.  in case that there is any page left, we drop it.
1807  *
1808  */
1809 static void migration_page_queue_free(RAMState *rs)
1810 {
1811     struct RAMSrcPageRequest *mspr, *next_mspr;
1812     /* This queue generally should be empty - but in the case of a failed
1813      * migration might have some droppings in.
1814      */
1815     RCU_READ_LOCK_GUARD();
1816     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1817         memory_region_unref(mspr->rb->mr);
1818         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1819         g_free(mspr);
1820     }
1821 }
1822
1823 /**
1824  * ram_save_queue_pages: queue the page for transmission
1825  *
1826  * A request from postcopy destination for example.
1827  *
1828  * Returns zero on success or negative on error
1829  *
1830  * @rbname: Name of the RAMBLock of the request. NULL means the
1831  *          same that last one.
1832  * @start: starting address from the start of the RAMBlock
1833  * @len: length (in bytes) to send
1834  */
1835 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1836 {
1837     RAMBlock *ramblock;
1838     RAMState *rs = ram_state;
1839
1840     ram_counters.postcopy_requests++;
1841     RCU_READ_LOCK_GUARD();
1842
1843     if (!rbname) {
1844         /* Reuse last RAMBlock */
1845         ramblock = rs->last_req_rb;
1846
1847         if (!ramblock) {
1848             /*
1849              * Shouldn't happen, we can't reuse the last RAMBlock if
1850              * it's the 1st request.
1851              */
1852             error_report("ram_save_queue_pages no previous block");
1853             return -1;
1854         }
1855     } else {
1856         ramblock = qemu_ram_block_by_name(rbname);
1857
1858         if (!ramblock) {
1859             /* We shouldn't be asked for a non-existent RAMBlock */
1860             error_report("ram_save_queue_pages no block '%s'", rbname);
1861             return -1;
1862         }
1863         rs->last_req_rb = ramblock;
1864     }
1865     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1866     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1867         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1868                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1869                      __func__, start, len, ramblock->used_length);
1870         return -1;
1871     }
1872
1873     struct RAMSrcPageRequest *new_entry =
1874         g_malloc0(sizeof(struct RAMSrcPageRequest));
1875     new_entry->rb = ramblock;
1876     new_entry->offset = start;
1877     new_entry->len = len;
1878
1879     memory_region_ref(ramblock->mr);
1880     qemu_mutex_lock(&rs->src_page_req_mutex);
1881     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1882     migration_make_urgent_request();
1883     qemu_mutex_unlock(&rs->src_page_req_mutex);
1884
1885     return 0;
1886 }
1887
1888 static bool save_page_use_compression(RAMState *rs)
1889 {
1890     if (!migrate_use_compression()) {
1891         return false;
1892     }
1893
1894     /*
1895      * If xbzrle is enabled (e.g., after first round of migration), stop
1896      * using the data compression. In theory, xbzrle can do better than
1897      * compression.
1898      */
1899     if (rs->xbzrle_enabled) {
1900         return false;
1901     }
1902
1903     return true;
1904 }
1905
1906 /*
1907  * try to compress the page before posting it out, return true if the page
1908  * has been properly handled by compression, otherwise needs other
1909  * paths to handle it
1910  */
1911 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1912 {
1913     if (!save_page_use_compression(rs)) {
1914         return false;
1915     }
1916
1917     /*
1918      * When starting the process of a new block, the first page of
1919      * the block should be sent out before other pages in the same
1920      * block, and all the pages in last block should have been sent
1921      * out, keeping this order is important, because the 'cont' flag
1922      * is used to avoid resending the block name.
1923      *
1924      * We post the fist page as normal page as compression will take
1925      * much CPU resource.
1926      */
1927     if (block != rs->last_sent_block) {
1928         flush_compressed_data(rs);
1929         return false;
1930     }
1931
1932     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1933         return true;
1934     }
1935
1936     compression_counters.busy++;
1937     return false;
1938 }
1939
1940 /**
1941  * ram_save_target_page: save one target page
1942  *
1943  * Returns the number of pages written
1944  *
1945  * @rs: current RAM state
1946  * @pss: data about the page we want to send
1947  * @last_stage: if we are at the completion stage
1948  */
1949 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1950                                 bool last_stage)
1951 {
1952     RAMBlock *block = pss->block;
1953     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1954     int res;
1955
1956     if (control_save_page(rs, block, offset, &res)) {
1957         return res;
1958     }
1959
1960     if (save_compress_page(rs, block, offset)) {
1961         return 1;
1962     }
1963
1964     res = save_zero_page(rs, block, offset);
1965     if (res > 0) {
1966         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1967          * page would be stale
1968          */
1969         if (!save_page_use_compression(rs)) {
1970             XBZRLE_cache_lock();
1971             xbzrle_cache_zero_page(rs, block->offset + offset);
1972             XBZRLE_cache_unlock();
1973         }
1974         ram_release_pages(block->idstr, offset, res);
1975         return res;
1976     }
1977
1978     /*
1979      * Do not use multifd for:
1980      * 1. Compression as the first page in the new block should be posted out
1981      *    before sending the compressed page
1982      * 2. In postcopy as one whole host page should be placed
1983      */
1984     if (!save_page_use_compression(rs) && migrate_use_multifd()
1985         && !migration_in_postcopy()) {
1986         return ram_save_multifd_page(rs, block, offset);
1987     }
1988
1989     return ram_save_page(rs, pss, last_stage);
1990 }
1991
1992 /**
1993  * ram_save_host_page: save a whole host page
1994  *
1995  * Starting at *offset send pages up to the end of the current host
1996  * page. It's valid for the initial offset to point into the middle of
1997  * a host page in which case the remainder of the hostpage is sent.
1998  * Only dirty target pages are sent. Note that the host page size may
1999  * be a huge page for this block.
2000  * The saving stops at the boundary of the used_length of the block
2001  * if the RAMBlock isn't a multiple of the host page size.
2002  *
2003  * Returns the number of pages written or negative on error
2004  *
2005  * @rs: current RAM state
2006  * @ms: current migration state
2007  * @pss: data about the page we want to send
2008  * @last_stage: if we are at the completion stage
2009  */
2010 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2011                               bool last_stage)
2012 {
2013     int tmppages, pages = 0;
2014     size_t pagesize_bits =
2015         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2016     unsigned long hostpage_boundary =
2017         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2018     unsigned long start_page = pss->page;
2019     int res;
2020
2021     if (ramblock_is_ignored(pss->block)) {
2022         error_report("block %s should not be migrated !", pss->block->idstr);
2023         return 0;
2024     }
2025
2026     do {
2027         /* Check the pages is dirty and if it is send it */
2028         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2029             tmppages = ram_save_target_page(rs, pss, last_stage);
2030             if (tmppages < 0) {
2031                 return tmppages;
2032             }
2033
2034             pages += tmppages;
2035             /*
2036              * Allow rate limiting to happen in the middle of huge pages if
2037              * something is sent in the current iteration.
2038              */
2039             if (pagesize_bits > 1 && tmppages > 0) {
2040                 migration_rate_limit();
2041             }
2042         }
2043         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2044     } while ((pss->page < hostpage_boundary) &&
2045              offset_in_ramblock(pss->block,
2046                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2047     /* The offset we leave with is the min boundary of host page and block */
2048     pss->page = MIN(pss->page, hostpage_boundary) - 1;
2049
2050     res = ram_save_release_protection(rs, pss, start_page);
2051     return (res < 0 ? res : pages);
2052 }
2053
2054 /**
2055  * ram_find_and_save_block: finds a dirty page and sends it to f
2056  *
2057  * Called within an RCU critical section.
2058  *
2059  * Returns the number of pages written where zero means no dirty pages,
2060  * or negative on error
2061  *
2062  * @rs: current RAM state
2063  * @last_stage: if we are at the completion stage
2064  *
2065  * On systems where host-page-size > target-page-size it will send all the
2066  * pages in a host page that are dirty.
2067  */
2068
2069 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2070 {
2071     PageSearchStatus pss;
2072     int pages = 0;
2073     bool again, found;
2074
2075     /* No dirty page as there is zero RAM */
2076     if (!ram_bytes_total()) {
2077         return pages;
2078     }
2079
2080     pss.block = rs->last_seen_block;
2081     pss.page = rs->last_page;
2082     pss.complete_round = false;
2083
2084     if (!pss.block) {
2085         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2086     }
2087
2088     do {
2089         again = true;
2090         found = get_queued_page(rs, &pss);
2091
2092         if (!found) {
2093             /* priority queue empty, so just search for something dirty */
2094             found = find_dirty_block(rs, &pss, &again);
2095         }
2096
2097         if (found) {
2098             pages = ram_save_host_page(rs, &pss, last_stage);
2099         }
2100     } while (!pages && again);
2101
2102     rs->last_seen_block = pss.block;
2103     rs->last_page = pss.page;
2104
2105     return pages;
2106 }
2107
2108 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2109 {
2110     uint64_t pages = size / TARGET_PAGE_SIZE;
2111
2112     if (zero) {
2113         ram_counters.duplicate += pages;
2114     } else {
2115         ram_counters.normal += pages;
2116         ram_counters.transferred += size;
2117         qemu_update_position(f, size);
2118     }
2119 }
2120
2121 static uint64_t ram_bytes_total_common(bool count_ignored)
2122 {
2123     RAMBlock *block;
2124     uint64_t total = 0;
2125
2126     RCU_READ_LOCK_GUARD();
2127
2128     if (count_ignored) {
2129         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2130             total += block->used_length;
2131         }
2132     } else {
2133         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2134             total += block->used_length;
2135         }
2136     }
2137     return total;
2138 }
2139
2140 uint64_t ram_bytes_total(void)
2141 {
2142     return ram_bytes_total_common(false);
2143 }
2144
2145 static void xbzrle_load_setup(void)
2146 {
2147     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2148 }
2149
2150 static void xbzrle_load_cleanup(void)
2151 {
2152     g_free(XBZRLE.decoded_buf);
2153     XBZRLE.decoded_buf = NULL;
2154 }
2155
2156 static void ram_state_cleanup(RAMState **rsp)
2157 {
2158     if (*rsp) {
2159         migration_page_queue_free(*rsp);
2160         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2161         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2162         g_free(*rsp);
2163         *rsp = NULL;
2164     }
2165 }
2166
2167 static void xbzrle_cleanup(void)
2168 {
2169     XBZRLE_cache_lock();
2170     if (XBZRLE.cache) {
2171         cache_fini(XBZRLE.cache);
2172         g_free(XBZRLE.encoded_buf);
2173         g_free(XBZRLE.current_buf);
2174         g_free(XBZRLE.zero_target_page);
2175         XBZRLE.cache = NULL;
2176         XBZRLE.encoded_buf = NULL;
2177         XBZRLE.current_buf = NULL;
2178         XBZRLE.zero_target_page = NULL;
2179     }
2180     XBZRLE_cache_unlock();
2181 }
2182
2183 static void ram_save_cleanup(void *opaque)
2184 {
2185     RAMState **rsp = opaque;
2186     RAMBlock *block;
2187
2188     /* We don't use dirty log with background snapshots */
2189     if (!migrate_background_snapshot()) {
2190         /* caller have hold iothread lock or is in a bh, so there is
2191          * no writing race against the migration bitmap
2192          */
2193         memory_global_dirty_log_stop();
2194     }
2195
2196     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2197         g_free(block->clear_bmap);
2198         block->clear_bmap = NULL;
2199         g_free(block->bmap);
2200         block->bmap = NULL;
2201     }
2202
2203     xbzrle_cleanup();
2204     compress_threads_save_cleanup();
2205     ram_state_cleanup(rsp);
2206 }
2207
2208 static void ram_state_reset(RAMState *rs)
2209 {
2210     rs->last_seen_block = NULL;
2211     rs->last_sent_block = NULL;
2212     rs->last_page = 0;
2213     rs->last_version = ram_list.version;
2214     rs->xbzrle_enabled = false;
2215 }
2216
2217 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2218
/*
 * Dump a bitmap to stderr, up to 128 bits per line, printing '1' for a
 * set bit and '.' for a clear one.  Each printed line is prefixed with
 * the index of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'todump' is the bitmap to dump.  There is no default bitmap to fall
 * back to, so a NULL 'todump' is ignored and nothing is printed.
 * 'pages' is the number of bits of 'todump' to look at.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to read from; test_bit() below must not see NULL */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            /* PRIx64 expects an unsigned 64-bit argument */
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2252
2253 /* **** functions for postcopy ***** */
2254
2255 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2256 {
2257     struct RAMBlock *block;
2258
2259     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2260         unsigned long *bitmap = block->bmap;
2261         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2262         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2263
2264         while (run_start < range) {
2265             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2266             ram_discard_range(block->idstr,
2267                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2268                               ((ram_addr_t)(run_end - run_start))
2269                                 << TARGET_PAGE_BITS);
2270             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2271         }
2272     }
2273 }
2274
2275 /**
2276  * postcopy_send_discard_bm_ram: discard a RAMBlock
2277  *
2278  * Returns zero on success
2279  *
2280  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2281  *
2282  * @ms: current migration state
2283  * @block: RAMBlock to discard
2284  */
2285 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2286 {
2287     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2288     unsigned long current;
2289     unsigned long *bitmap = block->bmap;
2290
2291     for (current = 0; current < end; ) {
2292         unsigned long one = find_next_bit(bitmap, end, current);
2293         unsigned long zero, discard_length;
2294
2295         if (one >= end) {
2296             break;
2297         }
2298
2299         zero = find_next_zero_bit(bitmap, end, one + 1);
2300
2301         if (zero >= end) {
2302             discard_length = end - one;
2303         } else {
2304             discard_length = zero - one;
2305         }
2306         postcopy_discard_send_range(ms, one, discard_length);
2307         current = one + discard_length;
2308     }
2309
2310     return 0;
2311 }
2312
2313 /**
2314  * postcopy_each_ram_send_discard: discard all RAMBlocks
2315  *
2316  * Returns 0 for success or negative for error
2317  *
2318  * Utility for the outgoing postcopy code.
2319  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2320  *   passing it bitmap indexes and name.
2321  * (qemu_ram_foreach_block ends up passing unscaled lengths
2322  *  which would mean postcopy code would have to deal with target page)
2323  *
2324  * @ms: current migration state
2325  */
2326 static int postcopy_each_ram_send_discard(MigrationState *ms)
2327 {
2328     struct RAMBlock *block;
2329     int ret;
2330
2331     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2332         postcopy_discard_send_init(ms, block->idstr);
2333
2334         /*
2335          * Postcopy sends chunks of bitmap over the wire, but it
2336          * just needs indexes at this point, avoids it having
2337          * target page specific code.
2338          */
2339         ret = postcopy_send_discard_bm_ram(ms, block);
2340         postcopy_discard_send_finish(ms);
2341         if (ret) {
2342             return ret;
2343         }
2344     }
2345
2346     return 0;
2347 }
2348
2349 /**
2350  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2351  *
2352  * Helper for postcopy_chunk_hostpages; it's called twice to
2353  * canonicalize the two bitmaps, that are similar, but one is
2354  * inverted.
2355  *
2356  * Postcopy requires that all target pages in a hostpage are dirty or
2357  * clean, not a mix.  This function canonicalizes the bitmaps.
2358  *
2359  * @ms: current migration state
2360  * @block: block that contains the page we want to canonicalize
2361  */
2362 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2363 {
2364     RAMState *rs = ram_state;
2365     unsigned long *bitmap = block->bmap;
2366     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2367     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2368     unsigned long run_start;
2369
2370     if (block->page_size == TARGET_PAGE_SIZE) {
2371         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2372         return;
2373     }
2374
2375     /* Find a dirty page */
2376     run_start = find_next_bit(bitmap, pages, 0);
2377
2378     while (run_start < pages) {
2379
2380         /*
2381          * If the start of this run of pages is in the middle of a host
2382          * page, then we need to fixup this host page.
2383          */
2384         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2385             /* Find the end of this run */
2386             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2387             /*
2388              * If the end isn't at the start of a host page, then the
2389              * run doesn't finish at the end of a host page
2390              * and we need to discard.
2391              */
2392         }
2393
2394         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2395             unsigned long page;
2396             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2397                                                              host_ratio);
2398             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2399
2400             /* Clean up the bitmap */
2401             for (page = fixup_start_addr;
2402                  page < fixup_start_addr + host_ratio; page++) {
2403                 /*
2404                  * Remark them as dirty, updating the count for any pages
2405                  * that weren't previously dirty.
2406                  */
2407                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2408             }
2409         }
2410
2411         /* Find the next dirty page for the next iteration */
2412         run_start = find_next_bit(bitmap, pages, run_start);
2413     }
2414 }
2415
2416 /**
2417  * postcopy_chunk_hostpages: discard any partially sent host page
2418  *
2419  * Utility for the outgoing postcopy code.
2420  *
2421  * Discard any partially sent host-page size chunks, mark any partially
2422  * dirty host-page size chunks as all dirty.  In this case the host-page
2423  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2424  *
2425  * Returns zero on success
2426  *
2427  * @ms: current migration state
2428  * @block: block we want to work with
2429  */
2430 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2431 {
2432     postcopy_discard_send_init(ms, block->idstr);
2433
2434     /*
2435      * Ensure that all partially dirty host pages are made fully dirty.
2436      */
2437     postcopy_chunk_hostpages_pass(ms, block);
2438
2439     postcopy_discard_send_finish(ms);
2440     return 0;
2441 }
2442
2443 /**
2444  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2445  *
2446  * Returns zero on success
2447  *
2448  * Transmit the set of pages to be discarded after precopy to the target
2449  * these are pages that:
2450  *     a) Have been previously transmitted but are now dirty again
2451  *     b) Pages that have never been transmitted, this ensures that
2452  *        any pages on the destination that have been mapped by background
2453  *        tasks get discarded (transparent huge pages is the specific concern)
2454  * Hopefully this is pretty sparse
2455  *
2456  * @ms: current migration state
2457  */
2458 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2459 {
2460     RAMState *rs = ram_state;
2461     RAMBlock *block;
2462     int ret;
2463
2464     RCU_READ_LOCK_GUARD();
2465
2466     /* This should be our last sync, the src is now paused */
2467     migration_bitmap_sync(rs);
2468
2469     /* Easiest way to make sure we don't resume in the middle of a host-page */
2470     rs->last_seen_block = NULL;
2471     rs->last_sent_block = NULL;
2472     rs->last_page = 0;
2473
2474     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2475         /* Deal with TPS != HPS and huge pages */
2476         ret = postcopy_chunk_hostpages(ms, block);
2477         if (ret) {
2478             return ret;
2479         }
2480
2481 #ifdef DEBUG_POSTCOPY
2482         ram_debug_dump_bitmap(block->bmap, true,
2483                               block->used_length >> TARGET_PAGE_BITS);
2484 #endif
2485     }
2486     trace_ram_postcopy_send_discard_bitmap();
2487
2488     return postcopy_each_ram_send_discard(ms);
2489 }
2490
2491 /**
2492  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2493  *
2494  * Returns zero on success
2495  *
2496  * @rbname: name of the RAMBlock of the request. NULL means the
2497  *          same that last one.
2498  * @start: RAMBlock starting page
2499  * @length: RAMBlock size
2500  */
2501 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2502 {
2503     trace_ram_discard_range(rbname, start, length);
2504
2505     RCU_READ_LOCK_GUARD();
2506     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2507
2508     if (!rb) {
2509         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2510         return -1;
2511     }
2512
2513     /*
2514      * On source VM, we don't need to update the received bitmap since
2515      * we don't even have one.
2516      */
2517     if (rb->receivedmap) {
2518         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2519                      length >> qemu_target_page_bits());
2520     }
2521
2522     return ram_block_discard_range(rb, start, length);
2523 }
2524
2525 /*
2526  * For every allocation, we will try not to crash the VM if the
2527  * allocation failed.
2528  */
2529 static int xbzrle_init(void)
2530 {
2531     Error *local_err = NULL;
2532
2533     if (!migrate_use_xbzrle()) {
2534         return 0;
2535     }
2536
2537     XBZRLE_cache_lock();
2538
2539     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2540     if (!XBZRLE.zero_target_page) {
2541         error_report("%s: Error allocating zero page", __func__);
2542         goto err_out;
2543     }
2544
2545     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2546                               TARGET_PAGE_SIZE, &local_err);
2547     if (!XBZRLE.cache) {
2548         error_report_err(local_err);
2549         goto free_zero_page;
2550     }
2551
2552     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2553     if (!XBZRLE.encoded_buf) {
2554         error_report("%s: Error allocating encoded_buf", __func__);
2555         goto free_cache;
2556     }
2557
2558     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2559     if (!XBZRLE.current_buf) {
2560         error_report("%s: Error allocating current_buf", __func__);
2561         goto free_encoded_buf;
2562     }
2563
2564     /* We are all good */
2565     XBZRLE_cache_unlock();
2566     return 0;
2567
2568 free_encoded_buf:
2569     g_free(XBZRLE.encoded_buf);
2570     XBZRLE.encoded_buf = NULL;
2571 free_cache:
2572     cache_fini(XBZRLE.cache);
2573     XBZRLE.cache = NULL;
2574 free_zero_page:
2575     g_free(XBZRLE.zero_target_page);
2576     XBZRLE.zero_target_page = NULL;
2577 err_out:
2578     XBZRLE_cache_unlock();
2579     return -ENOMEM;
2580 }
2581
2582 static int ram_state_init(RAMState **rsp)
2583 {
2584     *rsp = g_try_new0(RAMState, 1);
2585
2586     if (!*rsp) {
2587         error_report("%s: Init ramstate fail", __func__);
2588         return -1;
2589     }
2590
2591     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2592     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2593     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2594
2595     /*
2596      * Count the total number of pages used by ram blocks not including any
2597      * gaps due to alignment or unplugs.
2598      * This must match with the initial values of dirty bitmap.
2599      */
2600     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2601     ram_state_reset(*rsp);
2602
2603     return 0;
2604 }
2605
2606 static void ram_list_init_bitmaps(void)
2607 {
2608     MigrationState *ms = migrate_get_current();
2609     RAMBlock *block;
2610     unsigned long pages;
2611     uint8_t shift;
2612
2613     /* Skip setting bitmap if there is no RAM */
2614     if (ram_bytes_total()) {
2615         shift = ms->clear_bitmap_shift;
2616         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2617             error_report("clear_bitmap_shift (%u) too big, using "
2618                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2619             shift = CLEAR_BITMAP_SHIFT_MAX;
2620         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2621             error_report("clear_bitmap_shift (%u) too small, using "
2622                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2623             shift = CLEAR_BITMAP_SHIFT_MIN;
2624         }
2625
2626         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2627             pages = block->max_length >> TARGET_PAGE_BITS;
2628             /*
2629              * The initial dirty bitmap for migration must be set with all
2630              * ones to make sure we'll migrate every guest RAM page to
2631              * destination.
2632              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2633              * new migration after a failed migration, ram_list.
2634              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2635              * guest memory.
2636              */
2637             block->bmap = bitmap_new(pages);
2638             bitmap_set(block->bmap, 0, pages);
2639             block->clear_bmap_shift = shift;
2640             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2641         }
2642     }
2643 }
2644
2645 static void ram_init_bitmaps(RAMState *rs)
2646 {
2647     /* For memory_global_dirty_log_start below.  */
2648     qemu_mutex_lock_iothread();
2649     qemu_mutex_lock_ramlist();
2650
2651     WITH_RCU_READ_LOCK_GUARD() {
2652         ram_list_init_bitmaps();
2653         /* We don't use dirty log with background snapshots */
2654         if (!migrate_background_snapshot()) {
2655             memory_global_dirty_log_start();
2656             migration_bitmap_sync_precopy(rs);
2657         }
2658     }
2659     qemu_mutex_unlock_ramlist();
2660     qemu_mutex_unlock_iothread();
2661 }
2662
2663 static int ram_init_all(RAMState **rsp)
2664 {
2665     if (ram_state_init(rsp)) {
2666         return -1;
2667     }
2668
2669     if (xbzrle_init()) {
2670         ram_state_cleanup(rsp);
2671         return -1;
2672     }
2673
2674     ram_init_bitmaps(*rsp);
2675
2676     return 0;
2677 }
2678
2679 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2680 {
2681     RAMBlock *block;
2682     uint64_t pages = 0;
2683
2684     /*
2685      * Postcopy is not using xbzrle/compression, so no need for that.
2686      * Also, since source are already halted, we don't need to care
2687      * about dirty page logging as well.
2688      */
2689
2690     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2691         pages += bitmap_count_one(block->bmap,
2692                                   block->used_length >> TARGET_PAGE_BITS);
2693     }
2694
2695     /* This may not be aligned with current bitmaps. Recalculate. */
2696     rs->migration_dirty_pages = pages;
2697
2698     ram_state_reset(rs);
2699
2700     /* Update RAMState cache of output QEMUFile */
2701     rs->f = out;
2702
2703     trace_ram_state_resume_prepare(pages);
2704 }
2705
2706 /*
2707  * This function clears bits of the free pages reported by the caller from the
2708  * migration dirty bitmap. @addr is the host address corresponding to the
2709  * start of the continuous guest free pages, and @len is the total bytes of
2710  * those pages.
2711  */
2712 void qemu_guest_free_page_hint(void *addr, size_t len)
2713 {
2714     RAMBlock *block;
2715     ram_addr_t offset;
2716     size_t used_len, start, npages;
2717     MigrationState *s = migrate_get_current();
2718
2719     /* This function is currently expected to be used during live migration */
2720     if (!migration_is_setup_or_active(s->state)) {
2721         return;
2722     }
2723
2724     for (; len > 0; len -= used_len, addr += used_len) {
2725         block = qemu_ram_block_from_host(addr, false, &offset);
2726         if (unlikely(!block || offset >= block->used_length)) {
2727             /*
2728              * The implementation might not support RAMBlock resize during
2729              * live migration, but it could happen in theory with future
2730              * updates. So we add a check here to capture that case.
2731              */
2732             error_report_once("%s unexpected error", __func__);
2733             return;
2734         }
2735
2736         if (len <= block->used_length - offset) {
2737             used_len = len;
2738         } else {
2739             used_len = block->used_length - offset;
2740         }
2741
2742         start = offset >> TARGET_PAGE_BITS;
2743         npages = used_len >> TARGET_PAGE_BITS;
2744
2745         qemu_mutex_lock(&ram_state->bitmap_mutex);
2746         ram_state->migration_dirty_pages -=
2747                       bitmap_count_one_with_offset(block->bmap, start, npages);
2748         bitmap_clear(block->bmap, start, npages);
2749         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2750     }
2751 }
2752
2753 /*
2754  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2755  * long-running RCU critical section.  When rcu-reclaims in the code
2756  * start to become numerous it will be necessary to reduce the
2757  * granularity of these critical sections.
2758  */
2759
2760 /**
2761  * ram_save_setup: Setup RAM for migration
2762  *
2763  * Returns zero to indicate success and negative for error
2764  *
2765  * @f: QEMUFile where to send the data
2766  * @opaque: RAMState pointer
2767  */
2768 static int ram_save_setup(QEMUFile *f, void *opaque)
2769 {
2770     RAMState **rsp = opaque;
2771     RAMBlock *block;
2772
2773     if (compress_threads_save_setup()) {
2774         return -1;
2775     }
2776
2777     /* migration has already setup the bitmap, reuse it. */
2778     if (!migration_in_colo_state()) {
2779         if (ram_init_all(rsp) != 0) {
2780             compress_threads_save_cleanup();
2781             return -1;
2782         }
2783     }
2784     (*rsp)->f = f;
2785
2786     WITH_RCU_READ_LOCK_GUARD() {
2787         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2788
2789         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2790             qemu_put_byte(f, strlen(block->idstr));
2791             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2792             qemu_put_be64(f, block->used_length);
2793             if (migrate_postcopy_ram() && block->page_size !=
2794                                           qemu_host_page_size) {
2795                 qemu_put_be64(f, block->page_size);
2796             }
2797             if (migrate_ignore_shared()) {
2798                 qemu_put_be64(f, block->mr->addr);
2799             }
2800         }
2801     }
2802
2803     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2804     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2805
2806     multifd_send_sync_main(f);
2807     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2808     qemu_fflush(f);
2809
2810     return 0;
2811 }
2812
2813 /**
2814  * ram_save_iterate: iterative stage for migration
2815  *
2816  * Returns zero to indicate success and negative for error
2817  *
2818  * @f: QEMUFile where to send the data
2819  * @opaque: RAMState pointer
2820  */
2821 static int ram_save_iterate(QEMUFile *f, void *opaque)
2822 {
2823     RAMState **temp = opaque;
2824     RAMState *rs = *temp;
2825     int ret = 0;
2826     int i;
2827     int64_t t0;
2828     int done = 0;
2829
2830     if (blk_mig_bulk_active()) {
2831         /* Avoid transferring ram during bulk phase of block migration as
2832          * the bulk phase will usually take a long time and transferring
2833          * ram updates during that time is pointless. */
2834         goto out;
2835     }
2836
2837     WITH_RCU_READ_LOCK_GUARD() {
2838         if (ram_list.version != rs->last_version) {
2839             ram_state_reset(rs);
2840         }
2841
2842         /* Read version before ram_list.blocks */
2843         smp_rmb();
2844
2845         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2846
2847         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2848         i = 0;
2849         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2850                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2851             int pages;
2852
2853             if (qemu_file_get_error(f)) {
2854                 break;
2855             }
2856
2857             pages = ram_find_and_save_block(rs, false);
2858             /* no more pages to sent */
2859             if (pages == 0) {
2860                 done = 1;
2861                 break;
2862             }
2863
2864             if (pages < 0) {
2865                 qemu_file_set_error(f, pages);
2866                 break;
2867             }
2868
2869             rs->target_page_count += pages;
2870
2871             /*
2872              * During postcopy, it is necessary to make sure one whole host
2873              * page is sent in one chunk.
2874              */
2875             if (migrate_postcopy_ram()) {
2876                 flush_compressed_data(rs);
2877             }
2878
2879             /*
2880              * we want to check in the 1st loop, just in case it was the 1st
2881              * time and we had to sync the dirty bitmap.
2882              * qemu_clock_get_ns() is a bit expensive, so we only check each
2883              * some iterations
2884              */
2885             if ((i & 63) == 0) {
2886                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2887                               1000000;
2888                 if (t1 > MAX_WAIT) {
2889                     trace_ram_save_iterate_big_wait(t1, i);
2890                     break;
2891                 }
2892             }
2893             i++;
2894         }
2895     }
2896
2897     /*
2898      * Must occur before EOS (or any QEMUFile operation)
2899      * because of RDMA protocol.
2900      */
2901     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2902
2903 out:
2904     if (ret >= 0
2905         && migration_is_setup_or_active(migrate_get_current()->state)) {
2906         multifd_send_sync_main(rs->f);
2907         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2908         qemu_fflush(f);
2909         ram_counters.transferred += 8;
2910
2911         ret = qemu_file_get_error(f);
2912     }
2913     if (ret < 0) {
2914         return ret;
2915     }
2916
2917     return done;
2918 }
2919
2920 /**
2921  * ram_save_complete: function called to send the remaining amount of ram
2922  *
2923  * Returns zero to indicate success or negative on error
2924  *
2925  * Called with iothread lock
2926  *
2927  * @f: QEMUFile where to send the data
2928  * @opaque: RAMState pointer
2929  */
2930 static int ram_save_complete(QEMUFile *f, void *opaque)
2931 {
2932     RAMState **temp = opaque;
2933     RAMState *rs = *temp;
2934     int ret = 0;
2935
2936     WITH_RCU_READ_LOCK_GUARD() {
2937         if (!migration_in_postcopy()) {
2938             migration_bitmap_sync_precopy(rs);
2939         }
2940
2941         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2942
2943         /* try transferring iterative blocks of memory */
2944
2945         /* flush all remaining blocks regardless of rate limiting */
2946         while (true) {
2947             int pages;
2948
2949             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2950             /* no more blocks to sent */
2951             if (pages == 0) {
2952                 break;
2953             }
2954             if (pages < 0) {
2955                 ret = pages;
2956                 break;
2957             }
2958         }
2959
2960         flush_compressed_data(rs);
2961         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2962     }
2963
2964     if (ret >= 0) {
2965         multifd_send_sync_main(rs->f);
2966         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2967         qemu_fflush(f);
2968     }
2969
2970     return ret;
2971 }
2972
2973 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2974                              uint64_t *res_precopy_only,
2975                              uint64_t *res_compatible,
2976                              uint64_t *res_postcopy_only)
2977 {
2978     RAMState **temp = opaque;
2979     RAMState *rs = *temp;
2980     uint64_t remaining_size;
2981
2982     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2983
2984     if (!migration_in_postcopy() &&
2985         remaining_size < max_size) {
2986         qemu_mutex_lock_iothread();
2987         WITH_RCU_READ_LOCK_GUARD() {
2988             migration_bitmap_sync_precopy(rs);
2989         }
2990         qemu_mutex_unlock_iothread();
2991         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2992     }
2993
2994     if (migrate_postcopy_ram()) {
2995         /* We can do postcopy, and all the data is postcopiable */
2996         *res_compatible += remaining_size;
2997     } else {
2998         *res_precopy_only += remaining_size;
2999     }
3000 }
3001
3002 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3003 {
3004     unsigned int xh_len;
3005     int xh_flags;
3006     uint8_t *loaded_data;
3007
3008     /* extract RLE header */
3009     xh_flags = qemu_get_byte(f);
3010     xh_len = qemu_get_be16(f);
3011
3012     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3013         error_report("Failed to load XBZRLE page - wrong compression!");
3014         return -1;
3015     }
3016
3017     if (xh_len > TARGET_PAGE_SIZE) {
3018         error_report("Failed to load XBZRLE page - len overflow!");
3019         return -1;
3020     }
3021     loaded_data = XBZRLE.decoded_buf;
3022     /* load data and decode */
3023     /* it can change loaded_data to point to an internal buffer */
3024     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3025
3026     /* decode RLE */
3027     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3028                              TARGET_PAGE_SIZE) == -1) {
3029         error_report("Failed to load XBZRLE page - decode error!");
3030         return -1;
3031     }
3032
3033     return 0;
3034 }
3035
3036 /**
3037  * ram_block_from_stream: read a RAMBlock id from the migration stream
3038  *
3039  * Must be called from within a rcu critical section.
3040  *
3041  * Returns a pointer from within the RCU-protected ram_list.
3042  *
3043  * @f: QEMUFile where to read the data from
3044  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3045  */
3046 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3047 {
3048     static RAMBlock *block;
3049     char id[256];
3050     uint8_t len;
3051
3052     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3053         if (!block) {
3054             error_report("Ack, bad migration stream!");
3055             return NULL;
3056         }
3057         return block;
3058     }
3059
3060     len = qemu_get_byte(f);
3061     qemu_get_buffer(f, (uint8_t *)id, len);
3062     id[len] = 0;
3063
3064     block = qemu_ram_block_by_name(id);
3065     if (!block) {
3066         error_report("Can't find block %s", id);
3067         return NULL;
3068     }
3069
3070     if (ramblock_is_ignored(block)) {
3071         error_report("block %s should not be migrated !", id);
3072         return NULL;
3073     }
3074
3075     return block;
3076 }
3077
3078 static inline void *host_from_ram_block_offset(RAMBlock *block,
3079                                                ram_addr_t offset)
3080 {
3081     if (!offset_in_ramblock(block, offset)) {
3082         return NULL;
3083     }
3084
3085     return block->host + offset;
3086 }
3087
3088 static void *host_page_from_ram_block_offset(RAMBlock *block,
3089                                              ram_addr_t offset)
3090 {
3091     /* Note: Explicitly no check against offset_in_ramblock(). */
3092     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3093                                    block->page_size);
3094 }
3095
3096 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3097                                                          ram_addr_t offset)
3098 {
3099     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3100 }
3101
3102 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3103                              ram_addr_t offset, bool record_bitmap)
3104 {
3105     if (!offset_in_ramblock(block, offset)) {
3106         return NULL;
3107     }
3108     if (!block->colo_cache) {
3109         error_report("%s: colo_cache is NULL in block :%s",
3110                      __func__, block->idstr);
3111         return NULL;
3112     }
3113
3114     /*
3115     * During colo checkpoint, we need bitmap of these migrated pages.
3116     * It help us to decide which pages in ram cache should be flushed
3117     * into VM's RAM later.
3118     */
3119     if (record_bitmap &&
3120         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3121         ram_state->migration_dirty_pages++;
3122     }
3123     return block->colo_cache + offset;
3124 }
3125
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  The write is skipped when @ch is
 * zero and the range already reads as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to do if a zero fill would not change anything */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3142
3143 /* return the size after decompression, or negative value on error */
3144 static int
3145 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3146                      const uint8_t *source, size_t source_len)
3147 {
3148     int err;
3149
3150     err = inflateReset(stream);
3151     if (err != Z_OK) {
3152         return -1;
3153     }
3154
3155     stream->avail_in = source_len;
3156     stream->next_in = (uint8_t *)source;
3157     stream->avail_out = dest_len;
3158     stream->next_out = dest;
3159
3160     err = inflate(stream, Z_NO_FLUSH);
3161     if (err != Z_STREAM_END) {
3162         return -1;
3163     }
3164
3165     return stream->total_out;
3166 }
3167
3168 static void *do_data_decompress(void *opaque)
3169 {
3170     DecompressParam *param = opaque;
3171     unsigned long pagesize;
3172     uint8_t *des;
3173     int len, ret;
3174
3175     qemu_mutex_lock(&param->mutex);
3176     while (!param->quit) {
3177         if (param->des) {
3178             des = param->des;
3179             len = param->len;
3180             param->des = 0;
3181             qemu_mutex_unlock(&param->mutex);
3182
3183             pagesize = TARGET_PAGE_SIZE;
3184
3185             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3186                                        param->compbuf, len);
3187             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3188                 error_report("decompress data failed");
3189                 qemu_file_set_error(decomp_file, ret);
3190             }
3191
3192             qemu_mutex_lock(&decomp_done_lock);
3193             param->done = true;
3194             qemu_cond_signal(&decomp_done_cond);
3195             qemu_mutex_unlock(&decomp_done_lock);
3196
3197             qemu_mutex_lock(&param->mutex);
3198         } else {
3199             qemu_cond_wait(&param->cond, &param->mutex);
3200         }
3201     }
3202     qemu_mutex_unlock(&param->mutex);
3203
3204     return NULL;
3205 }
3206
3207 static int wait_for_decompress_done(void)
3208 {
3209     int idx, thread_count;
3210
3211     if (!migrate_use_compression()) {
3212         return 0;
3213     }
3214
3215     thread_count = migrate_decompress_threads();
3216     qemu_mutex_lock(&decomp_done_lock);
3217     for (idx = 0; idx < thread_count; idx++) {
3218         while (!decomp_param[idx].done) {
3219             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3220         }
3221     }
3222     qemu_mutex_unlock(&decomp_done_lock);
3223     return qemu_file_get_error(decomp_file);
3224 }
3225
3226 static void compress_threads_load_cleanup(void)
3227 {
3228     int i, thread_count;
3229
3230     if (!migrate_use_compression()) {
3231         return;
3232     }
3233     thread_count = migrate_decompress_threads();
3234     for (i = 0; i < thread_count; i++) {
3235         /*
3236          * we use it as a indicator which shows if the thread is
3237          * properly init'd or not
3238          */
3239         if (!decomp_param[i].compbuf) {
3240             break;
3241         }
3242
3243         qemu_mutex_lock(&decomp_param[i].mutex);
3244         decomp_param[i].quit = true;
3245         qemu_cond_signal(&decomp_param[i].cond);
3246         qemu_mutex_unlock(&decomp_param[i].mutex);
3247     }
3248     for (i = 0; i < thread_count; i++) {
3249         if (!decomp_param[i].compbuf) {
3250             break;
3251         }
3252
3253         qemu_thread_join(decompress_threads + i);
3254         qemu_mutex_destroy(&decomp_param[i].mutex);
3255         qemu_cond_destroy(&decomp_param[i].cond);
3256         inflateEnd(&decomp_param[i].stream);
3257         g_free(decomp_param[i].compbuf);
3258         decomp_param[i].compbuf = NULL;
3259     }
3260     g_free(decompress_threads);
3261     g_free(decomp_param);
3262     decompress_threads = NULL;
3263     decomp_param = NULL;
3264     decomp_file = NULL;
3265 }
3266
3267 static int compress_threads_load_setup(QEMUFile *f)
3268 {
3269     int i, thread_count;
3270
3271     if (!migrate_use_compression()) {
3272         return 0;
3273     }
3274
3275     thread_count = migrate_decompress_threads();
3276     decompress_threads = g_new0(QemuThread, thread_count);
3277     decomp_param = g_new0(DecompressParam, thread_count);
3278     qemu_mutex_init(&decomp_done_lock);
3279     qemu_cond_init(&decomp_done_cond);
3280     decomp_file = f;
3281     for (i = 0; i < thread_count; i++) {
3282         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3283             goto exit;
3284         }
3285
3286         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3287         qemu_mutex_init(&decomp_param[i].mutex);
3288         qemu_cond_init(&decomp_param[i].cond);
3289         decomp_param[i].done = true;
3290         decomp_param[i].quit = false;
3291         qemu_thread_create(decompress_threads + i, "decompress",
3292                            do_data_decompress, decomp_param + i,
3293                            QEMU_THREAD_JOINABLE);
3294     }
3295     return 0;
3296 exit:
3297     compress_threads_load_cleanup();
3298     return -1;
3299 }
3300
3301 static void decompress_data_with_multi_threads(QEMUFile *f,
3302                                                void *host, int len)
3303 {
3304     int idx, thread_count;
3305
3306     thread_count = migrate_decompress_threads();
3307     QEMU_LOCK_GUARD(&decomp_done_lock);
3308     while (true) {
3309         for (idx = 0; idx < thread_count; idx++) {
3310             if (decomp_param[idx].done) {
3311                 decomp_param[idx].done = false;
3312                 qemu_mutex_lock(&decomp_param[idx].mutex);
3313                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3314                 decomp_param[idx].des = host;
3315                 decomp_param[idx].len = len;
3316                 qemu_cond_signal(&decomp_param[idx].cond);
3317                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3318                 break;
3319             }
3320         }
3321         if (idx < thread_count) {
3322             break;
3323         } else {
3324             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3325         }
3326     }
3327 }
3328
3329 static void colo_init_ram_state(void)
3330 {
3331     ram_state_init(&ram_state);
3332 }
3333
3334 /*
3335  * colo cache: this is for secondary VM, we cache the whole
3336  * memory of the secondary VM, it is need to hold the global lock
3337  * to call this helper.
3338  */
3339 int colo_init_ram_cache(void)
3340 {
3341     RAMBlock *block;
3342
3343     WITH_RCU_READ_LOCK_GUARD() {
3344         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3345             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3346                                                     NULL, false, false);
3347             if (!block->colo_cache) {
3348                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3349                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3350                              block->used_length);
3351                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3352                     if (block->colo_cache) {
3353                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3354                         block->colo_cache = NULL;
3355                     }
3356                 }
3357                 return -errno;
3358             }
3359         }
3360     }
3361
3362     /*
3363     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3364     * with to decide which page in cache should be flushed into SVM's RAM. Here
3365     * we use the same name 'ram_bitmap' as for migration.
3366     */
3367     if (ram_bytes_total()) {
3368         RAMBlock *block;
3369
3370         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3372             block->bmap = bitmap_new(pages);
3373         }
3374     }
3375
3376     colo_init_ram_state();
3377     return 0;
3378 }
3379
3380 /* TODO: duplicated with ram_init_bitmaps */
3381 void colo_incoming_start_dirty_log(void)
3382 {
3383     RAMBlock *block = NULL;
3384     /* For memory_global_dirty_log_start below. */
3385     qemu_mutex_lock_iothread();
3386     qemu_mutex_lock_ramlist();
3387
3388     memory_global_dirty_log_sync();
3389     WITH_RCU_READ_LOCK_GUARD() {
3390         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3391             ramblock_sync_dirty_bitmap(ram_state, block);
3392             /* Discard this dirty bitmap record */
3393             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3394         }
3395         memory_global_dirty_log_start();
3396     }
3397     ram_state->migration_dirty_pages = 0;
3398     qemu_mutex_unlock_ramlist();
3399     qemu_mutex_unlock_iothread();
3400 }
3401
3402 /* It is need to hold the global lock to call this helper */
3403 void colo_release_ram_cache(void)
3404 {
3405     RAMBlock *block;
3406
3407     memory_global_dirty_log_stop();
3408     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3409         g_free(block->bmap);
3410         block->bmap = NULL;
3411     }
3412
3413     WITH_RCU_READ_LOCK_GUARD() {
3414         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3415             if (block->colo_cache) {
3416                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3417                 block->colo_cache = NULL;
3418             }
3419         }
3420     }
3421     ram_state_cleanup(&ram_state);
3422 }
3423
3424 /**
3425  * ram_load_setup: Setup RAM for migration incoming side
3426  *
3427  * Returns zero to indicate success and negative for error
3428  *
3429  * @f: QEMUFile where to receive the data
3430  * @opaque: RAMState pointer
3431  */
3432 static int ram_load_setup(QEMUFile *f, void *opaque)
3433 {
3434     if (compress_threads_load_setup(f)) {
3435         return -1;
3436     }
3437
3438     xbzrle_load_setup();
3439     ramblock_recv_map_init();
3440
3441     return 0;
3442 }
3443
3444 static int ram_load_cleanup(void *opaque)
3445 {
3446     RAMBlock *rb;
3447
3448     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3449         qemu_ram_block_writeback(rb);
3450     }
3451
3452     xbzrle_load_cleanup();
3453     compress_threads_load_cleanup();
3454
3455     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3456         g_free(rb->receivedmap);
3457         rb->receivedmap = NULL;
3458     }
3459
3460     return 0;
3461 }
3462
3463 /**
3464  * ram_postcopy_incoming_init: allocate postcopy data structures
3465  *
3466  * Returns 0 for success and negative if there was one error
3467  *
3468  * @mis: current migration incoming state
3469  *
3470  * Allocate data structures etc needed by incoming migration with
3471  * postcopy-ram. postcopy-ram's similarly names
3472  * postcopy_ram_incoming_init does the work.
3473  */
3474 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3475 {
3476     return postcopy_ram_incoming_init(mis);
3477 }
3478
3479 /**
3480  * ram_load_postcopy: load a page in postcopy case
3481  *
3482  * Returns 0 for success or -errno in case of error
3483  *
3484  * Called in postcopy mode by ram_load().
3485  * rcu_read_lock is taken prior to this being called.
3486  *
3487  * @f: QEMUFile where to send the data
3488  */
3489 static int ram_load_postcopy(QEMUFile *f)
3490 {
3491     int flags = 0, ret = 0;
3492     bool place_needed = false;
3493     bool matches_target_page_size = false;
3494     MigrationIncomingState *mis = migration_incoming_get_current();
3495     /* Temporary page that is later 'placed' */
3496     void *postcopy_host_page = mis->postcopy_tmp_page;
3497     void *host_page = NULL;
3498     bool all_zero = true;
3499     int target_pages = 0;
3500
3501     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3502         ram_addr_t addr;
3503         void *page_buffer = NULL;
3504         void *place_source = NULL;
3505         RAMBlock *block = NULL;
3506         uint8_t ch;
3507         int len;
3508
3509         addr = qemu_get_be64(f);
3510
3511         /*
3512          * If qemu file error, we should stop here, and then "addr"
3513          * may be invalid
3514          */
3515         ret = qemu_file_get_error(f);
3516         if (ret) {
3517             break;
3518         }
3519
3520         flags = addr & ~TARGET_PAGE_MASK;
3521         addr &= TARGET_PAGE_MASK;
3522
3523         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3524         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3525                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3526             block = ram_block_from_stream(f, flags);
3527             if (!block) {
3528                 ret = -EINVAL;
3529                 break;
3530             }
3531
3532             /*
3533              * Relying on used_length is racy and can result in false positives.
3534              * We might place pages beyond used_length in case RAM was shrunk
3535              * while in postcopy, which is fine - trying to place via
3536              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3537              */
3538             if (!block->host || addr >= block->postcopy_length) {
3539                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3540                 ret = -EINVAL;
3541                 break;
3542             }
3543             target_pages++;
3544             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3545             /*
3546              * Postcopy requires that we place whole host pages atomically;
3547              * these may be huge pages for RAMBlocks that are backed by
3548              * hugetlbfs.
3549              * To make it atomic, the data is read into a temporary page
3550              * that's moved into place later.
3551              * The migration protocol uses,  possibly smaller, target-pages
3552              * however the source ensures it always sends all the components
3553              * of a host page in one chunk.
3554              */
3555             page_buffer = postcopy_host_page +
3556                           host_page_offset_from_ram_block_offset(block, addr);
3557             /* If all TP are zero then we can optimise the place */
3558             if (target_pages == 1) {
3559                 host_page = host_page_from_ram_block_offset(block, addr);
3560             } else if (host_page != host_page_from_ram_block_offset(block,
3561                                                                     addr)) {
3562                 /* not the 1st TP within the HP */
3563                 error_report("Non-same host page %p/%p", host_page,
3564                              host_page_from_ram_block_offset(block, addr));
3565                 ret = -EINVAL;
3566                 break;
3567             }
3568
3569             /*
3570              * If it's the last part of a host page then we place the host
3571              * page
3572              */
3573             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3574                 place_needed = true;
3575             }
3576             place_source = postcopy_host_page;
3577         }
3578
3579         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3580         case RAM_SAVE_FLAG_ZERO:
3581             ch = qemu_get_byte(f);
3582             /*
3583              * Can skip to set page_buffer when
3584              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3585              */
3586             if (ch || !matches_target_page_size) {
3587                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3588             }
3589             if (ch) {
3590                 all_zero = false;
3591             }
3592             break;
3593
3594         case RAM_SAVE_FLAG_PAGE:
3595             all_zero = false;
3596             if (!matches_target_page_size) {
3597                 /* For huge pages, we always use temporary buffer */
3598                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3599             } else {
3600                 /*
3601                  * For small pages that matches target page size, we
3602                  * avoid the qemu_file copy.  Instead we directly use
3603                  * the buffer of QEMUFile to place the page.  Note: we
3604                  * cannot do any QEMUFile operation before using that
3605                  * buffer to make sure the buffer is valid when
3606                  * placing the page.
3607                  */
3608                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3609                                          TARGET_PAGE_SIZE);
3610             }
3611             break;
3612         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3613             all_zero = false;
3614             len = qemu_get_be32(f);
3615             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3616                 error_report("Invalid compressed data length: %d", len);
3617                 ret = -EINVAL;
3618                 break;
3619             }
3620             decompress_data_with_multi_threads(f, page_buffer, len);
3621             break;
3622
3623         case RAM_SAVE_FLAG_EOS:
3624             /* normal exit */
3625             multifd_recv_sync_main();
3626             break;
3627         default:
3628             error_report("Unknown combination of migration flags: 0x%x"
3629                          " (postcopy mode)", flags);
3630             ret = -EINVAL;
3631             break;
3632         }
3633
3634         /* Got the whole host page, wait for decompress before placing. */
3635         if (place_needed) {
3636             ret |= wait_for_decompress_done();
3637         }
3638
3639         /* Detect for any possible file errors */
3640         if (!ret && qemu_file_get_error(f)) {
3641             ret = qemu_file_get_error(f);
3642         }
3643
3644         if (!ret && place_needed) {
3645             if (all_zero) {
3646                 ret = postcopy_place_page_zero(mis, host_page, block);
3647             } else {
3648                 ret = postcopy_place_page(mis, host_page, place_source,
3649                                           block);
3650             }
3651             place_needed = false;
3652             target_pages = 0;
3653             /* Assume we have a zero page until we detect something different */
3654             all_zero = true;
3655         }
3656     }
3657
3658     return ret;
3659 }
3660
3661 static bool postcopy_is_advised(void)
3662 {
3663     PostcopyState ps = postcopy_state_get();
3664     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3665 }
3666
3667 static bool postcopy_is_running(void)
3668 {
3669     PostcopyState ps = postcopy_state_get();
3670     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3671 }
3672
3673 /*
3674  * Flush content of RAM cache into SVM's memory.
3675  * Only flush the pages that be dirtied by PVM or SVM or both.
3676  */
3677 void colo_flush_ram_cache(void)
3678 {
3679     RAMBlock *block = NULL;
3680     void *dst_host;
3681     void *src_host;
3682     unsigned long offset = 0;
3683
3684     memory_global_dirty_log_sync();
3685     WITH_RCU_READ_LOCK_GUARD() {
3686         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3687             ramblock_sync_dirty_bitmap(ram_state, block);
3688         }
3689     }
3690
3691     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3692     WITH_RCU_READ_LOCK_GUARD() {
3693         block = QLIST_FIRST_RCU(&ram_list.blocks);
3694
3695         while (block) {
3696             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3697
3698             if (!offset_in_ramblock(block,
3699                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3700                 offset = 0;
3701                 block = QLIST_NEXT_RCU(block, next);
3702             } else {
3703                 migration_bitmap_clear_dirty(ram_state, block, offset);
3704                 dst_host = block->host
3705                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3706                 src_host = block->colo_cache
3707                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3708                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3709             }
3710         }
3711     }
3712     trace_colo_flush_ram_cache_end();
3713 }
3714
3715 /**
3716  * ram_load_precopy: load pages in precopy case
3717  *
3718  * Returns 0 for success or -errno in case of error
3719  *
3720  * Called in precopy mode by ram_load().
3721  * rcu_read_lock is taken prior to this being called.
3722  *
3723  * @f: QEMUFile where to send the data
3724  */
3725 static int ram_load_precopy(QEMUFile *f)
3726 {
3727     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3728     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3729     bool postcopy_advised = postcopy_is_advised();
3730     if (!migrate_use_compression()) {
3731         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3732     }
3733
3734     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3735         ram_addr_t addr, total_ram_bytes;
3736         void *host = NULL, *host_bak = NULL;
3737         uint8_t ch;
3738
3739         /*
3740          * Yield periodically to let main loop run, but an iteration of
3741          * the main loop is expensive, so do it each some iterations
3742          */
3743         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3744             aio_co_schedule(qemu_get_current_aio_context(),
3745                             qemu_coroutine_self());
3746             qemu_coroutine_yield();
3747         }
3748         i++;
3749
3750         addr = qemu_get_be64(f);
3751         flags = addr & ~TARGET_PAGE_MASK;
3752         addr &= TARGET_PAGE_MASK;
3753
3754         if (flags & invalid_flags) {
3755             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3756                 error_report("Received an unexpected compressed page");
3757             }
3758
3759             ret = -EINVAL;
3760             break;
3761         }
3762
3763         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3764                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3765             RAMBlock *block = ram_block_from_stream(f, flags);
3766
3767             host = host_from_ram_block_offset(block, addr);
3768             /*
3769              * After going into COLO stage, we should not load the page
3770              * into SVM's memory directly, we put them into colo_cache firstly.
3771              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3772              * Previously, we copied all these memory in preparing stage of COLO
3773              * while we need to stop VM, which is a time-consuming process.
3774              * Here we optimize it by a trick, back-up every page while in
3775              * migration process while COLO is enabled, though it affects the
3776              * speed of the migration, but it obviously reduce the downtime of
3777              * back-up all SVM'S memory in COLO preparing stage.
3778              */
3779             if (migration_incoming_colo_enabled()) {
3780                 if (migration_incoming_in_colo_state()) {
3781                     /* In COLO stage, put all pages into cache temporarily */
3782                     host = colo_cache_from_block_offset(block, addr, true);
3783                 } else {
3784                    /*
3785                     * In migration stage but before COLO stage,
3786                     * Put all pages into both cache and SVM's memory.
3787                     */
3788                     host_bak = colo_cache_from_block_offset(block, addr, false);
3789                 }
3790             }
3791             if (!host) {
3792                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3793                 ret = -EINVAL;
3794                 break;
3795             }
3796             if (!migration_incoming_in_colo_state()) {
3797                 ramblock_recv_bitmap_set(block, host);
3798             }
3799
3800             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3801         }
3802
3803         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3804         case RAM_SAVE_FLAG_MEM_SIZE:
3805             /* Synchronize RAM block list */
3806             total_ram_bytes = addr;
3807             while (!ret && total_ram_bytes) {
3808                 RAMBlock *block;
3809                 char id[256];
3810                 ram_addr_t length;
3811
3812                 len = qemu_get_byte(f);
3813                 qemu_get_buffer(f, (uint8_t *)id, len);
3814                 id[len] = 0;
3815                 length = qemu_get_be64(f);
3816
3817                 block = qemu_ram_block_by_name(id);
3818                 if (block && !qemu_ram_is_migratable(block)) {
3819                     error_report("block %s should not be migrated !", id);
3820                     ret = -EINVAL;
3821                 } else if (block) {
3822                     if (length != block->used_length) {
3823                         Error *local_err = NULL;
3824
3825                         ret = qemu_ram_resize(block, length,
3826                                               &local_err);
3827                         if (local_err) {
3828                             error_report_err(local_err);
3829                         }
3830                     }
3831                     /* For postcopy we need to check hugepage sizes match */
3832                     if (postcopy_advised && migrate_postcopy_ram() &&
3833                         block->page_size != qemu_host_page_size) {
3834                         uint64_t remote_page_size = qemu_get_be64(f);
3835                         if (remote_page_size != block->page_size) {
3836                             error_report("Mismatched RAM page size %s "
3837                                          "(local) %zd != %" PRId64,
3838                                          id, block->page_size,
3839                                          remote_page_size);
3840                             ret = -EINVAL;
3841                         }
3842                     }
3843                     if (migrate_ignore_shared()) {
3844                         hwaddr addr = qemu_get_be64(f);
3845                         if (ramblock_is_ignored(block) &&
3846                             block->mr->addr != addr) {
3847                             error_report("Mismatched GPAs for block %s "
3848                                          "%" PRId64 "!= %" PRId64,
3849                                          id, (uint64_t)addr,
3850                                          (uint64_t)block->mr->addr);
3851                             ret = -EINVAL;
3852                         }
3853                     }
3854                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3855                                           block->idstr);
3856                 } else {
3857                     error_report("Unknown ramblock \"%s\", cannot "
3858                                  "accept migration", id);
3859                     ret = -EINVAL;
3860                 }
3861
3862                 total_ram_bytes -= length;
3863             }
3864             break;
3865
3866         case RAM_SAVE_FLAG_ZERO:
3867             ch = qemu_get_byte(f);
3868             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3869             break;
3870
3871         case RAM_SAVE_FLAG_PAGE:
3872             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3873             break;
3874
3875         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3876             len = qemu_get_be32(f);
3877             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3878                 error_report("Invalid compressed data length: %d", len);
3879                 ret = -EINVAL;
3880                 break;
3881             }
3882             decompress_data_with_multi_threads(f, host, len);
3883             break;
3884
3885         case RAM_SAVE_FLAG_XBZRLE:
3886             if (load_xbzrle(f, addr, host) < 0) {
3887                 error_report("Failed to decompress XBZRLE page at "
3888                              RAM_ADDR_FMT, addr);
3889                 ret = -EINVAL;
3890                 break;
3891             }
3892             break;
3893         case RAM_SAVE_FLAG_EOS:
3894             /* normal exit */
3895             multifd_recv_sync_main();
3896             break;
3897         default:
3898             if (flags & RAM_SAVE_FLAG_HOOK) {
3899                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3900             } else {
3901                 error_report("Unknown combination of migration flags: 0x%x",
3902                              flags);
3903                 ret = -EINVAL;
3904             }
3905         }
3906         if (!ret) {
3907             ret = qemu_file_get_error(f);
3908         }
3909         if (!ret && host_bak) {
3910             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3911         }
3912     }
3913
3914     ret |= wait_for_decompress_done();
3915     return ret;
3916 }
3917
3918 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3919 {
3920     int ret = 0;
3921     static uint64_t seq_iter;
3922     /*
3923      * If system is running in postcopy mode, page inserts to host memory must
3924      * be atomic
3925      */
3926     bool postcopy_running = postcopy_is_running();
3927
3928     seq_iter++;
3929
3930     if (version_id != 4) {
3931         return -EINVAL;
3932     }
3933
3934     /*
3935      * This RCU critical section can be very long running.
3936      * When RCU reclaims in the code start to become numerous,
3937      * it will be necessary to reduce the granularity of this
3938      * critical section.
3939      */
3940     WITH_RCU_READ_LOCK_GUARD() {
3941         if (postcopy_running) {
3942             ret = ram_load_postcopy(f);
3943         } else {
3944             ret = ram_load_precopy(f);
3945         }
3946     }
3947     trace_ram_load_complete(ret, seq_iter);
3948
3949     return ret;
3950 }
3951
3952 static bool ram_has_postcopy(void *opaque)
3953 {
3954     RAMBlock *rb;
3955     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3956         if (ramblock_is_pmem(rb)) {
3957             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3958                          "is not supported now!", rb->idstr, rb->host);
3959             return false;
3960         }
3961     }
3962
3963     return migrate_postcopy_ram();
3964 }
3965
3966 /* Sync all the dirty bitmap with destination VM.  */
3967 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3968 {
3969     RAMBlock *block;
3970     QEMUFile *file = s->to_dst_file;
3971     int ramblock_count = 0;
3972
3973     trace_ram_dirty_bitmap_sync_start();
3974
3975     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3976         qemu_savevm_send_recv_bitmap(file, block->idstr);
3977         trace_ram_dirty_bitmap_request(block->idstr);
3978         ramblock_count++;
3979     }
3980
3981     trace_ram_dirty_bitmap_sync_wait();
3982
3983     /* Wait until all the ramblocks' dirty bitmap synced */
3984     while (ramblock_count--) {
3985         qemu_sem_wait(&s->rp_state.rp_sem);
3986     }
3987
3988     trace_ram_dirty_bitmap_sync_complete();
3989
3990     return 0;
3991 }
3992
/*
 * Post rp_sem once. ram_dirty_bitmap_sync_all() waits on this semaphore
 * once for every ramblock whose bitmap it requested.
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
3997
3998 /*
3999  * Read the received bitmap, revert it as the initial dirty bitmap.
4000  * This is only used when the postcopy migration is paused but wants
4001  * to resume from a middle point.
4002  */
4003 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4004 {
4005     int ret = -EINVAL;
4006     QEMUFile *file = s->rp_state.from_dst_file;
4007     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4008     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4009     uint64_t size, end_mark;
4010
4011     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4012
4013     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4014         error_report("%s: incorrect state %s", __func__,
4015                      MigrationStatus_str(s->state));
4016         return -EINVAL;
4017     }
4018
4019     /*
4020      * Note: see comments in ramblock_recv_bitmap_send() on why we
4021      * need the endianness conversion, and the paddings.
4022      */
4023     local_size = ROUND_UP(local_size, 8);
4024
4025     /* Add paddings */
4026     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4027
4028     size = qemu_get_be64(file);
4029
4030     /* The size of the bitmap should match with our ramblock */
4031     if (size != local_size) {
4032         error_report("%s: ramblock '%s' bitmap size mismatch "
4033                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4034                      block->idstr, size, local_size);
4035         ret = -EINVAL;
4036         goto out;
4037     }
4038
4039     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4040     end_mark = qemu_get_be64(file);
4041
4042     ret = qemu_file_get_error(file);
4043     if (ret || size != local_size) {
4044         error_report("%s: read bitmap failed for ramblock '%s': %d"
4045                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4046                      __func__, block->idstr, ret, local_size, size);
4047         ret = -EIO;
4048         goto out;
4049     }
4050
4051     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4052         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4053                      __func__, block->idstr, end_mark);
4054         ret = -EINVAL;
4055         goto out;
4056     }
4057
4058     /*
4059      * Endianness conversion. We are during postcopy (though paused).
4060      * The dirty bitmap won't change. We can directly modify it.
4061      */
4062     bitmap_from_le(block->bmap, le_bitmap, nbits);
4063
4064     /*
4065      * What we received is "received bitmap". Revert it as the initial
4066      * dirty bitmap for this ramblock.
4067      */
4068     bitmap_complement(block->bmap, block->bmap, nbits);
4069
4070     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4071
4072     /*
4073      * We succeeded to sync bitmap for current ramblock. If this is
4074      * the last one to sync, we need to notify the main send thread.
4075      */
4076     ram_dirty_bitmap_reload_notify(s);
4077
4078     ret = 0;
4079 out:
4080     g_free(le_bitmap);
4081     return ret;
4082 }
4083
4084 static int ram_resume_prepare(MigrationState *s, void *opaque)
4085 {
4086     RAMState *rs = *(RAMState **)opaque;
4087     int ret;
4088
4089     ret = ram_dirty_bitmap_sync_all(s, rs);
4090     if (ret) {
4091         return ret;
4092     }
4093
4094     ram_state_resume_prepare(rs, s->to_dst_file);
4095
4096     return 0;
4097 }
4098
4099 static SaveVMHandlers savevm_ram_handlers = {
4100     .save_setup = ram_save_setup,
4101     .save_live_iterate = ram_save_iterate,
4102     .save_live_complete_postcopy = ram_save_complete,
4103     .save_live_complete_precopy = ram_save_complete,
4104     .has_postcopy = ram_has_postcopy,
4105     .save_live_pending = ram_save_pending,
4106     .load_state = ram_load,
4107     .save_cleanup = ram_save_cleanup,
4108     .load_setup = ram_load_setup,
4109     .load_cleanup = ram_load_cleanup,
4110     .resume_prepare = ram_resume_prepare,
4111 };
4112
4113 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4114                                       size_t old_size, size_t new_size)
4115 {
4116     PostcopyState ps = postcopy_state_get();
4117     ram_addr_t offset;
4118     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4119     Error *err = NULL;
4120
4121     if (ramblock_is_ignored(rb)) {
4122         return;
4123     }
4124
4125     if (!migration_is_idle()) {
4126         /*
4127          * Precopy code on the source cannot deal with the size of RAM blocks
4128          * changing at random points in time - especially after sending the
4129          * RAM block sizes in the migration stream, they must no longer change.
4130          * Abort and indicate a proper reason.
4131          */
4132         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4133         migrate_set_error(migrate_get_current(), err);
4134         error_free(err);
4135         migration_cancel();
4136     }
4137
4138     switch (ps) {
4139     case POSTCOPY_INCOMING_ADVISE:
4140         /*
4141          * Update what ram_postcopy_incoming_init()->init_range() does at the
4142          * time postcopy was advised. Syncing RAM blocks with the source will
4143          * result in RAM resizes.
4144          */
4145         if (old_size < new_size) {
4146             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4147                 error_report("RAM block '%s' discard of resized RAM failed",
4148                              rb->idstr);
4149             }
4150         }
4151         rb->postcopy_length = new_size;
4152         break;
4153     case POSTCOPY_INCOMING_NONE:
4154     case POSTCOPY_INCOMING_RUNNING:
4155     case POSTCOPY_INCOMING_END:
4156         /*
4157          * Once our guest is running, postcopy does no longer care about
4158          * resizes. When growing, the new memory was not available on the
4159          * source, no handler needed.
4160          */
4161         break;
4162     default:
4163         error_report("RAM block '%s' resized during postcopy state: %d",
4164                      rb->idstr, ps);
4165         exit(-1);
4166     }
4167 }
4168
/*
 * RAM block notifier added by ram_mig_init(); only the resize callback is
 * set.
 */
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
4172
/*
 * Set up RAM migration: initialize the XBZRLE lock, register the "ram"
 * section with savevm_ram_handlers (passing &ram_state as the opaque), and
 * add the RAM block resize notifier.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    /* NOTE(review): the 4 is presumably the version; ram_load() accepts only 4 */
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}