migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/main-loop.h"
  34 #include "xbzrle.h"
  35 #include "ram.h"
  36 #include "migration.h"
  37 #include "migration/register.h"
  38 #include "migration/misc.h"
  39 #include "qemu-file.h"
  40 #include "postcopy-ram.h"
  41 #include "page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "qapi/error.h"
  44 #include "qapi/qapi-types-migration.h"
  45 #include "qapi/qapi-events-migration.h"
  46 #include "qapi/qmp/qerror.h"
  47 #include "trace.h"
  48 #include "exec/ram_addr.h"
  49 #include "exec/target_page.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52 #include "block.h"
  53 #include "sysemu/cpu-throttle.h"
  54 #include "savevm.h"
  55 #include "qemu/iov.h"
  56 #include "multifd.h"
  57 #include "sysemu/runstate.h"
  58
  59 #if defined(__linux__)
  60 #include "qemu/userfaultfd.h"
  61 #endif /* defined(__linux__) */
  62
  63 /***********************************************************/
  64 /* ram save/restore */
  65
  66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  67  * worked for pages that where filled with the same char.  We switched
  68  * it to only search for the zero value.  And to avoid confusion with
  69  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  70  */
  71
  72 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  73 #define RAM_SAVE_FLAG_ZERO     0x02
  74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  75 #define RAM_SAVE_FLAG_PAGE     0x08
  76 #define RAM_SAVE_FLAG_EOS      0x10
  77 #define RAM_SAVE_FLAG_CONTINUE 0x20
  78 #define RAM_SAVE_FLAG_XBZRLE   0x40
  79 /* 0x80 is reserved in migration.h start with 0x100 next */
  80 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  81
  82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  83 {
  84     return buffer_is_zero(p, size);
  85 }
  86
  87 XBZRLECacheStats xbzrle_counters;
  88
  89 /* struct contains XBZRLE cache and a static page
  90    used by the compression */
  91 static struct {
  92     /* buffer used for XBZRLE encoding */
  93     uint8_t *encoded_buf;
  94     /* buffer for storing page content */
  95     uint8_t *current_buf;
  96     /* Cache for XBZRLE, Protected by lock. */
  97     PageCache *cache;
  98     QemuMutex lock;
  99     /* it will store a page full of zeros */
 100     uint8_t *zero_target_page;
 101     /* buffer used for XBZRLE decoding */
 102     uint8_t *decoded_buf;
 103 } XBZRLE;
 104
 105 static void XBZRLE_cache_lock(void)
 106 {
 107     if (migrate_use_xbzrle()) {
 108         qemu_mutex_lock(&XBZRLE.lock);
 109     }
 110 }
 111
 112 static void XBZRLE_cache_unlock(void)
 113 {
 114     if (migrate_use_xbzrle()) {
 115         qemu_mutex_unlock(&XBZRLE.lock);
 116     }
 117 }
 118
 119 /**
 120  * xbzrle_cache_resize: resize the xbzrle cache
 121  *
 122  * This function is called from migrate_params_apply in main
 123  * thread, possibly while a migration is in progress.  A running
 124  * migration may be using the cache and might finish during this call,
 125  * hence changes to the cache are protected by XBZRLE.lock().
 126  *
 127  * Returns 0 for success or -1 for error
 128  *
 129  * @new_size: new cache size
 130  * @errp: set *errp if the check failed, with reason
 131  */
 132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 133 {
 134     PageCache *new_cache;
 135     int64_t ret = 0;
 136
 137     /* Check for truncation */
 138     if (new_size != (size_t)new_size) {
 139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 140                    "exceeding address space");
 141         return -1;
 142     }
 143
 144     if (new_size == migrate_xbzrle_cache_size()) {
 145         /* nothing to do */
 146         return 0;
 147     }
 148
 149     XBZRLE_cache_lock();
 150
 151     if (XBZRLE.cache != NULL) {
 152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 153         if (!new_cache) {
 154             ret = -1;
 155             goto out;
 156         }
 157
 158         cache_fini(XBZRLE.cache);
 159         XBZRLE.cache = new_cache;
 160     }
 161 out:
 162     XBZRLE_cache_unlock();
 163     return ret;
 164 }
 165
 166 bool ramblock_is_ignored(RAMBlock *block)
 167 {
 168     return !qemu_ram_is_migratable(block) ||
 169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 170 }
 171
 172 #undef RAMBLOCK_FOREACH
 173
 174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 175 {
 176     RAMBlock *block;
 177     int ret = 0;
 178
 179     RCU_READ_LOCK_GUARD();
 180
 181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 182         ret = func(block, opaque);
 183         if (ret) {
 184             break;
 185         }
 186     }
 187     return ret;
 188 }
 189
 190 static void ramblock_recv_map_init(void)
 191 {
 192     RAMBlock *rb;
 193
 194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 195         assert(!rb->receivedmap);
 196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 197     }
 198 }
 199
 200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 201 {
 202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 203                     rb->receivedmap);
 204 }
 205
 206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 207 {
 208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 209 }
 210
 211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 212 {
 213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 217                                     size_t nr)
 218 {
 219     bitmap_set_atomic(rb->receivedmap,
 220                       ramblock_recv_bitmap_offset(host_addr, rb),
 221                       nr);
 222 }
 223
 224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 225
 226 /*
 227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 228  *
 229  * Returns >0 if success with sent bytes, or <0 if error.
 230  */
 231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 232                                   const char *block_name)
 233 {
 234     RAMBlock *block = qemu_ram_block_by_name(block_name);
 235     unsigned long *le_bitmap, nbits;
 236     uint64_t size;
 237
 238     if (!block) {
 239         error_report("%s: invalid block name: %s", __func__, block_name);
 240         return -1;
 241     }
 242
 243     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 244
 245     /*
 246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 247      * machines we may need 4 more bytes for padding (see below
 248      * comment). So extend it a bit before hand.
 249      */
 250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 251
 252     /*
 253      * Always use little endian when sending the bitmap. This is
 254      * required that when source and destination VMs are not using the
 255      * same endianness. (Note: big endian won't work.)
 256      */
 257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 258
 259     /* Size of the bitmap, in bytes */
 260     size = DIV_ROUND_UP(nbits, 8);
 261
 262     /*
 263      * size is always aligned to 8 bytes for 64bit machines, but it
 264      * may not be true for 32bit machines. We need this padding to
 265      * make sure the migration can survive even between 32bit and
 266      * 64bit machines.
 267      */
 268     size = ROUND_UP(size, 8);
 269
 270     qemu_put_be64(file, size);
 271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 272     /*
 273      * Mark as an end, in case the middle part is screwed up due to
 274      * some "mysterious" reason.
 275      */
 276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 277     qemu_fflush(file);
 278
 279     g_free(le_bitmap);
 280
 281     if (qemu_file_get_error(file)) {
 282         return qemu_file_get_error(file);
 283     }
 284
 285     return size + sizeof(size);
 286 }
 287
 288 /*
 289  * An outstanding page request, on the source, having been received
 290  * and queued
 291  */
 292 struct RAMSrcPageRequest {
 293     RAMBlock *rb;
 294     hwaddr    offset;
 295     hwaddr    len;
 296
 297     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 298 };
 299
 300 /* State of RAM for migration */
 301 struct RAMState {
 302     /* QEMUFile used for this migration */
 303     QEMUFile *f;
 304     /* UFFD file descriptor, used in 'write-tracking' migration */
 305     int uffdio_fd;
 306     /* Last block that we have visited searching for dirty pages */
 307     RAMBlock *last_seen_block;
 308     /* Last block from where we have sent data */
 309     RAMBlock *last_sent_block;
 310     /* Last dirty target page we have sent */
 311     ram_addr_t last_page;
 312     /* last ram version we have seen */
 313     uint32_t last_version;
 314     /* How many times we have dirty too many pages */
 315     int dirty_rate_high_cnt;
 316     /* these variables are used for bitmap sync */
 317     /* last time we did a full bitmap_sync */
 318     int64_t time_last_bitmap_sync;
 319     /* bytes transferred at start_time */
 320     uint64_t bytes_xfer_prev;
 321     /* number of dirty pages since start_time */
 322     uint64_t num_dirty_pages_period;
 323     /* xbzrle misses since the beginning of the period */
 324     uint64_t xbzrle_cache_miss_prev;
 325     /* Amount of xbzrle pages since the beginning of the period */
 326     uint64_t xbzrle_pages_prev;
 327     /* Amount of xbzrle encoded bytes since the beginning of the period */
 328     uint64_t xbzrle_bytes_prev;
 329     /* Start using XBZRLE (e.g., after the first round). */
 330     bool xbzrle_enabled;
 331
 332     /* compression statistics since the beginning of the period */
 333     /* amount of count that no free thread to compress data */
 334     uint64_t compress_thread_busy_prev;
 335     /* amount bytes after compression */
 336     uint64_t compressed_size_prev;
 337     /* amount of compressed pages */
 338     uint64_t compress_pages_prev;
 339
 340     /* total handled target pages at the beginning of period */
 341     uint64_t target_page_count_prev;
 342     /* total handled target pages since start */
 343     uint64_t target_page_count;
 344     /* number of dirty bits in the bitmap */
 345     uint64_t migration_dirty_pages;
 346     /* Protects modification of the bitmap and migration dirty pages */
 347     QemuMutex bitmap_mutex;
 348     /* The RAMBlock used in the last src_page_requests */
 349     RAMBlock *last_req_rb;
 350     /* Queue of outstanding page requests from the destination */
 351     QemuMutex src_page_req_mutex;
 352     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 353 };
 354 typedef struct RAMState RAMState;
 355
 356 static RAMState *ram_state;
 357
 358 static NotifierWithReturnList precopy_notifier_list;
 359
 360 void precopy_infrastructure_init(void)
 361 {
 362     notifier_with_return_list_init(&precopy_notifier_list);
 363 }
 364
 365 void precopy_add_notifier(NotifierWithReturn *n)
 366 {
 367     notifier_with_return_list_add(&precopy_notifier_list, n);
 368 }
 369
 370 void precopy_remove_notifier(NotifierWithReturn *n)
 371 {
 372     notifier_with_return_remove(n);
 373 }
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 uint64_t ram_bytes_remaining(void)
 385 {
 386     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 387                        0;
 388 }
 389
 390 MigrationStats ram_counters;
 391
 392 /* used by the search for pages to send */
 393 struct PageSearchStatus {
 394     /* Current block being searched */
 395     RAMBlock    *block;
 396     /* Current page to search from */
 397     unsigned long page;
 398     /* Set once we wrap around */
 399     bool         complete_round;
 400 };
 401 typedef struct PageSearchStatus PageSearchStatus;
 402
 403 CompressionStats compression_counters;
 404
 405 struct CompressParam {
 406     bool done;
 407     bool quit;
 408     bool zero_page;
 409     QEMUFile *file;
 410     QemuMutex mutex;
 411     QemuCond cond;
 412     RAMBlock *block;
 413     ram_addr_t offset;
 414
 415     /* internally used fields */
 416     z_stream stream;
 417     uint8_t *originbuf;
 418 };
 419 typedef struct CompressParam CompressParam;
 420
 421 struct DecompressParam {
 422     bool done;
 423     bool quit;
 424     QemuMutex mutex;
 425     QemuCond cond;
 426     void *des;
 427     uint8_t *compbuf;
 428     int len;
 429     z_stream stream;
 430 };
 431 typedef struct DecompressParam DecompressParam;
 432
 433 static CompressParam *comp_param;
 434 static QemuThread *compress_threads;
 435 /* comp_done_cond is used to wake up the migration thread when
 436  * one of the compression threads has finished the compression.
 437  * comp_done_lock is used to co-work with comp_done_cond.
 438  */
 439 static QemuMutex comp_done_lock;
 440 static QemuCond comp_done_cond;
 441 /* The empty QEMUFileOps will be used by file in CompressParam */
 442 static const QEMUFileOps empty_ops = { };
 443
 444 static QEMUFile *decomp_file;
 445 static DecompressParam *decomp_param;
 446 static QemuThread *decompress_threads;
 447 static QemuMutex decomp_done_lock;
 448 static QemuCond decomp_done_cond;
 449
 450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 451                                  ram_addr_t offset, uint8_t *source_buf);
 452
 453 static void *do_data_compress(void *opaque)
 454 {
 455     CompressParam *param = opaque;
 456     RAMBlock *block;
 457     ram_addr_t offset;
 458     bool zero_page;
 459
 460     qemu_mutex_lock(&param->mutex);
 461     while (!param->quit) {
 462         if (param->block) {
 463             block = param->block;
 464             offset = param->offset;
 465             param->block = NULL;
 466             qemu_mutex_unlock(&param->mutex);
 467
 468             zero_page = do_compress_ram_page(param->file, &param->stream,
 469                                              block, offset, param->originbuf);
 470
 471             qemu_mutex_lock(&comp_done_lock);
 472             param->done = true;
 473             param->zero_page = zero_page;
 474             qemu_cond_signal(&comp_done_cond);
 475             qemu_mutex_unlock(&comp_done_lock);
 476
 477             qemu_mutex_lock(&param->mutex);
 478         } else {
 479             qemu_cond_wait(&param->cond, &param->mutex);
 480         }
 481     }
 482     qemu_mutex_unlock(&param->mutex);
 483
 484     return NULL;
 485 }
 486
 487 static void compress_threads_save_cleanup(void)
 488 {
 489     int i, thread_count;
 490
 491     if (!migrate_use_compression() || !comp_param) {
 492         return;
 493     }
 494
 495     thread_count = migrate_compress_threads();
 496     for (i = 0; i < thread_count; i++) {
 497         /*
 498          * we use it as a indicator which shows if the thread is
 499          * properly init'd or not
 500          */
 501         if (!comp_param[i].file) {
 502             break;
 503         }
 504
 505         qemu_mutex_lock(&comp_param[i].mutex);
 506         comp_param[i].quit = true;
 507         qemu_cond_signal(&comp_param[i].cond);
 508         qemu_mutex_unlock(&comp_param[i].mutex);
 509
 510         qemu_thread_join(compress_threads + i);
 511         qemu_mutex_destroy(&comp_param[i].mutex);
 512         qemu_cond_destroy(&comp_param[i].cond);
 513         deflateEnd(&comp_param[i].stream);
 514         g_free(comp_param[i].originbuf);
 515         qemu_fclose(comp_param[i].file);
 516         comp_param[i].file = NULL;
 517     }
 518     qemu_mutex_destroy(&comp_done_lock);
 519     qemu_cond_destroy(&comp_done_cond);
 520     g_free(compress_threads);
 521     g_free(comp_param);
 522     compress_threads = NULL;
 523     comp_param = NULL;
 524 }
 525
 526 static int compress_threads_save_setup(void)
 527 {
 528     int i, thread_count;
 529
 530     if (!migrate_use_compression()) {
 531         return 0;
 532     }
 533     thread_count = migrate_compress_threads();
 534     compress_threads = g_new0(QemuThread, thread_count);
 535     comp_param = g_new0(CompressParam, thread_count);
 536     qemu_cond_init(&comp_done_cond);
 537     qemu_mutex_init(&comp_done_lock);
 538     for (i = 0; i < thread_count; i++) {
 539         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 540         if (!comp_param[i].originbuf) {
 541             goto exit;
 542         }
 543
 544         if (deflateInit(&comp_param[i].stream,
 545                         migrate_compress_level()) != Z_OK) {
 546             g_free(comp_param[i].originbuf);
 547             goto exit;
 548         }
 549
 550         /* comp_param[i].file is just used as a dummy buffer to save data,
 551          * set its ops to empty.
 552          */
 553         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 554         comp_param[i].done = true;
 555         comp_param[i].quit = false;
 556         qemu_mutex_init(&comp_param[i].mutex);
 557         qemu_cond_init(&comp_param[i].cond);
 558         qemu_thread_create(compress_threads + i, "compress",
 559                            do_data_compress, comp_param + i,
 560                            QEMU_THREAD_JOINABLE);
 561     }
 562     return 0;
 563
 564 exit:
 565     compress_threads_save_cleanup();
 566     return -1;
 567 }
 568
 569 /**
 570  * save_page_header: write page header to wire
 571  *
 572  * If this is the 1st block, it also writes the block identification
 573  *
 574  * Returns the number of bytes written
 575  *
 576  * @f: QEMUFile where to send the data
 577  * @block: block that contains the page we want to send
 578  * @offset: offset inside the block for the page
 579  *          in the lower bits, it contains flags
 580  */
 581 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 582                                ram_addr_t offset)
 583 {
 584     size_t size, len;
 585
 586     if (block == rs->last_sent_block) {
 587         offset |= RAM_SAVE_FLAG_CONTINUE;
 588     }
 589     qemu_put_be64(f, offset);
 590     size = 8;
 591
 592     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 593         len = strlen(block->idstr);
 594         qemu_put_byte(f, len);
 595         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 596         size += 1 + len;
 597         rs->last_sent_block = block;
 598     }
 599     return size;
 600 }
 601
 602 /**
 603  * mig_throttle_guest_down: throttle down the guest
 604  *
 605  * Reduce amount of guest cpu execution to hopefully slow down memory
 606  * writes. If guest dirty memory rate is reduced below the rate at
 607  * which we can transfer pages to the destination then we should be
 608  * able to complete migration. Some workloads dirty memory way too
 609  * fast and will not effectively converge, even with auto-converge.
 610  */
 611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 612                                     uint64_t bytes_dirty_threshold)
 613 {
 614     MigrationState *s = migrate_get_current();
 615     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 616     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 617     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 618     int pct_max = s->parameters.max_cpu_throttle;
 619
 620     uint64_t throttle_now = cpu_throttle_get_percentage();
 621     uint64_t cpu_now, cpu_ideal, throttle_inc;
 622
 623     /* We have not started throttling yet. Let's start it. */
 624     if (!cpu_throttle_active()) {
 625         cpu_throttle_set(pct_initial);
 626     } else {
 627         /* Throttling already on, just increase the rate */
 628         if (!pct_tailslow) {
 629             throttle_inc = pct_increment;
 630         } else {
 631             /* Compute the ideal CPU percentage used by Guest, which may
 632              * make the dirty rate match the dirty rate threshold. */
 633             cpu_now = 100 - throttle_now;
 634             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 635                         bytes_dirty_period);
 636             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 637         }
 638         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 639     }
 640 }
 641
 642 /**
 643  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 644  *
 645  * @rs: current RAM state
 646  * @current_addr: address for the zero page
 647  *
 648  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 649  * The important thing is that a stale (not-yet-0'd) page be replaced
 650  * by the new data.
 651  * As a bonus, if the page wasn't in the cache it gets added so that
 652  * when a small write is made into the 0'd page it gets XBZRLE sent.
 653  */
 654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 655 {
 656     if (!rs->xbzrle_enabled) {
 657         return;
 658     }
 659
 660     /* We don't care if this fails to allocate a new cache page
 661      * as long as it updated an old one */
 662     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 663                  ram_counters.dirty_sync_count);
 664 }
 665
 666 #define ENCODING_FLAG_XBZRLE 0x1
 667
 668 /**
 669  * save_xbzrle_page: compress and send current page
 670  *
 671  * Returns: 1 means that we wrote the page
 672  *          0 means that page is identical to the one already sent
 673  *          -1 means that xbzrle would be longer than normal
 674  *
 675  * @rs: current RAM state
 676  * @current_data: pointer to the address of the page contents
 677  * @current_addr: addr of the page
 678  * @block: block that contains the page we want to send
 679  * @offset: offset inside the block for the page
 680  * @last_stage: if we are at the completion stage
 681  */
 682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 683                             ram_addr_t current_addr, RAMBlock *block,
 684                             ram_addr_t offset, bool last_stage)
 685 {
 686     int encoded_len = 0, bytes_xbzrle;
 687     uint8_t *prev_cached_page;
 688
 689     if (!cache_is_cached(XBZRLE.cache, current_addr,
 690                          ram_counters.dirty_sync_count)) {
 691         xbzrle_counters.cache_miss++;
 692         if (!last_stage) {
 693             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 694                              ram_counters.dirty_sync_count) == -1) {
 695                 return -1;
 696             } else {
 697                 /* update *current_data when the page has been
 698                    inserted into cache */
 699                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 700             }
 701         }
 702         return -1;
 703     }
 704
 705     /*
 706      * Reaching here means the page has hit the xbzrle cache, no matter what
 707      * encoding result it is (normal encoding, overflow or skipping the page),
 708      * count the page as encoded. This is used to calculate the encoding rate.
 709      *
 710      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 711      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 712      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 713      * skipped page included. In this way, the encoding rate can tell if the
 714      * guest page is good for xbzrle encoding.
 715      */
 716     xbzrle_counters.pages++;
 717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 718
 719     /* save current buffer into memory */
 720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 721
 722     /* XBZRLE encoding (if there is no overflow) */
 723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 725                                        TARGET_PAGE_SIZE);
 726
 727     /*
 728      * Update the cache contents, so that it corresponds to the data
 729      * sent, in all cases except where we skip the page.
 730      */
 731     if (!last_stage && encoded_len != 0) {
 732         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 733         /*
 734          * In the case where we couldn't compress, ensure that the caller
 735          * sends the data from the cache, since the guest might have
 736          * changed the RAM since we copied it.
 737          */
 738         *current_data = prev_cached_page;
 739     }
 740
 741     if (encoded_len == 0) {
 742         trace_save_xbzrle_page_skipping();
 743         return 0;
 744     } else if (encoded_len == -1) {
 745         trace_save_xbzrle_page_overflow();
 746         xbzrle_counters.overflow++;
 747         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 748         return -1;
 749     }
 750
 751     /* Send XBZRLE based compressed page */
 752     bytes_xbzrle = save_page_header(rs, rs->f, block,
 753                                     offset | RAM_SAVE_FLAG_XBZRLE);
 754     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 755     qemu_put_be16(rs->f, encoded_len);
 756     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 757     bytes_xbzrle += encoded_len + 1 + 2;
 758     /*
 759      * Like compressed_size (please see update_compress_thread_counts),
 760      * the xbzrle encoded bytes don't count the 8 byte header with
 761      * RAM_SAVE_FLAG_CONTINUE.
 762      */
 763     xbzrle_counters.bytes += bytes_xbzrle - 8;
 764     ram_counters.transferred += bytes_xbzrle;
 765
 766     return 1;
 767 }
 768
 769 /**
 770  * migration_bitmap_find_dirty: find the next dirty page from start
 771  *
 772  * Returns the page offset within memory region of the start of a dirty page
 773  *
 774  * @rs: current RAM state
 775  * @rb: RAMBlock where to search for dirty pages
 776  * @start: page where we start the search
 777  */
 778 static inline
 779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 780                                           unsigned long start)
 781 {
 782     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 783     unsigned long *bitmap = rb->bmap;
 784
 785     if (ramblock_is_ignored(rb)) {
 786         return size;
 787     }
 788
 789     return find_next_bit(bitmap, size, start);
 790 }
 791
 792 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 793                                                        unsigned long page)
 794 {
 795     uint8_t shift;
 796     hwaddr size, start;
 797
 798     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 799         return;
 800     }
 801
 802     shift = rb->clear_bmap_shift;
 803     /*
 804      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 805      * can make things easier sometimes since then start address
 806      * of the small chunk will always be 64 pages aligned so the
 807      * bitmap will always be aligned to unsigned long. We should
 808      * even be able to remove this restriction but I'm simply
 809      * keeping it.
 810      */
 811     assert(shift >= 6);
 812
 813     size = 1ULL << (TARGET_PAGE_BITS + shift);
 814     start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 815     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 816     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 817 }
 818
 819 static void
 820 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 821                                                  unsigned long start,
 822                                                  unsigned long npages)
 823 {
 824     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 825     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 826     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 827
 828     /*
 829      * Clear pages from start to start + npages - 1, so the end boundary is
 830      * exclusive.
 831      */
 832     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 833         migration_clear_memory_region_dirty_bitmap(rb, i);
 834     }
 835 }
 836
 837 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 838                                                 RAMBlock *rb,
 839                                                 unsigned long page)
 840 {
 841     bool ret;
 842
 843     /*
 844      * Clear dirty bitmap if needed.  This _must_ be called before we
 845      * send any of the page in the chunk because we need to make sure
 846      * we can capture further page content changes when we sync dirty
 847      * log the next time.  So as long as we are going to send any of
 848      * the page in the chunk we clear the remote dirty bitmap for all.
 849      * Clearing it earlier won't be a problem, but too late will.
 850      */
 851     migration_clear_memory_region_dirty_bitmap(rb, page);
 852
 853     ret = test_and_clear_bit(page, rb->bmap);
 854     if (ret) {
 855         rs->migration_dirty_pages--;
 856     }
 857
 858     return ret;
 859 }
 860
 861 /* Called with RCU critical section */
 862 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 863 {
 864     uint64_t new_dirty_pages =
 865         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 866
 867     rs->migration_dirty_pages += new_dirty_pages;
 868     rs->num_dirty_pages_period += new_dirty_pages;
 869 }
 870
 871 /**
 872  * ram_pagesize_summary: calculate all the pagesizes of a VM
 873  *
 874  * Returns a summary bitmap of the page sizes of all RAMBlocks
 875  *
 876  * For VMs with just normal pages this is equivalent to the host page
 877  * size. If it's got some huge pages then it's the OR of all the
 878  * different page sizes.
 879  */
 880 uint64_t ram_pagesize_summary(void)
 881 {
 882     RAMBlock *block;
 883     uint64_t summary = 0;
 884
 885     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 886         summary |= block->page_size;
 887     }
 888
 889     return summary;
 890 }
 891
 892 uint64_t ram_get_total_transferred_pages(void)
 893 {
 894     return  ram_counters.normal + ram_counters.duplicate +
 895                 compression_counters.pages + xbzrle_counters.pages;
 896 }
 897
 898 static void migration_update_rates(RAMState *rs, int64_t end_time)
 899 {
 900     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 901     double compressed_size;
 902
 903     /* calculate period counters */
 904     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 905                 / (end_time - rs->time_last_bitmap_sync);
 906
 907     if (!page_count) {
 908         return;
 909     }
 910
 911     if (migrate_use_xbzrle()) {
 912         double encoded_size, unencoded_size;
 913
 914         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 915             rs->xbzrle_cache_miss_prev) / page_count;
 916         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 917         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 918                          TARGET_PAGE_SIZE;
 919         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 920         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 921             xbzrle_counters.encoding_rate = 0;
 922         } else {
 923             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 924         }
 925         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 926         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 927     }
 928
 929     if (migrate_use_compression()) {
 930         compression_counters.busy_rate = (double)(compression_counters.busy -
 931             rs->compress_thread_busy_prev) / page_count;
 932         rs->compress_thread_busy_prev = compression_counters.busy;
 933
 934         compressed_size = compression_counters.compressed_size -
 935                           rs->compressed_size_prev;
 936         if (compressed_size) {
 937             double uncompressed_size = (compression_counters.pages -
 938                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 939
 940             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 941             compression_counters.compression_rate =
 942                                         uncompressed_size / compressed_size;
 943
 944             rs->compress_pages_prev = compression_counters.pages;
 945             rs->compressed_size_prev = compression_counters.compressed_size;
 946         }
 947     }
 948 }
 949
 950 static void migration_trigger_throttle(RAMState *rs)
 951 {
 952     MigrationState *s = migrate_get_current();
 953     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 954
 955     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 956     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 957     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 958
 959     /* During block migration the auto-converge logic incorrectly detects
 960      * that ram migration makes no progress. Avoid this by disabling the
 961      * throttling logic during the bulk phase of block migration. */
 962     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 963         /* The following detection logic can be refined later. For now:
 964            Check to see if the ratio between dirtied bytes and the approx.
 965            amount of bytes that just got transferred since the last time
 966            we were in this routine reaches the threshold. If that happens
 967            twice, start or increase throttling. */
 968
 969         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 970             (++rs->dirty_rate_high_cnt >= 2)) {
 971             trace_migration_throttle();
 972             rs->dirty_rate_high_cnt = 0;
 973             mig_throttle_guest_down(bytes_dirty_period,
 974                                     bytes_dirty_threshold);
 975         }
 976     }
 977 }
 978
 979 static void migration_bitmap_sync(RAMState *rs)
 980 {
 981     RAMBlock *block;
 982     int64_t end_time;
 983
 984     ram_counters.dirty_sync_count++;
 985
 986     if (!rs->time_last_bitmap_sync) {
 987         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 988     }
 989
 990     trace_migration_bitmap_sync_start();
 991     memory_global_dirty_log_sync();
 992
 993     qemu_mutex_lock(&rs->bitmap_mutex);
 994     WITH_RCU_READ_LOCK_GUARD() {
 995         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 996             ramblock_sync_dirty_bitmap(rs, block);
 997         }
 998         ram_counters.remaining = ram_bytes_remaining();
 999     }
1000     qemu_mutex_unlock(&rs->bitmap_mutex);
1001
1002     memory_global_after_dirty_log_sync();
1003     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1004
1005     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1006
1007     /* more than 1 second = 1000 millisecons */
1008     if (end_time > rs->time_last_bitmap_sync + 1000) {
1009         migration_trigger_throttle(rs);
1010
1011         migration_update_rates(rs, end_time);
1012
1013         rs->target_page_count_prev = rs->target_page_count;
1014
1015         /* reset period counters */
1016         rs->time_last_bitmap_sync = end_time;
1017         rs->num_dirty_pages_period = 0;
1018         rs->bytes_xfer_prev = ram_counters.transferred;
1019     }
1020     if (migrate_use_events()) {
1021         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1022     }
1023 }
1024
1025 static void migration_bitmap_sync_precopy(RAMState *rs)
1026 {
1027     Error *local_err = NULL;
1028
1029     /*
1030      * The current notifier usage is just an optimization to migration, so we
1031      * don't stop the normal migration process in the error case.
1032      */
1033     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1034         error_report_err(local_err);
1035         local_err = NULL;
1036     }
1037
1038     migration_bitmap_sync(rs);
1039
1040     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1041         error_report_err(local_err);
1042     }
1043 }
1044
1045 /**
1046  * save_zero_page_to_file: send the zero page to the file
1047  *
1048  * Returns the size of data written to the file, 0 means the page is not
1049  * a zero page
1050  *
1051  * @rs: current RAM state
1052  * @file: the file where the data is saved
1053  * @block: block that contains the page we want to send
1054  * @offset: offset inside the block for the page
1055  */
1056 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1057                                   RAMBlock *block, ram_addr_t offset)
1058 {
1059     uint8_t *p = block->host + offset;
1060     int len = 0;
1061
1062     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1063         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1064         qemu_put_byte(file, 0);
1065         len += 1;
1066     }
1067     return len;
1068 }
1069
1070 /**
1071  * save_zero_page: send the zero page to the stream
1072  *
1073  * Returns the number of pages written.
1074  *
1075  * @rs: current RAM state
1076  * @block: block that contains the page we want to send
1077  * @offset: offset inside the block for the page
1078  */
1079 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1080 {
1081     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1082
1083     if (len) {
1084         ram_counters.duplicate++;
1085         ram_counters.transferred += len;
1086         return 1;
1087     }
1088     return -1;
1089 }
1090
1091 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1092 {
1093     if (!migrate_release_ram() || !migration_in_postcopy()) {
1094         return;
1095     }
1096
1097     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1098 }
1099
1100 /*
1101  * @pages: the number of pages written by the control path,
1102  *        < 0 - error
1103  *        > 0 - number of pages written
1104  *
1105  * Return true if the pages has been saved, otherwise false is returned.
1106  */
1107 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1108                               int *pages)
1109 {
1110     uint64_t bytes_xmit = 0;
1111     int ret;
1112
1113     *pages = -1;
1114     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1115                                 &bytes_xmit);
1116     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1117         return false;
1118     }
1119
1120     if (bytes_xmit) {
1121         ram_counters.transferred += bytes_xmit;
1122         *pages = 1;
1123     }
1124
1125     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1126         return true;
1127     }
1128
1129     if (bytes_xmit > 0) {
1130         ram_counters.normal++;
1131     } else if (bytes_xmit == 0) {
1132         ram_counters.duplicate++;
1133     }
1134
1135     return true;
1136 }
1137
1138 /*
1139  * directly send the page to the stream
1140  *
1141  * Returns the number of pages written.
1142  *
1143  * @rs: current RAM state
1144  * @block: block that contains the page we want to send
1145  * @offset: offset inside the block for the page
1146  * @buf: the page to be sent
1147  * @async: send to page asyncly
1148  */
1149 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1150                             uint8_t *buf, bool async)
1151 {
1152     ram_counters.transferred += save_page_header(rs, rs->f, block,
1153                                                  offset | RAM_SAVE_FLAG_PAGE);
1154     if (async) {
1155         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1156                               migrate_release_ram() &
1157                               migration_in_postcopy());
1158     } else {
1159         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1160     }
1161     ram_counters.transferred += TARGET_PAGE_SIZE;
1162     ram_counters.normal++;
1163     return 1;
1164 }
1165
1166 /**
1167  * ram_save_page: send the given page to the stream
1168  *
1169  * Returns the number of pages written.
1170  *          < 0 - error
1171  *          >=0 - Number of pages written - this might legally be 0
1172  *                if xbzrle noticed the page was the same.
1173  *
1174  * @rs: current RAM state
1175  * @block: block that contains the page we want to send
1176  * @offset: offset inside the block for the page
1177  * @last_stage: if we are at the completion stage
1178  */
1179 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1180 {
1181     int pages = -1;
1182     uint8_t *p;
1183     bool send_async = true;
1184     RAMBlock *block = pss->block;
1185     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1186     ram_addr_t current_addr = block->offset + offset;
1187
1188     p = block->host + offset;
1189     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1190
1191     XBZRLE_cache_lock();
1192     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1193         pages = save_xbzrle_page(rs, &p, current_addr, block,
1194                                  offset, last_stage);
1195         if (!last_stage) {
1196             /* Can't send this cached data async, since the cache page
1197              * might get updated before it gets to the wire
1198              */
1199             send_async = false;
1200         }
1201     }
1202
1203     /* XBZRLE overflow or normal page */
1204     if (pages == -1) {
1205         pages = save_normal_page(rs, block, offset, p, send_async);
1206     }
1207
1208     XBZRLE_cache_unlock();
1209
1210     return pages;
1211 }
1212
1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1214                                  ram_addr_t offset)
1215 {
1216     if (multifd_queue_page(rs->f, block, offset) < 0) {
1217         return -1;
1218     }
1219     ram_counters.normal++;
1220
1221     return 1;
1222 }
1223
1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1225                                  ram_addr_t offset, uint8_t *source_buf)
1226 {
1227     RAMState *rs = ram_state;
1228     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1229     bool zero_page = false;
1230     int ret;
1231
1232     if (save_zero_page_to_file(rs, f, block, offset)) {
1233         zero_page = true;
1234         goto exit;
1235     }
1236
1237     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1238
1239     /*
1240      * copy it to a internal buffer to avoid it being modified by VM
1241      * so that we can catch up the error during compression and
1242      * decompression
1243      */
1244     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1245     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1246     if (ret < 0) {
1247         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1248         error_report("compressed data failed!");
1249         return false;
1250     }
1251
1252 exit:
1253     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1254     return zero_page;
1255 }
1256
1257 static void
1258 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1259 {
1260     ram_counters.transferred += bytes_xmit;
1261
1262     if (param->zero_page) {
1263         ram_counters.duplicate++;
1264         return;
1265     }
1266
1267     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268     compression_counters.compressed_size += bytes_xmit - 8;
1269     compression_counters.pages++;
1270 }
1271
1272 static bool save_page_use_compression(RAMState *rs);
1273
1274 static void flush_compressed_data(RAMState *rs)
1275 {
1276     int idx, len, thread_count;
1277
1278     if (!save_page_use_compression(rs)) {
1279         return;
1280     }
1281     thread_count = migrate_compress_threads();
1282
1283     qemu_mutex_lock(&comp_done_lock);
1284     for (idx = 0; idx < thread_count; idx++) {
1285         while (!comp_param[idx].done) {
1286             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1287         }
1288     }
1289     qemu_mutex_unlock(&comp_done_lock);
1290
1291     for (idx = 0; idx < thread_count; idx++) {
1292         qemu_mutex_lock(&comp_param[idx].mutex);
1293         if (!comp_param[idx].quit) {
1294             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1295             /*
1296              * it's safe to fetch zero_page without holding comp_done_lock
1297              * as there is no further request submitted to the thread,
1298              * i.e, the thread should be waiting for a request at this point.
1299              */
1300             update_compress_thread_counts(&comp_param[idx], len);
1301         }
1302         qemu_mutex_unlock(&comp_param[idx].mutex);
1303     }
1304 }
1305
1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1307                                        ram_addr_t offset)
1308 {
1309     param->block = block;
1310     param->offset = offset;
1311 }
1312
1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1314                                            ram_addr_t offset)
1315 {
1316     int idx, thread_count, bytes_xmit = -1, pages = -1;
1317     bool wait = migrate_compress_wait_thread();
1318
1319     thread_count = migrate_compress_threads();
1320     qemu_mutex_lock(&comp_done_lock);
1321 retry:
1322     for (idx = 0; idx < thread_count; idx++) {
1323         if (comp_param[idx].done) {
1324             comp_param[idx].done = false;
1325             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1326             qemu_mutex_lock(&comp_param[idx].mutex);
1327             set_compress_params(&comp_param[idx], block, offset);
1328             qemu_cond_signal(&comp_param[idx].cond);
1329             qemu_mutex_unlock(&comp_param[idx].mutex);
1330             pages = 1;
1331             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1332             break;
1333         }
1334     }
1335
1336     /*
1337      * wait for the free thread if the user specifies 'compress-wait-thread',
1338      * otherwise we will post the page out in the main thread as normal page.
1339      */
1340     if (pages < 0 && wait) {
1341         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1342         goto retry;
1343     }
1344     qemu_mutex_unlock(&comp_done_lock);
1345
1346     return pages;
1347 }
1348
1349 /**
1350  * find_dirty_block: find the next dirty page and update any state
1351  * associated with the search process.
1352  *
1353  * Returns true if a page is found
1354  *
1355  * @rs: current RAM state
1356  * @pss: data about the state of the current dirty page scan
1357  * @again: set to false if the search has scanned the whole of RAM
1358  */
1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1360 {
1361     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1362     if (pss->complete_round && pss->block == rs->last_seen_block &&
1363         pss->page >= rs->last_page) {
1364         /*
1365          * We've been once around the RAM and haven't found anything.
1366          * Give up.
1367          */
1368         *again = false;
1369         return false;
1370     }
1371     if (!offset_in_ramblock(pss->block,
1372                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1373         /* Didn't find anything in this RAM Block */
1374         pss->page = 0;
1375         pss->block = QLIST_NEXT_RCU(pss->block, next);
1376         if (!pss->block) {
1377             /*
1378              * If memory migration starts over, we will meet a dirtied page
1379              * which may still exists in compression threads's ring, so we
1380              * should flush the compressed data to make sure the new page
1381              * is not overwritten by the old one in the destination.
1382              *
1383              * Also If xbzrle is on, stop using the data compression at this
1384              * point. In theory, xbzrle can do better than compression.
1385              */
1386             flush_compressed_data(rs);
1387
1388             /* Hit the end of the list */
1389             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1390             /* Flag that we've looped */
1391             pss->complete_round = true;
1392             /* After the first round, enable XBZRLE. */
1393             if (migrate_use_xbzrle()) {
1394                 rs->xbzrle_enabled = true;
1395             }
1396         }
1397         /* Didn't find anything this time, but try again on the new block */
1398         *again = true;
1399         return false;
1400     } else {
1401         /* Can go around again, but... */
1402         *again = true;
1403         /* We've found something so probably don't need to */
1404         return true;
1405     }
1406 }
1407
1408 /**
1409  * unqueue_page: gets a page of the queue
1410  *
1411  * Helper for 'get_queued_page' - gets a page off the queue
1412  *
1413  * Returns the block of the page (or NULL if none available)
1414  *
1415  * @rs: current RAM state
1416  * @offset: used to return the offset within the RAMBlock
1417  */
1418 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1419 {
1420     RAMBlock *block = NULL;
1421
1422     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1423         return NULL;
1424     }
1425
1426     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1427     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1428         struct RAMSrcPageRequest *entry =
1429                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1430         block = entry->rb;
1431         *offset = entry->offset;
1432
1433         if (entry->len > TARGET_PAGE_SIZE) {
1434             entry->len -= TARGET_PAGE_SIZE;
1435             entry->offset += TARGET_PAGE_SIZE;
1436         } else {
1437             memory_region_unref(block->mr);
1438             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1439             g_free(entry);
1440             migration_consume_urgent_request();
1441         }
1442     }
1443
1444     return block;
1445 }
1446
1447 #if defined(__linux__)
1448 /**
1449  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1450  *   is found, return RAM block pointer and page offset
1451  *
1452  * Returns pointer to the RAMBlock containing faulting page,
1453  *   NULL if no write faults are pending
1454  *
1455  * @rs: current RAM state
1456  * @offset: page offset from the beginning of the block
1457  */
1458 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1459 {
1460     struct uffd_msg uffd_msg;
1461     void *page_address;
1462     RAMBlock *block;
1463     int res;
1464
1465     if (!migrate_background_snapshot()) {
1466         return NULL;
1467     }
1468
1469     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1470     if (res <= 0) {
1471         return NULL;
1472     }
1473
1474     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1475     block = qemu_ram_block_from_host(page_address, false, offset);
1476     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1477     return block;
1478 }
1479
1480 /**
1481  * ram_save_release_protection: release UFFD write protection after
1482  *   a range of pages has been saved
1483  *
1484  * @rs: current RAM state
1485  * @pss: page-search-status structure
1486  * @start_page: index of the first page in the range relative to pss->block
1487  *
1488  * Returns 0 on success, negative value in case of an error
1489 */
1490 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1491         unsigned long start_page)
1492 {
1493     int res = 0;
1494
1495     /* Check if page is from UFFD-managed region. */
1496     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1497         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1498         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1499
1500         /* Flush async buffers before un-protect. */
1501         qemu_fflush(rs->f);
1502         /* Un-protect memory range. */
1503         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1504                 false, false);
1505     }
1506
1507     return res;
1508 }
1509
1510 /* ram_write_tracking_available: check if kernel supports required UFFD features
1511  *
1512  * Returns true if supports, false otherwise
1513  */
1514 bool ram_write_tracking_available(void)
1515 {
1516     uint64_t uffd_features;
1517     int res;
1518
1519     res = uffd_query_features(&uffd_features);
1520     return (res == 0 &&
1521             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1522 }
1523
1524 /* ram_write_tracking_compatible: check if guest configuration is
1525  *   compatible with 'write-tracking'
1526  *
1527  * Returns true if compatible, false otherwise
1528  */
1529 bool ram_write_tracking_compatible(void)
1530 {
1531     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1532     int uffd_fd;
1533     RAMBlock *block;
1534     bool ret = false;
1535
1536     /* Open UFFD file descriptor */
1537     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1538     if (uffd_fd < 0) {
1539         return false;
1540     }
1541
1542     RCU_READ_LOCK_GUARD();
1543
1544     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1545         uint64_t uffd_ioctls;
1546
1547         /* Nothing to do with read-only and MMIO-writable regions */
1548         if (block->mr->readonly || block->mr->rom_device) {
1549             continue;
1550         }
1551         /* Try to register block memory via UFFD-IO to track writes */
1552         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1553                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1554             goto out;
1555         }
1556         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1557             goto out;
1558         }
1559     }
1560     ret = true;
1561
1562 out:
1563     uffd_close_fd(uffd_fd);
1564     return ret;
1565 }
1566
1567 /*
1568  * ram_block_populate_pages: populate memory in the RAM block by reading
1569  *   an integer from the beginning of each page.
1570  *
1571  * Since it's solely used for userfault_fd WP feature, here we just
1572  *   hardcode page size to qemu_real_host_page_size.
1573  *
1574  * @block: RAM block to populate
1575  */
1576 static void ram_block_populate_pages(RAMBlock *block)
1577 {
1578     char *ptr = (char *) block->host;
1579
1580     for (ram_addr_t offset = 0; offset < block->used_length;
1581             offset += qemu_real_host_page_size) {
1582         char tmp = *(ptr + offset);
1583
1584         /* Don't optimize the read out */
1585         asm volatile("" : "+r" (tmp));
1586     }
1587 }
1588
1589 /*
1590  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1591  */
1592 void ram_write_tracking_prepare(void)
1593 {
1594     RAMBlock *block;
1595
1596     RCU_READ_LOCK_GUARD();
1597
1598     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1599         /* Nothing to do with read-only and MMIO-writable regions */
1600         if (block->mr->readonly || block->mr->rom_device) {
1601             continue;
1602         }
1603
1604         /*
1605          * Populate pages of the RAM block before enabling userfault_fd
1606          * write protection.
1607          *
1608          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1609          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1610          * pages with pte_none() entries in page table.
1611          */
1612         ram_block_populate_pages(block);
1613     }
1614 }
1615
1616 /*
1617  * ram_write_tracking_start: start UFFD-WP memory tracking
1618  *
1619  * Returns 0 for success or negative value in case of error
1620  */
1621 int ram_write_tracking_start(void)
1622 {
1623     int uffd_fd;
1624     RAMState *rs = ram_state;
1625     RAMBlock *block;
1626
1627     /* Open UFFD file descriptor */
1628     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1629     if (uffd_fd < 0) {
1630         return uffd_fd;
1631     }
1632     rs->uffdio_fd = uffd_fd;
1633
1634     RCU_READ_LOCK_GUARD();
1635
1636     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1637         /* Nothing to do with read-only and MMIO-writable regions */
1638         if (block->mr->readonly || block->mr->rom_device) {
1639             continue;
1640         }
1641
1642         /* Register block memory with UFFD to track writes */
1643         if (uffd_register_memory(rs->uffdio_fd, block->host,
1644                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1645             goto fail;
1646         }
1647         /* Apply UFFD write protection to the block memory range */
1648         if (uffd_change_protection(rs->uffdio_fd, block->host,
1649                 block->max_length, true, false)) {
1650             goto fail;
1651         }
1652         block->flags |= RAM_UF_WRITEPROTECT;
1653         memory_region_ref(block->mr);
1654
1655         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1656                 block->host, block->max_length);
1657     }
1658
1659     return 0;
1660
1661 fail:
1662     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1663
1664     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1665         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1666             continue;
1667         }
1668         /*
1669          * In case some memory block failed to be write-protected
1670          * remove protection and unregister all succeeded RAM blocks
1671          */
1672         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1673                 false, false);
1674         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1675         /* Cleanup flags and remove reference */
1676         block->flags &= ~RAM_UF_WRITEPROTECT;
1677         memory_region_unref(block->mr);
1678     }
1679
1680     uffd_close_fd(uffd_fd);
1681     rs->uffdio_fd = -1;
1682     return -1;
1683 }
1684
1685 /**
1686  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1687  */
1688 void ram_write_tracking_stop(void)
1689 {
1690     RAMState *rs = ram_state;
1691     RAMBlock *block;
1692
1693     RCU_READ_LOCK_GUARD();
1694
1695     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1696         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1697             continue;
1698         }
1699         /* Remove protection and unregister all affected RAM blocks */
1700         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1701                 false, false);
1702         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1703
1704         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1705                 block->host, block->max_length);
1706
1707         /* Cleanup flags and remove reference */
1708         block->flags &= ~RAM_UF_WRITEPROTECT;
1709         memory_region_unref(block->mr);
1710     }
1711
1712     /* Finally close UFFD file descriptor */
1713     uffd_close_fd(rs->uffdio_fd);
1714     rs->uffdio_fd = -1;
1715 }
1716
1717 #else
1718 /* No target OS support, stubs just fail or ignore */
1719
1720 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1721 {
1722     (void) rs;
1723     (void) offset;
1724
1725     return NULL;
1726 }
1727
1728 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1729         unsigned long start_page)
1730 {
1731     (void) rs;
1732     (void) pss;
1733     (void) start_page;
1734
1735     return 0;
1736 }
1737
1738 bool ram_write_tracking_available(void)
1739 {
1740     return false;
1741 }
1742
1743 bool ram_write_tracking_compatible(void)
1744 {
1745     assert(0);
1746     return false;
1747 }
1748
1749 int ram_write_tracking_start(void)
1750 {
1751     assert(0);
1752     return -1;
1753 }
1754
1755 void ram_write_tracking_stop(void)
1756 {
1757     assert(0);
1758 }
1759 #endif /* defined(__linux__) */
1760
1761 /**
1762  * get_queued_page: unqueue a page from the postcopy requests
1763  *
1764  * Skips pages that are already sent (!dirty)
1765  *
1766  * Returns true if a queued page is found
1767  *
1768  * @rs: current RAM state
1769  * @pss: data about the state of the current dirty page scan
1770  */
1771 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1772 {
1773     RAMBlock  *block;
1774     ram_addr_t offset;
1775     bool dirty;
1776
1777     do {
1778         block = unqueue_page(rs, &offset);
1779         /*
1780          * We're sending this page, and since it's postcopy nothing else
1781          * will dirty it, and we must make sure it doesn't get sent again
1782          * even if this queue request was received after the background
1783          * search already sent it.
1784          */
1785         if (block) {
1786             unsigned long page;
1787
1788             page = offset >> TARGET_PAGE_BITS;
1789             dirty = test_bit(page, block->bmap);
1790             if (!dirty) {
1791                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1792                                                 page);
1793             } else {
1794                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1795             }
1796         }
1797
1798     } while (block && !dirty);
1799
1800     if (!block) {
1801         /*
1802          * Poll write faults too if background snapshot is enabled; that's
1803          * when we have vcpus got blocked by the write protected pages.
1804          */
1805         block = poll_fault_page(rs, &offset);
1806     }
1807
1808     if (block) {
1809         /*
1810          * We want the background search to continue from the queued page
1811          * since the guest is likely to want other pages near to the page
1812          * it just requested.
1813          */
1814         pss->block = block;
1815         pss->page = offset >> TARGET_PAGE_BITS;
1816
1817         /*
1818          * This unqueued page would break the "one round" check, even is
1819          * really rare.
1820          */
1821         pss->complete_round = false;
1822     }
1823
1824     return !!block;
1825 }
1826
1827 /**
1828  * migration_page_queue_free: drop any remaining pages in the ram
1829  * request queue
1830  *
1831  * It should be empty at the end anyway, but in error cases there may
1832  * be some left.  in case that there is any page left, we drop it.
1833  *
1834  */
1835 static void migration_page_queue_free(RAMState *rs)
1836 {
1837     struct RAMSrcPageRequest *mspr, *next_mspr;
1838     /* This queue generally should be empty - but in the case of a failed
1839      * migration might have some droppings in.
1840      */
1841     RCU_READ_LOCK_GUARD();
1842     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1843         memory_region_unref(mspr->rb->mr);
1844         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1845         g_free(mspr);
1846     }
1847 }
1848
1849 /**
1850  * ram_save_queue_pages: queue the page for transmission
1851  *
1852  * A request from postcopy destination for example.
1853  *
1854  * Returns zero on success or negative on error
1855  *
1856  * @rbname: Name of the RAMBLock of the request. NULL means the
1857  *          same that last one.
1858  * @start: starting address from the start of the RAMBlock
1859  * @len: length (in bytes) to send
1860  */
1861 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1862 {
1863     RAMBlock *ramblock;
1864     RAMState *rs = ram_state;
1865
1866     ram_counters.postcopy_requests++;
1867     RCU_READ_LOCK_GUARD();
1868
1869     if (!rbname) {
1870         /* Reuse last RAMBlock */
1871         ramblock = rs->last_req_rb;
1872
1873         if (!ramblock) {
1874             /*
1875              * Shouldn't happen, we can't reuse the last RAMBlock if
1876              * it's the 1st request.
1877              */
1878             error_report("ram_save_queue_pages no previous block");
1879             return -1;
1880         }
1881     } else {
1882         ramblock = qemu_ram_block_by_name(rbname);
1883
1884         if (!ramblock) {
1885             /* We shouldn't be asked for a non-existent RAMBlock */
1886             error_report("ram_save_queue_pages no block '%s'", rbname);
1887             return -1;
1888         }
1889         rs->last_req_rb = ramblock;
1890     }
1891     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1892     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1893         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1894                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1895                      __func__, start, len, ramblock->used_length);
1896         return -1;
1897     }
1898
1899     struct RAMSrcPageRequest *new_entry =
1900         g_malloc0(sizeof(struct RAMSrcPageRequest));
1901     new_entry->rb = ramblock;
1902     new_entry->offset = start;
1903     new_entry->len = len;
1904
1905     memory_region_ref(ramblock->mr);
1906     qemu_mutex_lock(&rs->src_page_req_mutex);
1907     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1908     migration_make_urgent_request();
1909     qemu_mutex_unlock(&rs->src_page_req_mutex);
1910
1911     return 0;
1912 }
1913
1914 static bool save_page_use_compression(RAMState *rs)
1915 {
1916     if (!migrate_use_compression()) {
1917         return false;
1918     }
1919
1920     /*
1921      * If xbzrle is enabled (e.g., after first round of migration), stop
1922      * using the data compression. In theory, xbzrle can do better than
1923      * compression.
1924      */
1925     if (rs->xbzrle_enabled) {
1926         return false;
1927     }
1928
1929     return true;
1930 }
1931
1932 /*
1933  * try to compress the page before posting it out, return true if the page
1934  * has been properly handled by compression, otherwise needs other
1935  * paths to handle it
1936  */
1937 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1938 {
1939     if (!save_page_use_compression(rs)) {
1940         return false;
1941     }
1942
1943     /*
1944      * When starting the process of a new block, the first page of
1945      * the block should be sent out before other pages in the same
1946      * block, and all the pages in last block should have been sent
1947      * out, keeping this order is important, because the 'cont' flag
1948      * is used to avoid resending the block name.
1949      *
1950      * We post the fist page as normal page as compression will take
1951      * much CPU resource.
1952      */
1953     if (block != rs->last_sent_block) {
1954         flush_compressed_data(rs);
1955         return false;
1956     }
1957
1958     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1959         return true;
1960     }
1961
1962     compression_counters.busy++;
1963     return false;
1964 }
1965
1966 /**
1967  * ram_save_target_page: save one target page
1968  *
1969  * Returns the number of pages written
1970  *
1971  * @rs: current RAM state
1972  * @pss: data about the page we want to send
1973  * @last_stage: if we are at the completion stage
1974  */
1975 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1976                                 bool last_stage)
1977 {
1978     RAMBlock *block = pss->block;
1979     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1980     int res;
1981
1982     if (control_save_page(rs, block, offset, &res)) {
1983         return res;
1984     }
1985
1986     if (save_compress_page(rs, block, offset)) {
1987         return 1;
1988     }
1989
1990     res = save_zero_page(rs, block, offset);
1991     if (res > 0) {
1992         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1993          * page would be stale
1994          */
1995         if (!save_page_use_compression(rs)) {
1996             XBZRLE_cache_lock();
1997             xbzrle_cache_zero_page(rs, block->offset + offset);
1998             XBZRLE_cache_unlock();
1999         }
2000         ram_release_pages(block->idstr, offset, res);
2001         return res;
2002     }
2003
2004     /*
2005      * Do not use multifd for:
2006      * 1. Compression as the first page in the new block should be posted out
2007      *    before sending the compressed page
2008      * 2. In postcopy as one whole host page should be placed
2009      */
2010     if (!save_page_use_compression(rs) && migrate_use_multifd()
2011         && !migration_in_postcopy()) {
2012         return ram_save_multifd_page(rs, block, offset);
2013     }
2014
2015     return ram_save_page(rs, pss, last_stage);
2016 }
2017
2018 /**
2019  * ram_save_host_page: save a whole host page
2020  *
2021  * Starting at *offset send pages up to the end of the current host
2022  * page. It's valid for the initial offset to point into the middle of
2023  * a host page in which case the remainder of the hostpage is sent.
2024  * Only dirty target pages are sent. Note that the host page size may
2025  * be a huge page for this block.
2026  * The saving stops at the boundary of the used_length of the block
2027  * if the RAMBlock isn't a multiple of the host page size.
2028  *
2029  * Returns the number of pages written or negative on error
2030  *
2031  * @rs: current RAM state
2032  * @ms: current migration state
2033  * @pss: data about the page we want to send
2034  * @last_stage: if we are at the completion stage
2035  */
2036 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2037                               bool last_stage)
2038 {
2039     int tmppages, pages = 0;
2040     size_t pagesize_bits =
2041         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2042     unsigned long hostpage_boundary =
2043         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2044     unsigned long start_page = pss->page;
2045     int res;
2046
2047     if (ramblock_is_ignored(pss->block)) {
2048         error_report("block %s should not be migrated !", pss->block->idstr);
2049         return 0;
2050     }
2051
2052     do {
2053         /* Check the pages is dirty and if it is send it */
2054         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2055             tmppages = ram_save_target_page(rs, pss, last_stage);
2056             if (tmppages < 0) {
2057                 return tmppages;
2058             }
2059
2060             pages += tmppages;
2061             /*
2062              * Allow rate limiting to happen in the middle of huge pages if
2063              * something is sent in the current iteration.
2064              */
2065             if (pagesize_bits > 1 && tmppages > 0) {
2066                 migration_rate_limit();
2067             }
2068         }
2069         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2070     } while ((pss->page < hostpage_boundary) &&
2071              offset_in_ramblock(pss->block,
2072                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2073     /* The offset we leave with is the min boundary of host page and block */
2074     pss->page = MIN(pss->page, hostpage_boundary) - 1;
2075
2076     res = ram_save_release_protection(rs, pss, start_page);
2077     return (res < 0 ? res : pages);
2078 }
2079
2080 /**
2081  * ram_find_and_save_block: finds a dirty page and sends it to f
2082  *
2083  * Called within an RCU critical section.
2084  *
2085  * Returns the number of pages written where zero means no dirty pages,
2086  * or negative on error
2087  *
2088  * @rs: current RAM state
2089  * @last_stage: if we are at the completion stage
2090  *
2091  * On systems where host-page-size > target-page-size it will send all the
2092  * pages in a host page that are dirty.
2093  */
2094
2095 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2096 {
2097     PageSearchStatus pss;
2098     int pages = 0;
2099     bool again, found;
2100
2101     /* No dirty page as there is zero RAM */
2102     if (!ram_bytes_total()) {
2103         return pages;
2104     }
2105
2106     pss.block = rs->last_seen_block;
2107     pss.page = rs->last_page;
2108     pss.complete_round = false;
2109
2110     if (!pss.block) {
2111         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2112     }
2113
2114     do {
2115         again = true;
2116         found = get_queued_page(rs, &pss);
2117
2118         if (!found) {
2119             /* priority queue empty, so just search for something dirty */
2120             found = find_dirty_block(rs, &pss, &again);
2121         }
2122
2123         if (found) {
2124             pages = ram_save_host_page(rs, &pss, last_stage);
2125         }
2126     } while (!pages && again);
2127
2128     rs->last_seen_block = pss.block;
2129     rs->last_page = pss.page;
2130
2131     return pages;
2132 }
2133
2134 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2135 {
2136     uint64_t pages = size / TARGET_PAGE_SIZE;
2137
2138     if (zero) {
2139         ram_counters.duplicate += pages;
2140     } else {
2141         ram_counters.normal += pages;
2142         ram_counters.transferred += size;
2143         qemu_update_position(f, size);
2144     }
2145 }
2146
2147 static uint64_t ram_bytes_total_common(bool count_ignored)
2148 {
2149     RAMBlock *block;
2150     uint64_t total = 0;
2151
2152     RCU_READ_LOCK_GUARD();
2153
2154     if (count_ignored) {
2155         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2156             total += block->used_length;
2157         }
2158     } else {
2159         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2160             total += block->used_length;
2161         }
2162     }
2163     return total;
2164 }
2165
2166 uint64_t ram_bytes_total(void)
2167 {
2168     return ram_bytes_total_common(false);
2169 }
2170
2171 static void xbzrle_load_setup(void)
2172 {
2173     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2174 }
2175
2176 static void xbzrle_load_cleanup(void)
2177 {
2178     g_free(XBZRLE.decoded_buf);
2179     XBZRLE.decoded_buf = NULL;
2180 }
2181
2182 static void ram_state_cleanup(RAMState **rsp)
2183 {
2184     if (*rsp) {
2185         migration_page_queue_free(*rsp);
2186         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2187         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2188         g_free(*rsp);
2189         *rsp = NULL;
2190     }
2191 }
2192
2193 static void xbzrle_cleanup(void)
2194 {
2195     XBZRLE_cache_lock();
2196     if (XBZRLE.cache) {
2197         cache_fini(XBZRLE.cache);
2198         g_free(XBZRLE.encoded_buf);
2199         g_free(XBZRLE.current_buf);
2200         g_free(XBZRLE.zero_target_page);
2201         XBZRLE.cache = NULL;
2202         XBZRLE.encoded_buf = NULL;
2203         XBZRLE.current_buf = NULL;
2204         XBZRLE.zero_target_page = NULL;
2205     }
2206     XBZRLE_cache_unlock();
2207 }
2208
2209 static void ram_save_cleanup(void *opaque)
2210 {
2211     RAMState **rsp = opaque;
2212     RAMBlock *block;
2213
2214     /* We don't use dirty log with background snapshots */
2215     if (!migrate_background_snapshot()) {
2216         /* caller have hold iothread lock or is in a bh, so there is
2217          * no writing race against the migration bitmap
2218          */
2219         memory_global_dirty_log_stop();
2220     }
2221
2222     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2223         g_free(block->clear_bmap);
2224         block->clear_bmap = NULL;
2225         g_free(block->bmap);
2226         block->bmap = NULL;
2227     }
2228
2229     xbzrle_cleanup();
2230     compress_threads_save_cleanup();
2231     ram_state_cleanup(rsp);
2232 }
2233
2234 static void ram_state_reset(RAMState *rs)
2235 {
2236     rs->last_seen_block = NULL;
2237     rs->last_sent_block = NULL;
2238     rs->last_page = 0;
2239     rs->last_version = ram_list.version;
2240     rs->xbzrle_enabled = false;
2241 }
2242
2243 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2244
2245 /*
2246  * 'expected' is the value you expect the bitmap mostly to be full
2247  * of; it won't bother printing lines that are all this value.
2248  * If 'todump' is null the migration bitmap is dumped.
2249  */
2250 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2251                            unsigned long pages)
2252 {
2253     int64_t cur;
2254     int64_t linelen = 128;
2255     char linebuf[129];
2256
2257     for (cur = 0; cur < pages; cur += linelen) {
2258         int64_t curb;
2259         bool found = false;
2260         /*
2261          * Last line; catch the case where the line length
2262          * is longer than remaining ram
2263          */
2264         if (cur + linelen > pages) {
2265             linelen = pages - cur;
2266         }
2267         for (curb = 0; curb < linelen; curb++) {
2268             bool thisbit = test_bit(cur + curb, todump);
2269             linebuf[curb] = thisbit ? '1' : '.';
2270             found = found || (thisbit != expected);
2271         }
2272         if (found) {
2273             linebuf[curb] = '\0';
2274             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2275         }
2276     }
2277 }
2278
2279 /* **** functions for postcopy ***** */
2280
2281 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2282 {
2283     struct RAMBlock *block;
2284
2285     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2286         unsigned long *bitmap = block->bmap;
2287         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2288         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2289
2290         while (run_start < range) {
2291             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2292             ram_discard_range(block->idstr,
2293                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2294                               ((ram_addr_t)(run_end - run_start))
2295                                 << TARGET_PAGE_BITS);
2296             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2297         }
2298     }
2299 }
2300
2301 /**
2302  * postcopy_send_discard_bm_ram: discard a RAMBlock
2303  *
2304  * Returns zero on success
2305  *
2306  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2307  *
2308  * @ms: current migration state
2309  * @block: RAMBlock to discard
2310  */
2311 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2312 {
2313     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2314     unsigned long current;
2315     unsigned long *bitmap = block->bmap;
2316
2317     for (current = 0; current < end; ) {
2318         unsigned long one = find_next_bit(bitmap, end, current);
2319         unsigned long zero, discard_length;
2320
2321         if (one >= end) {
2322             break;
2323         }
2324
2325         zero = find_next_zero_bit(bitmap, end, one + 1);
2326
2327         if (zero >= end) {
2328             discard_length = end - one;
2329         } else {
2330             discard_length = zero - one;
2331         }
2332         postcopy_discard_send_range(ms, one, discard_length);
2333         current = one + discard_length;
2334     }
2335
2336     return 0;
2337 }
2338
2339 /**
2340  * postcopy_each_ram_send_discard: discard all RAMBlocks
2341  *
2342  * Returns 0 for success or negative for error
2343  *
2344  * Utility for the outgoing postcopy code.
2345  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2346  *   passing it bitmap indexes and name.
2347  * (qemu_ram_foreach_block ends up passing unscaled lengths
2348  *  which would mean postcopy code would have to deal with target page)
2349  *
2350  * @ms: current migration state
2351  */
2352 static int postcopy_each_ram_send_discard(MigrationState *ms)
2353 {
2354     struct RAMBlock *block;
2355     int ret;
2356
2357     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2358         postcopy_discard_send_init(ms, block->idstr);
2359
2360         /*
2361          * Postcopy sends chunks of bitmap over the wire, but it
2362          * just needs indexes at this point, avoids it having
2363          * target page specific code.
2364          */
2365         ret = postcopy_send_discard_bm_ram(ms, block);
2366         postcopy_discard_send_finish(ms);
2367         if (ret) {
2368             return ret;
2369         }
2370     }
2371
2372     return 0;
2373 }
2374
2375 /**
2376  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2377  *
2378  * Helper for postcopy_chunk_hostpages; it's called twice to
2379  * canonicalize the two bitmaps, that are similar, but one is
2380  * inverted.
2381  *
2382  * Postcopy requires that all target pages in a hostpage are dirty or
2383  * clean, not a mix.  This function canonicalizes the bitmaps.
2384  *
2385  * @ms: current migration state
2386  * @block: block that contains the page we want to canonicalize
2387  */
2388 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2389 {
2390     RAMState *rs = ram_state;
2391     unsigned long *bitmap = block->bmap;
2392     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2393     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2394     unsigned long run_start;
2395
2396     if (block->page_size == TARGET_PAGE_SIZE) {
2397         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2398         return;
2399     }
2400
2401     /* Find a dirty page */
2402     run_start = find_next_bit(bitmap, pages, 0);
2403
2404     while (run_start < pages) {
2405
2406         /*
2407          * If the start of this run of pages is in the middle of a host
2408          * page, then we need to fixup this host page.
2409          */
2410         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2411             /* Find the end of this run */
2412             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2413             /*
2414              * If the end isn't at the start of a host page, then the
2415              * run doesn't finish at the end of a host page
2416              * and we need to discard.
2417              */
2418         }
2419
2420         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2421             unsigned long page;
2422             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2423                                                              host_ratio);
2424             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2425
2426             /* Clean up the bitmap */
2427             for (page = fixup_start_addr;
2428                  page < fixup_start_addr + host_ratio; page++) {
2429                 /*
2430                  * Remark them as dirty, updating the count for any pages
2431                  * that weren't previously dirty.
2432                  */
2433                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2434             }
2435         }
2436
2437         /* Find the next dirty page for the next iteration */
2438         run_start = find_next_bit(bitmap, pages, run_start);
2439     }
2440 }
2441
2442 /**
2443  * postcopy_chunk_hostpages: discard any partially sent host page
2444  *
2445  * Utility for the outgoing postcopy code.
2446  *
2447  * Discard any partially sent host-page size chunks, mark any partially
2448  * dirty host-page size chunks as all dirty.  In this case the host-page
2449  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2450  *
2451  * Returns zero on success
2452  *
2453  * @ms: current migration state
2454  * @block: block we want to work with
2455  */
2456 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2457 {
2458     postcopy_discard_send_init(ms, block->idstr);
2459
2460     /*
2461      * Ensure that all partially dirty host pages are made fully dirty.
2462      */
2463     postcopy_chunk_hostpages_pass(ms, block);
2464
2465     postcopy_discard_send_finish(ms);
2466     return 0;
2467 }
2468
2469 /**
2470  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2471  *
2472  * Returns zero on success
2473  *
2474  * Transmit the set of pages to be discarded after precopy to the target
2475  * these are pages that:
2476  *     a) Have been previously transmitted but are now dirty again
2477  *     b) Pages that have never been transmitted, this ensures that
2478  *        any pages on the destination that have been mapped by background
2479  *        tasks get discarded (transparent huge pages is the specific concern)
2480  * Hopefully this is pretty sparse
2481  *
2482  * @ms: current migration state
2483  */
2484 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2485 {
2486     RAMState *rs = ram_state;
2487     RAMBlock *block;
2488     int ret;
2489
2490     RCU_READ_LOCK_GUARD();
2491
2492     /* This should be our last sync, the src is now paused */
2493     migration_bitmap_sync(rs);
2494
2495     /* Easiest way to make sure we don't resume in the middle of a host-page */
2496     rs->last_seen_block = NULL;
2497     rs->last_sent_block = NULL;
2498     rs->last_page = 0;
2499
2500     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2501         /* Deal with TPS != HPS and huge pages */
2502         ret = postcopy_chunk_hostpages(ms, block);
2503         if (ret) {
2504             return ret;
2505         }
2506
2507 #ifdef DEBUG_POSTCOPY
2508         ram_debug_dump_bitmap(block->bmap, true,
2509                               block->used_length >> TARGET_PAGE_BITS);
2510 #endif
2511     }
2512     trace_ram_postcopy_send_discard_bitmap();
2513
2514     return postcopy_each_ram_send_discard(ms);
2515 }
2516
2517 /**
2518  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2519  *
2520  * Returns zero on success
2521  *
2522  * @rbname: name of the RAMBlock of the request. NULL means the
2523  *          same that last one.
2524  * @start: RAMBlock starting page
2525  * @length: RAMBlock size
2526  */
2527 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2528 {
2529     trace_ram_discard_range(rbname, start, length);
2530
2531     RCU_READ_LOCK_GUARD();
2532     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2533
2534     if (!rb) {
2535         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2536         return -1;
2537     }
2538
2539     /*
2540      * On source VM, we don't need to update the received bitmap since
2541      * we don't even have one.
2542      */
2543     if (rb->receivedmap) {
2544         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2545                      length >> qemu_target_page_bits());
2546     }
2547
2548     return ram_block_discard_range(rb, start, length);
2549 }
2550
2551 /*
2552  * For every allocation, we will try not to crash the VM if the
2553  * allocation failed.
2554  */
2555 static int xbzrle_init(void)
2556 {
2557     Error *local_err = NULL;
2558
2559     if (!migrate_use_xbzrle()) {
2560         return 0;
2561     }
2562
2563     XBZRLE_cache_lock();
2564
2565     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2566     if (!XBZRLE.zero_target_page) {
2567         error_report("%s: Error allocating zero page", __func__);
2568         goto err_out;
2569     }
2570
2571     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2572                               TARGET_PAGE_SIZE, &local_err);
2573     if (!XBZRLE.cache) {
2574         error_report_err(local_err);
2575         goto free_zero_page;
2576     }
2577
2578     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2579     if (!XBZRLE.encoded_buf) {
2580         error_report("%s: Error allocating encoded_buf", __func__);
2581         goto free_cache;
2582     }
2583
2584     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2585     if (!XBZRLE.current_buf) {
2586         error_report("%s: Error allocating current_buf", __func__);
2587         goto free_encoded_buf;
2588     }
2589
2590     /* We are all good */
2591     XBZRLE_cache_unlock();
2592     return 0;
2593
2594 free_encoded_buf:
2595     g_free(XBZRLE.encoded_buf);
2596     XBZRLE.encoded_buf = NULL;
2597 free_cache:
2598     cache_fini(XBZRLE.cache);
2599     XBZRLE.cache = NULL;
2600 free_zero_page:
2601     g_free(XBZRLE.zero_target_page);
2602     XBZRLE.zero_target_page = NULL;
2603 err_out:
2604     XBZRLE_cache_unlock();
2605     return -ENOMEM;
2606 }
2607
2608 static int ram_state_init(RAMState **rsp)
2609 {
2610     *rsp = g_try_new0(RAMState, 1);
2611
2612     if (!*rsp) {
2613         error_report("%s: Init ramstate fail", __func__);
2614         return -1;
2615     }
2616
2617     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2618     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2619     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2620
2621     /*
2622      * Count the total number of pages used by ram blocks not including any
2623      * gaps due to alignment or unplugs.
2624      * This must match with the initial values of dirty bitmap.
2625      */
2626     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2627     ram_state_reset(*rsp);
2628
2629     return 0;
2630 }
2631
2632 static void ram_list_init_bitmaps(void)
2633 {
2634     MigrationState *ms = migrate_get_current();
2635     RAMBlock *block;
2636     unsigned long pages;
2637     uint8_t shift;
2638
2639     /* Skip setting bitmap if there is no RAM */
2640     if (ram_bytes_total()) {
2641         shift = ms->clear_bitmap_shift;
2642         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2643             error_report("clear_bitmap_shift (%u) too big, using "
2644                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2645             shift = CLEAR_BITMAP_SHIFT_MAX;
2646         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2647             error_report("clear_bitmap_shift (%u) too small, using "
2648                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2649             shift = CLEAR_BITMAP_SHIFT_MIN;
2650         }
2651
2652         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2653             pages = block->max_length >> TARGET_PAGE_BITS;
2654             /*
2655              * The initial dirty bitmap for migration must be set with all
2656              * ones to make sure we'll migrate every guest RAM page to
2657              * destination.
2658              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2659              * new migration after a failed migration, ram_list.
2660              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2661              * guest memory.
2662              */
2663             block->bmap = bitmap_new(pages);
2664             bitmap_set(block->bmap, 0, pages);
2665             block->clear_bmap_shift = shift;
2666             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2667         }
2668     }
2669 }
2670
2671 static void ram_init_bitmaps(RAMState *rs)
2672 {
2673     /* For memory_global_dirty_log_start below.  */
2674     qemu_mutex_lock_iothread();
2675     qemu_mutex_lock_ramlist();
2676
2677     WITH_RCU_READ_LOCK_GUARD() {
2678         ram_list_init_bitmaps();
2679         /* We don't use dirty log with background snapshots */
2680         if (!migrate_background_snapshot()) {
2681             memory_global_dirty_log_start();
2682             migration_bitmap_sync_precopy(rs);
2683         }
2684     }
2685     qemu_mutex_unlock_ramlist();
2686     qemu_mutex_unlock_iothread();
2687 }
2688
2689 static int ram_init_all(RAMState **rsp)
2690 {
2691     if (ram_state_init(rsp)) {
2692         return -1;
2693     }
2694
2695     if (xbzrle_init()) {
2696         ram_state_cleanup(rsp);
2697         return -1;
2698     }
2699
2700     ram_init_bitmaps(*rsp);
2701
2702     return 0;
2703 }
2704
2705 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2706 {
2707     RAMBlock *block;
2708     uint64_t pages = 0;
2709
2710     /*
2711      * Postcopy is not using xbzrle/compression, so no need for that.
2712      * Also, since source are already halted, we don't need to care
2713      * about dirty page logging as well.
2714      */
2715
2716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717         pages += bitmap_count_one(block->bmap,
2718                                   block->used_length >> TARGET_PAGE_BITS);
2719     }
2720
2721     /* This may not be aligned with current bitmaps. Recalculate. */
2722     rs->migration_dirty_pages = pages;
2723
2724     ram_state_reset(rs);
2725
2726     /* Update RAMState cache of output QEMUFile */
2727     rs->f = out;
2728
2729     trace_ram_state_resume_prepare(pages);
2730 }
2731
2732 /*
2733  * This function clears bits of the free pages reported by the caller from the
2734  * migration dirty bitmap. @addr is the host address corresponding to the
2735  * start of the continuous guest free pages, and @len is the total bytes of
2736  * those pages.
2737  */
2738 void qemu_guest_free_page_hint(void *addr, size_t len)
2739 {
2740     RAMBlock *block;
2741     ram_addr_t offset;
2742     size_t used_len, start, npages;
2743     MigrationState *s = migrate_get_current();
2744
2745     /* This function is currently expected to be used during live migration */
2746     if (!migration_is_setup_or_active(s->state)) {
2747         return;
2748     }
2749
2750     for (; len > 0; len -= used_len, addr += used_len) {
2751         block = qemu_ram_block_from_host(addr, false, &offset);
2752         if (unlikely(!block || offset >= block->used_length)) {
2753             /*
2754              * The implementation might not support RAMBlock resize during
2755              * live migration, but it could happen in theory with future
2756              * updates. So we add a check here to capture that case.
2757              */
2758             error_report_once("%s unexpected error", __func__);
2759             return;
2760         }
2761
2762         if (len <= block->used_length - offset) {
2763             used_len = len;
2764         } else {
2765             used_len = block->used_length - offset;
2766         }
2767
2768         start = offset >> TARGET_PAGE_BITS;
2769         npages = used_len >> TARGET_PAGE_BITS;
2770
2771         qemu_mutex_lock(&ram_state->bitmap_mutex);
2772         /*
2773          * The skipped free pages are equavalent to be sent from clear_bmap's
2774          * perspective, so clear the bits from the memory region bitmap which
2775          * are initially set. Otherwise those skipped pages will be sent in
2776          * the next round after syncing from the memory region bitmap.
2777          */
2778         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2779         ram_state->migration_dirty_pages -=
2780                       bitmap_count_one_with_offset(block->bmap, start, npages);
2781         bitmap_clear(block->bmap, start, npages);
2782         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2783     }
2784 }
2785
2786 /*
2787  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2788  * long-running RCU critical section.  When rcu-reclaims in the code
2789  * start to become numerous it will be necessary to reduce the
2790  * granularity of these critical sections.
2791  */
2792
2793 /**
2794  * ram_save_setup: Setup RAM for migration
2795  *
2796  * Returns zero to indicate success and negative for error
2797  *
2798  * @f: QEMUFile where to send the data
2799  * @opaque: RAMState pointer
2800  */
2801 static int ram_save_setup(QEMUFile *f, void *opaque)
2802 {
2803     RAMState **rsp = opaque;
2804     RAMBlock *block;
2805
2806     if (compress_threads_save_setup()) {
2807         return -1;
2808     }
2809
2810     /* migration has already setup the bitmap, reuse it. */
2811     if (!migration_in_colo_state()) {
2812         if (ram_init_all(rsp) != 0) {
2813             compress_threads_save_cleanup();
2814             return -1;
2815         }
2816     }
2817     (*rsp)->f = f;
2818
2819     WITH_RCU_READ_LOCK_GUARD() {
2820         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2821
2822         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2823             qemu_put_byte(f, strlen(block->idstr));
2824             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2825             qemu_put_be64(f, block->used_length);
2826             if (migrate_postcopy_ram() && block->page_size !=
2827                                           qemu_host_page_size) {
2828                 qemu_put_be64(f, block->page_size);
2829             }
2830             if (migrate_ignore_shared()) {
2831                 qemu_put_be64(f, block->mr->addr);
2832             }
2833         }
2834     }
2835
2836     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2837     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2838
2839     multifd_send_sync_main(f);
2840     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2841     qemu_fflush(f);
2842
2843     return 0;
2844 }
2845
2846 /**
2847  * ram_save_iterate: iterative stage for migration
2848  *
2849  * Returns zero to indicate success and negative for error
2850  *
2851  * @f: QEMUFile where to send the data
2852  * @opaque: RAMState pointer
2853  */
2854 static int ram_save_iterate(QEMUFile *f, void *opaque)
2855 {
2856     RAMState **temp = opaque;
2857     RAMState *rs = *temp;
2858     int ret = 0;
2859     int i;
2860     int64_t t0;
2861     int done = 0;
2862
2863     if (blk_mig_bulk_active()) {
2864         /* Avoid transferring ram during bulk phase of block migration as
2865          * the bulk phase will usually take a long time and transferring
2866          * ram updates during that time is pointless. */
2867         goto out;
2868     }
2869
2870     /*
2871      * We'll take this lock a little bit long, but it's okay for two reasons.
2872      * Firstly, the only possible other thread to take it is who calls
2873      * qemu_guest_free_page_hint(), which should be rare; secondly, see
2874      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2875      * guarantees that we'll at least released it in a regular basis.
2876      */
2877     qemu_mutex_lock(&rs->bitmap_mutex);
2878     WITH_RCU_READ_LOCK_GUARD() {
2879         if (ram_list.version != rs->last_version) {
2880             ram_state_reset(rs);
2881         }
2882
2883         /* Read version before ram_list.blocks */
2884         smp_rmb();
2885
2886         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2887
2888         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2889         i = 0;
2890         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2891                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2892             int pages;
2893
2894             if (qemu_file_get_error(f)) {
2895                 break;
2896             }
2897
2898             pages = ram_find_and_save_block(rs, false);
2899             /* no more pages to sent */
2900             if (pages == 0) {
2901                 done = 1;
2902                 break;
2903             }
2904
2905             if (pages < 0) {
2906                 qemu_file_set_error(f, pages);
2907                 break;
2908             }
2909
2910             rs->target_page_count += pages;
2911
2912             /*
2913              * During postcopy, it is necessary to make sure one whole host
2914              * page is sent in one chunk.
2915              */
2916             if (migrate_postcopy_ram()) {
2917                 flush_compressed_data(rs);
2918             }
2919
2920             /*
2921              * we want to check in the 1st loop, just in case it was the 1st
2922              * time and we had to sync the dirty bitmap.
2923              * qemu_clock_get_ns() is a bit expensive, so we only check each
2924              * some iterations
2925              */
2926             if ((i & 63) == 0) {
2927                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2928                               1000000;
2929                 if (t1 > MAX_WAIT) {
2930                     trace_ram_save_iterate_big_wait(t1, i);
2931                     break;
2932                 }
2933             }
2934             i++;
2935         }
2936     }
2937     qemu_mutex_unlock(&rs->bitmap_mutex);
2938
2939     /*
2940      * Must occur before EOS (or any QEMUFile operation)
2941      * because of RDMA protocol.
2942      */
2943     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2944
2945 out:
2946     if (ret >= 0
2947         && migration_is_setup_or_active(migrate_get_current()->state)) {
2948         multifd_send_sync_main(rs->f);
2949         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2950         qemu_fflush(f);
2951         ram_counters.transferred += 8;
2952
2953         ret = qemu_file_get_error(f);
2954     }
2955     if (ret < 0) {
2956         return ret;
2957     }
2958
2959     return done;
2960 }
2961
2962 /**
2963  * ram_save_complete: function called to send the remaining amount of ram
2964  *
2965  * Returns zero to indicate success or negative on error
2966  *
2967  * Called with iothread lock
2968  *
2969  * @f: QEMUFile where to send the data
2970  * @opaque: RAMState pointer
2971  */
2972 static int ram_save_complete(QEMUFile *f, void *opaque)
2973 {
2974     RAMState **temp = opaque;
2975     RAMState *rs = *temp;
2976     int ret = 0;
2977
2978     WITH_RCU_READ_LOCK_GUARD() {
2979         if (!migration_in_postcopy()) {
2980             migration_bitmap_sync_precopy(rs);
2981         }
2982
2983         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2984
2985         /* try transferring iterative blocks of memory */
2986
2987         /* flush all remaining blocks regardless of rate limiting */
2988         while (true) {
2989             int pages;
2990
2991             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2992             /* no more blocks to sent */
2993             if (pages == 0) {
2994                 break;
2995             }
2996             if (pages < 0) {
2997                 ret = pages;
2998                 break;
2999             }
3000         }
3001
3002         flush_compressed_data(rs);
3003         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3004     }
3005
3006     if (ret >= 0) {
3007         multifd_send_sync_main(rs->f);
3008         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3009         qemu_fflush(f);
3010     }
3011
3012     return ret;
3013 }
3014
3015 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3016                              uint64_t *res_precopy_only,
3017                              uint64_t *res_compatible,
3018                              uint64_t *res_postcopy_only)
3019 {
3020     RAMState **temp = opaque;
3021     RAMState *rs = *temp;
3022     uint64_t remaining_size;
3023
3024     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3025
3026     if (!migration_in_postcopy() &&
3027         remaining_size < max_size) {
3028         qemu_mutex_lock_iothread();
3029         WITH_RCU_READ_LOCK_GUARD() {
3030             migration_bitmap_sync_precopy(rs);
3031         }
3032         qemu_mutex_unlock_iothread();
3033         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3034     }
3035
3036     if (migrate_postcopy_ram()) {
3037         /* We can do postcopy, and all the data is postcopiable */
3038         *res_compatible += remaining_size;
3039     } else {
3040         *res_precopy_only += remaining_size;
3041     }
3042 }
3043
3044 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3045 {
3046     unsigned int xh_len;
3047     int xh_flags;
3048     uint8_t *loaded_data;
3049
3050     /* extract RLE header */
3051     xh_flags = qemu_get_byte(f);
3052     xh_len = qemu_get_be16(f);
3053
3054     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3055         error_report("Failed to load XBZRLE page - wrong compression!");
3056         return -1;
3057     }
3058
3059     if (xh_len > TARGET_PAGE_SIZE) {
3060         error_report("Failed to load XBZRLE page - len overflow!");
3061         return -1;
3062     }
3063     loaded_data = XBZRLE.decoded_buf;
3064     /* load data and decode */
3065     /* it can change loaded_data to point to an internal buffer */
3066     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3067
3068     /* decode RLE */
3069     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3070                              TARGET_PAGE_SIZE) == -1) {
3071         error_report("Failed to load XBZRLE page - decode error!");
3072         return -1;
3073     }
3074
3075     return 0;
3076 }
3077
3078 /**
3079  * ram_block_from_stream: read a RAMBlock id from the migration stream
3080  *
3081  * Must be called from within a rcu critical section.
3082  *
3083  * Returns a pointer from within the RCU-protected ram_list.
3084  *
3085  * @f: QEMUFile where to read the data from
3086  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3087  */
3088 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3089 {
3090     static RAMBlock *block;
3091     char id[256];
3092     uint8_t len;
3093
3094     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3095         if (!block) {
3096             error_report("Ack, bad migration stream!");
3097             return NULL;
3098         }
3099         return block;
3100     }
3101
3102     len = qemu_get_byte(f);
3103     qemu_get_buffer(f, (uint8_t *)id, len);
3104     id[len] = 0;
3105
3106     block = qemu_ram_block_by_name(id);
3107     if (!block) {
3108         error_report("Can't find block %s", id);
3109         return NULL;
3110     }
3111
3112     if (ramblock_is_ignored(block)) {
3113         error_report("block %s should not be migrated !", id);
3114         return NULL;
3115     }
3116
3117     return block;
3118 }
3119
3120 static inline void *host_from_ram_block_offset(RAMBlock *block,
3121                                                ram_addr_t offset)
3122 {
3123     if (!offset_in_ramblock(block, offset)) {
3124         return NULL;
3125     }
3126
3127     return block->host + offset;
3128 }
3129
3130 static void *host_page_from_ram_block_offset(RAMBlock *block,
3131                                              ram_addr_t offset)
3132 {
3133     /* Note: Explicitly no check against offset_in_ramblock(). */
3134     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3135                                    block->page_size);
3136 }
3137
3138 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3139                                                          ram_addr_t offset)
3140 {
3141     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3142 }
3143
3144 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3145                              ram_addr_t offset, bool record_bitmap)
3146 {
3147     if (!offset_in_ramblock(block, offset)) {
3148         return NULL;
3149     }
3150     if (!block->colo_cache) {
3151         error_report("%s: colo_cache is NULL in block :%s",
3152                      __func__, block->idstr);
3153         return NULL;
3154     }
3155
3156     /*
3157     * During colo checkpoint, we need bitmap of these migrated pages.
3158     * It help us to decide which pages in ram cache should be flushed
3159     * into VM's RAM later.
3160     */
3161     if (record_bitmap &&
3162         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3163         ram_state->migration_dirty_pages++;
3164     }
3165     return block->colo_cache + offset;
3166 }
3167
3168 /**
3169  * ram_handle_compressed: handle the zero page case
3170  *
3171  * If a page (or a whole RDMA chunk) has been
3172  * determined to be zero, then zap it.
3173  *
3174  * @host: host address for the zero page
3175  * @ch: what the page is filled from.  We only support zero
3176  * @size: size of the zero page
3177  */
3178 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3179 {
3180     if (ch != 0 || !is_zero_range(host, size)) {
3181         memset(host, ch, size);
3182     }
3183 }
3184
3185 /* return the size after decompression, or negative value on error */
3186 static int
3187 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3188                      const uint8_t *source, size_t source_len)
3189 {
3190     int err;
3191
3192     err = inflateReset(stream);
3193     if (err != Z_OK) {
3194         return -1;
3195     }
3196
3197     stream->avail_in = source_len;
3198     stream->next_in = (uint8_t *)source;
3199     stream->avail_out = dest_len;
3200     stream->next_out = dest;
3201
3202     err = inflate(stream, Z_NO_FLUSH);
3203     if (err != Z_STREAM_END) {
3204         return -1;
3205     }
3206
3207     return stream->total_out;
3208 }
3209
3210 static void *do_data_decompress(void *opaque)
3211 {
3212     DecompressParam *param = opaque;
3213     unsigned long pagesize;
3214     uint8_t *des;
3215     int len, ret;
3216
3217     qemu_mutex_lock(&param->mutex);
3218     while (!param->quit) {
3219         if (param->des) {
3220             des = param->des;
3221             len = param->len;
3222             param->des = 0;
3223             qemu_mutex_unlock(&param->mutex);
3224
3225             pagesize = TARGET_PAGE_SIZE;
3226
3227             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3228                                        param->compbuf, len);
3229             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3230                 error_report("decompress data failed");
3231                 qemu_file_set_error(decomp_file, ret);
3232             }
3233
3234             qemu_mutex_lock(&decomp_done_lock);
3235             param->done = true;
3236             qemu_cond_signal(&decomp_done_cond);
3237             qemu_mutex_unlock(&decomp_done_lock);
3238
3239             qemu_mutex_lock(&param->mutex);
3240         } else {
3241             qemu_cond_wait(&param->cond, &param->mutex);
3242         }
3243     }
3244     qemu_mutex_unlock(&param->mutex);
3245
3246     return NULL;
3247 }
3248
3249 static int wait_for_decompress_done(void)
3250 {
3251     int idx, thread_count;
3252
3253     if (!migrate_use_compression()) {
3254         return 0;
3255     }
3256
3257     thread_count = migrate_decompress_threads();
3258     qemu_mutex_lock(&decomp_done_lock);
3259     for (idx = 0; idx < thread_count; idx++) {
3260         while (!decomp_param[idx].done) {
3261             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3262         }
3263     }
3264     qemu_mutex_unlock(&decomp_done_lock);
3265     return qemu_file_get_error(decomp_file);
3266 }
3267
3268 static void compress_threads_load_cleanup(void)
3269 {
3270     int i, thread_count;
3271
3272     if (!migrate_use_compression()) {
3273         return;
3274     }
3275     thread_count = migrate_decompress_threads();
3276     for (i = 0; i < thread_count; i++) {
3277         /*
3278          * we use it as a indicator which shows if the thread is
3279          * properly init'd or not
3280          */
3281         if (!decomp_param[i].compbuf) {
3282             break;
3283         }
3284
3285         qemu_mutex_lock(&decomp_param[i].mutex);
3286         decomp_param[i].quit = true;
3287         qemu_cond_signal(&decomp_param[i].cond);
3288         qemu_mutex_unlock(&decomp_param[i].mutex);
3289     }
3290     for (i = 0; i < thread_count; i++) {
3291         if (!decomp_param[i].compbuf) {
3292             break;
3293         }
3294
3295         qemu_thread_join(decompress_threads + i);
3296         qemu_mutex_destroy(&decomp_param[i].mutex);
3297         qemu_cond_destroy(&decomp_param[i].cond);
3298         inflateEnd(&decomp_param[i].stream);
3299         g_free(decomp_param[i].compbuf);
3300         decomp_param[i].compbuf = NULL;
3301     }
3302     g_free(decompress_threads);
3303     g_free(decomp_param);
3304     decompress_threads = NULL;
3305     decomp_param = NULL;
3306     decomp_file = NULL;
3307 }
3308
3309 static int compress_threads_load_setup(QEMUFile *f)
3310 {
3311     int i, thread_count;
3312
3313     if (!migrate_use_compression()) {
3314         return 0;
3315     }
3316
3317     thread_count = migrate_decompress_threads();
3318     decompress_threads = g_new0(QemuThread, thread_count);
3319     decomp_param = g_new0(DecompressParam, thread_count);
3320     qemu_mutex_init(&decomp_done_lock);
3321     qemu_cond_init(&decomp_done_cond);
3322     decomp_file = f;
3323     for (i = 0; i < thread_count; i++) {
3324         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3325             goto exit;
3326         }
3327
3328         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3329         qemu_mutex_init(&decomp_param[i].mutex);
3330         qemu_cond_init(&decomp_param[i].cond);
3331         decomp_param[i].done = true;
3332         decomp_param[i].quit = false;
3333         qemu_thread_create(decompress_threads + i, "decompress",
3334                            do_data_decompress, decomp_param + i,
3335                            QEMU_THREAD_JOINABLE);
3336     }
3337     return 0;
3338 exit:
3339     compress_threads_load_cleanup();
3340     return -1;
3341 }
3342
3343 static void decompress_data_with_multi_threads(QEMUFile *f,
3344                                                void *host, int len)
3345 {
3346     int idx, thread_count;
3347
3348     thread_count = migrate_decompress_threads();
3349     QEMU_LOCK_GUARD(&decomp_done_lock);
3350     while (true) {
3351         for (idx = 0; idx < thread_count; idx++) {
3352             if (decomp_param[idx].done) {
3353                 decomp_param[idx].done = false;
3354                 qemu_mutex_lock(&decomp_param[idx].mutex);
3355                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3356                 decomp_param[idx].des = host;
3357                 decomp_param[idx].len = len;
3358                 qemu_cond_signal(&decomp_param[idx].cond);
3359                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3360                 break;
3361             }
3362         }
3363         if (idx < thread_count) {
3364             break;
3365         } else {
3366             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3367         }
3368     }
3369 }
3370
3371 static void colo_init_ram_state(void)
3372 {
3373     ram_state_init(&ram_state);
3374 }
3375
3376 /*
3377  * colo cache: this is for secondary VM, we cache the whole
3378  * memory of the secondary VM, it is need to hold the global lock
3379  * to call this helper.
3380  */
3381 int colo_init_ram_cache(void)
3382 {
3383     RAMBlock *block;
3384
3385     WITH_RCU_READ_LOCK_GUARD() {
3386         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3387             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3388                                                     NULL, false, false);
3389             if (!block->colo_cache) {
3390                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3391                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3392                              block->used_length);
3393                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3394                     if (block->colo_cache) {
3395                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3396                         block->colo_cache = NULL;
3397                     }
3398                 }
3399                 return -errno;
3400             }
3401         }
3402     }
3403
3404     /*
3405     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3406     * with to decide which page in cache should be flushed into SVM's RAM. Here
3407     * we use the same name 'ram_bitmap' as for migration.
3408     */
3409     if (ram_bytes_total()) {
3410         RAMBlock *block;
3411
3412         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3413             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3414             block->bmap = bitmap_new(pages);
3415         }
3416     }
3417
3418     colo_init_ram_state();
3419     return 0;
3420 }
3421
3422 /* TODO: duplicated with ram_init_bitmaps */
3423 void colo_incoming_start_dirty_log(void)
3424 {
3425     RAMBlock *block = NULL;
3426     /* For memory_global_dirty_log_start below. */
3427     qemu_mutex_lock_iothread();
3428     qemu_mutex_lock_ramlist();
3429
3430     memory_global_dirty_log_sync();
3431     WITH_RCU_READ_LOCK_GUARD() {
3432         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3433             ramblock_sync_dirty_bitmap(ram_state, block);
3434             /* Discard this dirty bitmap record */
3435             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3436         }
3437         memory_global_dirty_log_start();
3438     }
3439     ram_state->migration_dirty_pages = 0;
3440     qemu_mutex_unlock_ramlist();
3441     qemu_mutex_unlock_iothread();
3442 }
3443
3444 /* It is need to hold the global lock to call this helper */
3445 void colo_release_ram_cache(void)
3446 {
3447     RAMBlock *block;
3448
3449     memory_global_dirty_log_stop();
3450     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3451         g_free(block->bmap);
3452         block->bmap = NULL;
3453     }
3454
3455     WITH_RCU_READ_LOCK_GUARD() {
3456         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3457             if (block->colo_cache) {
3458                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3459                 block->colo_cache = NULL;
3460             }
3461         }
3462     }
3463     ram_state_cleanup(&ram_state);
3464 }
3465
3466 /**
3467  * ram_load_setup: Setup RAM for migration incoming side
3468  *
3469  * Returns zero to indicate success and negative for error
3470  *
3471  * @f: QEMUFile where to receive the data
3472  * @opaque: RAMState pointer
3473  */
3474 static int ram_load_setup(QEMUFile *f, void *opaque)
3475 {
3476     if (compress_threads_load_setup(f)) {
3477         return -1;
3478     }
3479
3480     xbzrle_load_setup();
3481     ramblock_recv_map_init();
3482
3483     return 0;
3484 }
3485
3486 static int ram_load_cleanup(void *opaque)
3487 {
3488     RAMBlock *rb;
3489
3490     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3491         qemu_ram_block_writeback(rb);
3492     }
3493
3494     xbzrle_load_cleanup();
3495     compress_threads_load_cleanup();
3496
3497     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3498         g_free(rb->receivedmap);
3499         rb->receivedmap = NULL;
3500     }
3501
3502     return 0;
3503 }
3504
3505 /**
3506  * ram_postcopy_incoming_init: allocate postcopy data structures
3507  *
3508  * Returns 0 for success and negative if there was one error
3509  *
3510  * @mis: current migration incoming state
3511  *
3512  * Allocate data structures etc needed by incoming migration with
3513  * postcopy-ram. postcopy-ram's similarly names
3514  * postcopy_ram_incoming_init does the work.
3515  */
3516 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3517 {
3518     return postcopy_ram_incoming_init(mis);
3519 }
3520
3521 /**
3522  * ram_load_postcopy: load a page in postcopy case
3523  *
3524  * Returns 0 for success or -errno in case of error
3525  *
3526  * Called in postcopy mode by ram_load().
3527  * rcu_read_lock is taken prior to this being called.
3528  *
3529  * @f: QEMUFile where to send the data
3530  */
3531 static int ram_load_postcopy(QEMUFile *f)
3532 {
3533     int flags = 0, ret = 0;
3534     bool place_needed = false;
3535     bool matches_target_page_size = false;
3536     MigrationIncomingState *mis = migration_incoming_get_current();
3537     /* Temporary page that is later 'placed' */
3538     void *postcopy_host_page = mis->postcopy_tmp_page;
3539     void *host_page = NULL;
3540     bool all_zero = true;
3541     int target_pages = 0;
3542
3543     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3544         ram_addr_t addr;
3545         void *page_buffer = NULL;
3546         void *place_source = NULL;
3547         RAMBlock *block = NULL;
3548         uint8_t ch;
3549         int len;
3550
3551         addr = qemu_get_be64(f);
3552
3553         /*
3554          * If qemu file error, we should stop here, and then "addr"
3555          * may be invalid
3556          */
3557         ret = qemu_file_get_error(f);
3558         if (ret) {
3559             break;
3560         }
3561
3562         flags = addr & ~TARGET_PAGE_MASK;
3563         addr &= TARGET_PAGE_MASK;
3564
3565         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3566         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3567                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3568             block = ram_block_from_stream(f, flags);
3569             if (!block) {
3570                 ret = -EINVAL;
3571                 break;
3572             }
3573
3574             /*
3575              * Relying on used_length is racy and can result in false positives.
3576              * We might place pages beyond used_length in case RAM was shrunk
3577              * while in postcopy, which is fine - trying to place via
3578              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3579              */
3580             if (!block->host || addr >= block->postcopy_length) {
3581                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3582                 ret = -EINVAL;
3583                 break;
3584             }
3585             target_pages++;
3586             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3587             /*
3588              * Postcopy requires that we place whole host pages atomically;
3589              * these may be huge pages for RAMBlocks that are backed by
3590              * hugetlbfs.
3591              * To make it atomic, the data is read into a temporary page
3592              * that's moved into place later.
3593              * The migration protocol uses,  possibly smaller, target-pages
3594              * however the source ensures it always sends all the components
3595              * of a host page in one chunk.
3596              */
3597             page_buffer = postcopy_host_page +
3598                           host_page_offset_from_ram_block_offset(block, addr);
3599             /* If all TP are zero then we can optimise the place */
3600             if (target_pages == 1) {
3601                 host_page = host_page_from_ram_block_offset(block, addr);
3602             } else if (host_page != host_page_from_ram_block_offset(block,
3603                                                                     addr)) {
3604                 /* not the 1st TP within the HP */
3605                 error_report("Non-same host page %p/%p", host_page,
3606                              host_page_from_ram_block_offset(block, addr));
3607                 ret = -EINVAL;
3608                 break;
3609             }
3610
3611             /*
3612              * If it's the last part of a host page then we place the host
3613              * page
3614              */
3615             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3616                 place_needed = true;
3617             }
3618             place_source = postcopy_host_page;
3619         }
3620
3621         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3622         case RAM_SAVE_FLAG_ZERO:
3623             ch = qemu_get_byte(f);
3624             /*
3625              * Can skip to set page_buffer when
3626              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3627              */
3628             if (ch || !matches_target_page_size) {
3629                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3630             }
3631             if (ch) {
3632                 all_zero = false;
3633             }
3634             break;
3635
3636         case RAM_SAVE_FLAG_PAGE:
3637             all_zero = false;
3638             if (!matches_target_page_size) {
3639                 /* For huge pages, we always use temporary buffer */
3640                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3641             } else {
3642                 /*
3643                  * For small pages that matches target page size, we
3644                  * avoid the qemu_file copy.  Instead we directly use
3645                  * the buffer of QEMUFile to place the page.  Note: we
3646                  * cannot do any QEMUFile operation before using that
3647                  * buffer to make sure the buffer is valid when
3648                  * placing the page.
3649                  */
3650                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3651                                          TARGET_PAGE_SIZE);
3652             }
3653             break;
3654         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3655             all_zero = false;
3656             len = qemu_get_be32(f);
3657             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3658                 error_report("Invalid compressed data length: %d", len);
3659                 ret = -EINVAL;
3660                 break;
3661             }
3662             decompress_data_with_multi_threads(f, page_buffer, len);
3663             break;
3664
3665         case RAM_SAVE_FLAG_EOS:
3666             /* normal exit */
3667             multifd_recv_sync_main();
3668             break;
3669         default:
3670             error_report("Unknown combination of migration flags: 0x%x"
3671                          " (postcopy mode)", flags);
3672             ret = -EINVAL;
3673             break;
3674         }
3675
3676         /* Got the whole host page, wait for decompress before placing. */
3677         if (place_needed) {
3678             ret |= wait_for_decompress_done();
3679         }
3680
3681         /* Detect for any possible file errors */
3682         if (!ret && qemu_file_get_error(f)) {
3683             ret = qemu_file_get_error(f);
3684         }
3685
3686         if (!ret && place_needed) {
3687             if (all_zero) {
3688                 ret = postcopy_place_page_zero(mis, host_page, block);
3689             } else {
3690                 ret = postcopy_place_page(mis, host_page, place_source,
3691                                           block);
3692             }
3693             place_needed = false;
3694             target_pages = 0;
3695             /* Assume we have a zero page until we detect something different */
3696             all_zero = true;
3697         }
3698     }
3699
3700     return ret;
3701 }
3702
3703 static bool postcopy_is_advised(void)
3704 {
3705     PostcopyState ps = postcopy_state_get();
3706     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3707 }
3708
3709 static bool postcopy_is_running(void)
3710 {
3711     PostcopyState ps = postcopy_state_get();
3712     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3713 }
3714
3715 /*
3716  * Flush content of RAM cache into SVM's memory.
3717  * Only flush the pages that be dirtied by PVM or SVM or both.
3718  */
3719 void colo_flush_ram_cache(void)
3720 {
3721     RAMBlock *block = NULL;
3722     void *dst_host;
3723     void *src_host;
3724     unsigned long offset = 0;
3725
3726     memory_global_dirty_log_sync();
3727     qemu_mutex_lock(&ram_state->bitmap_mutex);
3728     WITH_RCU_READ_LOCK_GUARD() {
3729         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3730             ramblock_sync_dirty_bitmap(ram_state, block);
3731         }
3732     }
3733
3734     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3735     WITH_RCU_READ_LOCK_GUARD() {
3736         block = QLIST_FIRST_RCU(&ram_list.blocks);
3737
3738         while (block) {
3739             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3740
3741             if (!offset_in_ramblock(block,
3742                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3743                 offset = 0;
3744                 block = QLIST_NEXT_RCU(block, next);
3745             } else {
3746                 migration_bitmap_clear_dirty(ram_state, block, offset);
3747                 dst_host = block->host
3748                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3749                 src_host = block->colo_cache
3750                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3751                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3752             }
3753         }
3754     }
3755     trace_colo_flush_ram_cache_end();
3756     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3757 }
3758
3759 /**
3760  * ram_load_precopy: load pages in precopy case
3761  *
3762  * Returns 0 for success or -errno in case of error
3763  *
3764  * Called in precopy mode by ram_load().
3765  * rcu_read_lock is taken prior to this being called.
3766  *
3767  * @f: QEMUFile where to send the data
3768  */
3769 static int ram_load_precopy(QEMUFile *f)
3770 {
3771     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3772     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3773     bool postcopy_advised = postcopy_is_advised();
3774     if (!migrate_use_compression()) {
3775         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3776     }
3777
3778     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3779         ram_addr_t addr, total_ram_bytes;
3780         void *host = NULL, *host_bak = NULL;
3781         uint8_t ch;
3782
3783         /*
3784          * Yield periodically to let main loop run, but an iteration of
3785          * the main loop is expensive, so do it each some iterations
3786          */
3787         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3788             aio_co_schedule(qemu_get_current_aio_context(),
3789                             qemu_coroutine_self());
3790             qemu_coroutine_yield();
3791         }
3792         i++;
3793
3794         addr = qemu_get_be64(f);
3795         flags = addr & ~TARGET_PAGE_MASK;
3796         addr &= TARGET_PAGE_MASK;
3797
3798         if (flags & invalid_flags) {
3799             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3800                 error_report("Received an unexpected compressed page");
3801             }
3802
3803             ret = -EINVAL;
3804             break;
3805         }
3806
3807         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3808                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3809             RAMBlock *block = ram_block_from_stream(f, flags);
3810
3811             host = host_from_ram_block_offset(block, addr);
3812             /*
3813              * After going into COLO stage, we should not load the page
3814              * into SVM's memory directly, we put them into colo_cache firstly.
3815              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3816              * Previously, we copied all these memory in preparing stage of COLO
3817              * while we need to stop VM, which is a time-consuming process.
3818              * Here we optimize it by a trick, back-up every page while in
3819              * migration process while COLO is enabled, though it affects the
3820              * speed of the migration, but it obviously reduce the downtime of
3821              * back-up all SVM'S memory in COLO preparing stage.
3822              */
3823             if (migration_incoming_colo_enabled()) {
3824                 if (migration_incoming_in_colo_state()) {
3825                     /* In COLO stage, put all pages into cache temporarily */
3826                     host = colo_cache_from_block_offset(block, addr, true);
3827                 } else {
3828                    /*
3829                     * In migration stage but before COLO stage,
3830                     * Put all pages into both cache and SVM's memory.
3831                     */
3832                     host_bak = colo_cache_from_block_offset(block, addr, false);
3833                 }
3834             }
3835             if (!host) {
3836                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3837                 ret = -EINVAL;
3838                 break;
3839             }
3840             if (!migration_incoming_in_colo_state()) {
3841                 ramblock_recv_bitmap_set(block, host);
3842             }
3843
3844             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3845         }
3846
3847         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3848         case RAM_SAVE_FLAG_MEM_SIZE:
3849             /* Synchronize RAM block list */
3850             total_ram_bytes = addr;
3851             while (!ret && total_ram_bytes) {
3852                 RAMBlock *block;
3853                 char id[256];
3854                 ram_addr_t length;
3855
3856                 len = qemu_get_byte(f);
3857                 qemu_get_buffer(f, (uint8_t *)id, len);
3858                 id[len] = 0;
3859                 length = qemu_get_be64(f);
3860
3861                 block = qemu_ram_block_by_name(id);
3862                 if (block && !qemu_ram_is_migratable(block)) {
3863                     error_report("block %s should not be migrated !", id);
3864                     ret = -EINVAL;
3865                 } else if (block) {
3866                     if (length != block->used_length) {
3867                         Error *local_err = NULL;
3868
3869                         ret = qemu_ram_resize(block, length,
3870                                               &local_err);
3871                         if (local_err) {
3872                             error_report_err(local_err);
3873                         }
3874                     }
3875                     /* For postcopy we need to check hugepage sizes match */
3876                     if (postcopy_advised && migrate_postcopy_ram() &&
3877                         block->page_size != qemu_host_page_size) {
3878                         uint64_t remote_page_size = qemu_get_be64(f);
3879                         if (remote_page_size != block->page_size) {
3880                             error_report("Mismatched RAM page size %s "
3881                                          "(local) %zd != %" PRId64,
3882                                          id, block->page_size,
3883                                          remote_page_size);
3884                             ret = -EINVAL;
3885                         }
3886                     }
3887                     if (migrate_ignore_shared()) {
3888                         hwaddr addr = qemu_get_be64(f);
3889                         if (ramblock_is_ignored(block) &&
3890                             block->mr->addr != addr) {
3891                             error_report("Mismatched GPAs for block %s "
3892                                          "%" PRId64 "!= %" PRId64,
3893                                          id, (uint64_t)addr,
3894                                          (uint64_t)block->mr->addr);
3895                             ret = -EINVAL;
3896                         }
3897                     }
3898                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3899                                           block->idstr);
3900                 } else {
3901                     error_report("Unknown ramblock \"%s\", cannot "
3902                                  "accept migration", id);
3903                     ret = -EINVAL;
3904                 }
3905
3906                 total_ram_bytes -= length;
3907             }
3908             break;
3909
3910         case RAM_SAVE_FLAG_ZERO:
3911             ch = qemu_get_byte(f);
3912             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3913             break;
3914
3915         case RAM_SAVE_FLAG_PAGE:
3916             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3917             break;
3918
3919         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3920             len = qemu_get_be32(f);
3921             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3922                 error_report("Invalid compressed data length: %d", len);
3923                 ret = -EINVAL;
3924                 break;
3925             }
3926             decompress_data_with_multi_threads(f, host, len);
3927             break;
3928
3929         case RAM_SAVE_FLAG_XBZRLE:
3930             if (load_xbzrle(f, addr, host) < 0) {
3931                 error_report("Failed to decompress XBZRLE page at "
3932                              RAM_ADDR_FMT, addr);
3933                 ret = -EINVAL;
3934                 break;
3935             }
3936             break;
3937         case RAM_SAVE_FLAG_EOS:
3938             /* normal exit */
3939             multifd_recv_sync_main();
3940             break;
3941         default:
3942             if (flags & RAM_SAVE_FLAG_HOOK) {
3943                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3944             } else {
3945                 error_report("Unknown combination of migration flags: 0x%x",
3946                              flags);
3947                 ret = -EINVAL;
3948             }
3949         }
3950         if (!ret) {
3951             ret = qemu_file_get_error(f);
3952         }
3953         if (!ret && host_bak) {
3954             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3955         }
3956     }
3957
3958     ret |= wait_for_decompress_done();
3959     return ret;
3960 }
3961
3962 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3963 {
3964     int ret = 0;
3965     static uint64_t seq_iter;
3966     /*
3967      * If system is running in postcopy mode, page inserts to host memory must
3968      * be atomic
3969      */
3970     bool postcopy_running = postcopy_is_running();
3971
3972     seq_iter++;
3973
3974     if (version_id != 4) {
3975         return -EINVAL;
3976     }
3977
3978     /*
3979      * This RCU critical section can be very long running.
3980      * When RCU reclaims in the code start to become numerous,
3981      * it will be necessary to reduce the granularity of this
3982      * critical section.
3983      */
3984     WITH_RCU_READ_LOCK_GUARD() {
3985         if (postcopy_running) {
3986             ret = ram_load_postcopy(f);
3987         } else {
3988             ret = ram_load_precopy(f);
3989         }
3990     }
3991     trace_ram_load_complete(ret, seq_iter);
3992
3993     return ret;
3994 }
3995
3996 static bool ram_has_postcopy(void *opaque)
3997 {
3998     RAMBlock *rb;
3999     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4000         if (ramblock_is_pmem(rb)) {
4001             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4002                          "is not supported now!", rb->idstr, rb->host);
4003             return false;
4004         }
4005     }
4006
4007     return migrate_postcopy_ram();
4008 }
4009
4010 /* Sync all the dirty bitmap with destination VM.  */
4011 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4012 {
4013     RAMBlock *block;
4014     QEMUFile *file = s->to_dst_file;
4015     int ramblock_count = 0;
4016
4017     trace_ram_dirty_bitmap_sync_start();
4018
4019     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4020         qemu_savevm_send_recv_bitmap(file, block->idstr);
4021         trace_ram_dirty_bitmap_request(block->idstr);
4022         ramblock_count++;
4023     }
4024
4025     trace_ram_dirty_bitmap_sync_wait();
4026
4027     /* Wait until all the ramblocks' dirty bitmap synced */
4028     while (ramblock_count--) {
4029         qemu_sem_wait(&s->rp_state.rp_sem);
4030     }
4031
4032     trace_ram_dirty_bitmap_sync_complete();
4033
4034     return 0;
4035 }
4036
4037 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4038 {
4039     qemu_sem_post(&s->rp_state.rp_sem);
4040 }
4041
4042 /*
4043  * Read the received bitmap, revert it as the initial dirty bitmap.
4044  * This is only used when the postcopy migration is paused but wants
4045  * to resume from a middle point.
4046  */
4047 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4048 {
4049     int ret = -EINVAL;
4050     /* from_dst_file is always valid because we're within rp_thread */
4051     QEMUFile *file = s->rp_state.from_dst_file;
4052     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4053     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4054     uint64_t size, end_mark;
4055
4056     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4057
4058     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4059         error_report("%s: incorrect state %s", __func__,
4060                      MigrationStatus_str(s->state));
4061         return -EINVAL;
4062     }
4063
4064     /*
4065      * Note: see comments in ramblock_recv_bitmap_send() on why we
4066      * need the endianness conversion, and the paddings.
4067      */
4068     local_size = ROUND_UP(local_size, 8);
4069
4070     /* Add paddings */
4071     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4072
4073     size = qemu_get_be64(file);
4074
4075     /* The size of the bitmap should match with our ramblock */
4076     if (size != local_size) {
4077         error_report("%s: ramblock '%s' bitmap size mismatch "
4078                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4079                      block->idstr, size, local_size);
4080         ret = -EINVAL;
4081         goto out;
4082     }
4083
4084     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4085     end_mark = qemu_get_be64(file);
4086
4087     ret = qemu_file_get_error(file);
4088     if (ret || size != local_size) {
4089         error_report("%s: read bitmap failed for ramblock '%s': %d"
4090                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4091                      __func__, block->idstr, ret, local_size, size);
4092         ret = -EIO;
4093         goto out;
4094     }
4095
4096     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4097         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4098                      __func__, block->idstr, end_mark);
4099         ret = -EINVAL;
4100         goto out;
4101     }
4102
4103     /*
4104      * Endianness conversion. We are during postcopy (though paused).
4105      * The dirty bitmap won't change. We can directly modify it.
4106      */
4107     bitmap_from_le(block->bmap, le_bitmap, nbits);
4108
4109     /*
4110      * What we received is "received bitmap". Revert it as the initial
4111      * dirty bitmap for this ramblock.
4112      */
4113     bitmap_complement(block->bmap, block->bmap, nbits);
4114
4115     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4116
4117     /*
4118      * We succeeded to sync bitmap for current ramblock. If this is
4119      * the last one to sync, we need to notify the main send thread.
4120      */
4121     ram_dirty_bitmap_reload_notify(s);
4122
4123     ret = 0;
4124 out:
4125     g_free(le_bitmap);
4126     return ret;
4127 }
4128
4129 static int ram_resume_prepare(MigrationState *s, void *opaque)
4130 {
4131     RAMState *rs = *(RAMState **)opaque;
4132     int ret;
4133
4134     ret = ram_dirty_bitmap_sync_all(s, rs);
4135     if (ret) {
4136         return ret;
4137     }
4138
4139     ram_state_resume_prepare(rs, s->to_dst_file);
4140
4141     return 0;
4142 }
4143
4144 static SaveVMHandlers savevm_ram_handlers = {
4145     .save_setup = ram_save_setup,
4146     .save_live_iterate = ram_save_iterate,
4147     .save_live_complete_postcopy = ram_save_complete,
4148     .save_live_complete_precopy = ram_save_complete,
4149     .has_postcopy = ram_has_postcopy,
4150     .save_live_pending = ram_save_pending,
4151     .load_state = ram_load,
4152     .save_cleanup = ram_save_cleanup,
4153     .load_setup = ram_load_setup,
4154     .load_cleanup = ram_load_cleanup,
4155     .resume_prepare = ram_resume_prepare,
4156 };
4157
4158 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4159                                       size_t old_size, size_t new_size)
4160 {
4161     PostcopyState ps = postcopy_state_get();
4162     ram_addr_t offset;
4163     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4164     Error *err = NULL;
4165
4166     if (ramblock_is_ignored(rb)) {
4167         return;
4168     }
4169
4170     if (!migration_is_idle()) {
4171         /*
4172          * Precopy code on the source cannot deal with the size of RAM blocks
4173          * changing at random points in time - especially after sending the
4174          * RAM block sizes in the migration stream, they must no longer change.
4175          * Abort and indicate a proper reason.
4176          */
4177         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4178         migrate_set_error(migrate_get_current(), err);
4179         error_free(err);
4180         migration_cancel();
4181     }
4182
4183     switch (ps) {
4184     case POSTCOPY_INCOMING_ADVISE:
4185         /*
4186          * Update what ram_postcopy_incoming_init()->init_range() does at the
4187          * time postcopy was advised. Syncing RAM blocks with the source will
4188          * result in RAM resizes.
4189          */
4190         if (old_size < new_size) {
4191             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4192                 error_report("RAM block '%s' discard of resized RAM failed",
4193                              rb->idstr);
4194             }
4195         }
4196         rb->postcopy_length = new_size;
4197         break;
4198     case POSTCOPY_INCOMING_NONE:
4199     case POSTCOPY_INCOMING_RUNNING:
4200     case POSTCOPY_INCOMING_END:
4201         /*
4202          * Once our guest is running, postcopy does no longer care about
4203          * resizes. When growing, the new memory was not available on the
4204          * source, no handler needed.
4205          */
4206         break;
4207     default:
4208         error_report("RAM block '%s' resized during postcopy state: %d",
4209                      rb->idstr, ps);
4210         exit(-1);
4211     }
4212 }
4213
4214 static RAMBlockNotifier ram_mig_ram_notifier = {
4215     .ram_block_resized = ram_mig_ram_block_resized,
4216 };
4217
4218 void ram_mig_init(void)
4219 {
4220     qemu_mutex_init(&XBZRLE.lock);
4221     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4222     ram_block_notifier_add(&ram_mig_ram_notifier);
4223 }