migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/main-loop.h"
  34 #include "xbzrle.h"
  35 #include "ram.h"
  36 #include "migration.h"
  37 #include "migration/register.h"
  38 #include "migration/misc.h"
  39 #include "qemu-file.h"
  40 #include "postcopy-ram.h"
  41 #include "page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "qapi/error.h"
  44 #include "qapi/qapi-types-migration.h"
  45 #include "qapi/qapi-events-migration.h"
  46 #include "qapi/qmp/qerror.h"
  47 #include "trace.h"
  48 #include "exec/ram_addr.h"
  49 #include "exec/target_page.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52 #include "block.h"
  53 #include "sysemu/cpu-throttle.h"
  54 #include "savevm.h"
  55 #include "qemu/iov.h"
  56 #include "multifd.h"
  57 #include "sysemu/runstate.h"
  58
  59 #if defined(__linux__)
  60 #include "qemu/userfaultfd.h"
  61 #endif /* defined(__linux__) */
  62
  63 /***********************************************************/
  64 /* ram save/restore */
  65
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was simply renamed.
 */

/*
 * Flag bits carried in the 64-bit word written at the start of a page
 * header (see save_page_header(), which ORs RAM_SAVE_FLAG_CONTINUE into
 * the offset it writes).
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same block as the previous header: the block id string is omitted */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Page payload is XBZRLE encoded (see save_xbzrle_page()) */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  81
  82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  83 {
  84     return buffer_is_zero(p, size);
  85 }
  86
/* XBZRLE statistics; save_xbzrle_page() updates its counters */
XBZRLECacheStats xbzrle_counters;

/*
 * State shared by the XBZRLE code: the page cache plus the scratch
 * buffers used while encoding/decoding a page.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (copy of the page being encoded) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken by XBZRLE_cache_lock() / released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 104
 105 static void XBZRLE_cache_lock(void)
 106 {
 107     if (migrate_use_xbzrle()) {
 108         qemu_mutex_lock(&XBZRLE.lock);
 109     }
 110 }
 111
 112 static void XBZRLE_cache_unlock(void)
 113 {
 114     if (migrate_use_xbzrle()) {
 115         qemu_mutex_unlock(&XBZRLE.lock);
 116     }
 117 }
 118
 119 /**
 120  * xbzrle_cache_resize: resize the xbzrle cache
 121  *
 122  * This function is called from migrate_params_apply in main
 123  * thread, possibly while a migration is in progress.  A running
 124  * migration may be using the cache and might finish during this call,
 125  * hence changes to the cache are protected by XBZRLE.lock().
 126  *
 127  * Returns 0 for success or -1 for error
 128  *
 129  * @new_size: new cache size
 130  * @errp: set *errp if the check failed, with reason
 131  */
 132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 133 {
 134     PageCache *new_cache;
 135     int64_t ret = 0;
 136
 137     /* Check for truncation */
 138     if (new_size != (size_t)new_size) {
 139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 140                    "exceeding address space");
 141         return -1;
 142     }
 143
 144     if (new_size == migrate_xbzrle_cache_size()) {
 145         /* nothing to do */
 146         return 0;
 147     }
 148
 149     XBZRLE_cache_lock();
 150
 151     if (XBZRLE.cache != NULL) {
 152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 153         if (!new_cache) {
 154             ret = -1;
 155             goto out;
 156         }
 157
 158         cache_fini(XBZRLE.cache);
 159         XBZRLE.cache = new_cache;
 160     }
 161 out:
 162     XBZRLE_cache_unlock();
 163     return ret;
 164 }
 165
 166 bool ramblock_is_ignored(RAMBlock *block)
 167 {
 168     return !qemu_ram_is_migratable(block) ||
 169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 170 }
 171
 172 #undef RAMBLOCK_FOREACH
 173
 174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 175 {
 176     RAMBlock *block;
 177     int ret = 0;
 178
 179     RCU_READ_LOCK_GUARD();
 180
 181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 182         ret = func(block, opaque);
 183         if (ret) {
 184             break;
 185         }
 186     }
 187     return ret;
 188 }
 189
 190 static void ramblock_recv_map_init(void)
 191 {
 192     RAMBlock *rb;
 193
 194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 195         assert(!rb->receivedmap);
 196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 197     }
 198 }
 199
 200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 201 {
 202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 203                     rb->receivedmap);
 204 }
 205
 206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 207 {
 208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 209 }
 210
 211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 212 {
 213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 217                                     size_t nr)
 218 {
 219     bitmap_set_atomic(rb->receivedmap,
 220                       ramblock_recv_bitmap_offset(host_addr, rb),
 221                       nr);
 222 }
 223
 224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 225
 226 /*
 227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 228  *
 229  * Returns >0 if success with sent bytes, or <0 if error.
 230  */
 231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 232                                   const char *block_name)
 233 {
 234     RAMBlock *block = qemu_ram_block_by_name(block_name);
 235     unsigned long *le_bitmap, nbits;
 236     uint64_t size;
 237
 238     if (!block) {
 239         error_report("%s: invalid block name: %s", __func__, block_name);
 240         return -1;
 241     }
 242
 243     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 244
 245     /*
 246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 247      * machines we may need 4 more bytes for padding (see below
 248      * comment). So extend it a bit before hand.
 249      */
 250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 251
 252     /*
 253      * Always use little endian when sending the bitmap. This is
 254      * required that when source and destination VMs are not using the
 255      * same endianness. (Note: big endian won't work.)
 256      */
 257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 258
 259     /* Size of the bitmap, in bytes */
 260     size = DIV_ROUND_UP(nbits, 8);
 261
 262     /*
 263      * size is always aligned to 8 bytes for 64bit machines, but it
 264      * may not be true for 32bit machines. We need this padding to
 265      * make sure the migration can survive even between 32bit and
 266      * 64bit machines.
 267      */
 268     size = ROUND_UP(size, 8);
 269
 270     qemu_put_be64(file, size);
 271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 272     /*
 273      * Mark as an end, in case the middle part is screwed up due to
 274      * some "mysterious" reason.
 275      */
 276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 277     qemu_fflush(file);
 278
 279     g_free(le_bitmap);
 280
 281     if (qemu_file_get_error(file)) {
 282         return qemu_file_get_error(file);
 283     }
 284
 285     return size + sizeof(size);
 286 }
 287
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the byte offset of the request within rb */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the request in bytes */
    hwaddr    len;

    /* Link used by the src_page_requests queue in RAMState */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 299
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data (see save_page_header()) */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state; ram_bytes_remaining() treats NULL as "none" */
static RAMState *ram_state;
 357
/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;

/* Initialise the (empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 364
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 369
/* Unregister notifier @n via notifier_with_return_remove(). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 uint64_t ram_bytes_remaining(void)
 385 {
 386     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 387                        0;
 388 }
 389
/* RAM migration counters (e.g. transferred bytes, dirty_sync_count) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 402
/* Statistics of the multi-thread compression feature */
CompressionStats compression_counters;

/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* Set under mutex to ask the worker to leave its loop */
    bool quit;
    /* Result of do_compress_ram_page() for the last page handled */
    bool zero_page;
    /*
     * Dummy QEMUFile used as a buffer for the worker's output; also
     * serves as the "this slot was fully initialised" indicator during
     * cleanup.
     */
    QEMUFile *file;
    /* mutex/cond: protect block/offset/quit and wake the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* Page handed to the worker; block == NULL means nothing is pending */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 420
/*
 * Per-thread state of one decompression worker.
 *
 * NOTE(review): the field roles are presumably analogous to those of
 * CompressParam (done/quit handshake, mutex/cond pair, zlib stream) --
 * confirm against the decompression code before relying on this.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 432
/*
 * Arrays with one entry per compression thread; allocated in
 * compress_threads_save_setup() and released (and reset to NULL) in
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: called by do_data_compress() below */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 452
/*
 * Body of a compression worker thread.
 *
 * @opaque is the thread's CompressParam.  The thread sleeps on
 * param->cond until a page is posted in param->block/param->offset,
 * compresses it into param->file, reports completion through
 * comp_done_lock/comp_done_cond, and repeats until param->quit is set.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take ownership of the request while holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* Compress without holding the per-thread mutex */
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Completion is published under comp_done_lock, not mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing pending: wait for a new request or a quit signal */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 486
 487 static void compress_threads_save_cleanup(void)
 488 {
 489     int i, thread_count;
 490
 491     if (!migrate_use_compression() || !comp_param) {
 492         return;
 493     }
 494
 495     thread_count = migrate_compress_threads();
 496     for (i = 0; i < thread_count; i++) {
 497         /*
 498          * we use it as a indicator which shows if the thread is
 499          * properly init'd or not
 500          */
 501         if (!comp_param[i].file) {
 502             break;
 503         }
 504
 505         qemu_mutex_lock(&comp_param[i].mutex);
 506         comp_param[i].quit = true;
 507         qemu_cond_signal(&comp_param[i].cond);
 508         qemu_mutex_unlock(&comp_param[i].mutex);
 509
 510         qemu_thread_join(compress_threads + i);
 511         qemu_mutex_destroy(&comp_param[i].mutex);
 512         qemu_cond_destroy(&comp_param[i].cond);
 513         deflateEnd(&comp_param[i].stream);
 514         g_free(comp_param[i].originbuf);
 515         qemu_fclose(comp_param[i].file);
 516         comp_param[i].file = NULL;
 517     }
 518     qemu_mutex_destroy(&comp_done_lock);
 519     qemu_cond_destroy(&comp_done_cond);
 520     g_free(compress_threads);
 521     g_free(comp_param);
 522     compress_threads = NULL;
 523     comp_param = NULL;
 524 }
 525
 526 static int compress_threads_save_setup(void)
 527 {
 528     int i, thread_count;
 529
 530     if (!migrate_use_compression()) {
 531         return 0;
 532     }
 533     thread_count = migrate_compress_threads();
 534     compress_threads = g_new0(QemuThread, thread_count);
 535     comp_param = g_new0(CompressParam, thread_count);
 536     qemu_cond_init(&comp_done_cond);
 537     qemu_mutex_init(&comp_done_lock);
 538     for (i = 0; i < thread_count; i++) {
 539         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 540         if (!comp_param[i].originbuf) {
 541             goto exit;
 542         }
 543
 544         if (deflateInit(&comp_param[i].stream,
 545                         migrate_compress_level()) != Z_OK) {
 546             g_free(comp_param[i].originbuf);
 547             goto exit;
 548         }
 549
 550         /* comp_param[i].file is just used as a dummy buffer to save data,
 551          * set its ops to empty.
 552          */
 553         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 554         comp_param[i].done = true;
 555         comp_param[i].quit = false;
 556         qemu_mutex_init(&comp_param[i].mutex);
 557         qemu_cond_init(&comp_param[i].cond);
 558         qemu_thread_create(compress_threads + i, "compress",
 559                            do_data_compress, comp_param + i,
 560                            QEMU_THREAD_JOINABLE);
 561     }
 562     return 0;
 563
 564 exit:
 565     compress_threads_save_cleanup();
 566     return -1;
 567 }
 568
 569 /**
 570  * save_page_header: write page header to wire
 571  *
 572  * If this is the 1st block, it also writes the block identification
 573  *
 574  * Returns the number of bytes written
 575  *
 576  * @f: QEMUFile where to send the data
 577  * @block: block that contains the page we want to send
 578  * @offset: offset inside the block for the page
 579  *          in the lower bits, it contains flags
 580  */
 581 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 582                                ram_addr_t offset)
 583 {
 584     size_t size, len;
 585
 586     if (block == rs->last_sent_block) {
 587         offset |= RAM_SAVE_FLAG_CONTINUE;
 588     }
 589     qemu_put_be64(f, offset);
 590     size = 8;
 591
 592     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 593         len = strlen(block->idstr);
 594         qemu_put_byte(f, len);
 595         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 596         size += 1 + len;
 597         rs->last_sent_block = block;
 598     }
 599     return size;
 600 }
 601
 602 /**
 603  * mig_throttle_guest_down: throttle down the guest
 604  *
 605  * Reduce amount of guest cpu execution to hopefully slow down memory
 606  * writes. If guest dirty memory rate is reduced below the rate at
 607  * which we can transfer pages to the destination then we should be
 608  * able to complete migration. Some workloads dirty memory way too
 609  * fast and will not effectively converge, even with auto-converge.
 610  */
 611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 612                                     uint64_t bytes_dirty_threshold)
 613 {
 614     MigrationState *s = migrate_get_current();
 615     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 616     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 617     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 618     int pct_max = s->parameters.max_cpu_throttle;
 619
 620     uint64_t throttle_now = cpu_throttle_get_percentage();
 621     uint64_t cpu_now, cpu_ideal, throttle_inc;
 622
 623     /* We have not started throttling yet. Let's start it. */
 624     if (!cpu_throttle_active()) {
 625         cpu_throttle_set(pct_initial);
 626     } else {
 627         /* Throttling already on, just increase the rate */
 628         if (!pct_tailslow) {
 629             throttle_inc = pct_increment;
 630         } else {
 631             /* Compute the ideal CPU percentage used by Guest, which may
 632              * make the dirty rate match the dirty rate threshold. */
 633             cpu_now = 100 - throttle_now;
 634             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 635                         bytes_dirty_period);
 636             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 637         }
 638         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 639     }
 640 }
 641
 642 /**
 643  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 644  *
 645  * @rs: current RAM state
 646  * @current_addr: address for the zero page
 647  *
 648  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 649  * The important thing is that a stale (not-yet-0'd) page be replaced
 650  * by the new data.
 651  * As a bonus, if the page wasn't in the cache it gets added so that
 652  * when a small write is made into the 0'd page it gets XBZRLE sent.
 653  */
 654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 655 {
 656     if (!rs->xbzrle_enabled) {
 657         return;
 658     }
 659
 660     /* We don't care if this fails to allocate a new cache page
 661      * as long as it updated an old one */
 662     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 663                  ram_counters.dirty_sync_count);
 664 }
 665
/* Encoding byte written right after the header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *             (also returned on a cache miss; nothing was written then)
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents; may be
 *                redirected to the cached copy of the page
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to diff against, the page cannot be encoded */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        /* The cache is not touched during the last stage */
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Page unchanged relative to the cached copy: nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoding overflowed: account a full page; nothing is sent here */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* payload + 1 byte encoding flag + 2 bytes encoded length */
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
 768
 769 /**
 770  * migration_bitmap_find_dirty: find the next dirty page from start
 771  *
 772  * Returns the page offset within memory region of the start of a dirty page
 773  *
 774  * @rs: current RAM state
 775  * @rb: RAMBlock where to search for dirty pages
 776  * @start: page where we start the search
 777  */
 778 static inline
 779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 780                                           unsigned long start)
 781 {
 782     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 783     unsigned long *bitmap = rb->bmap;
 784
 785     if (ramblock_is_ignored(rb)) {
 786         return size;
 787     }
 788
 789     return find_next_bit(bitmap, size, start);
 790 }
 791
 792 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 793                                                        unsigned long page)
 794 {
 795     uint8_t shift;
 796     hwaddr size, start;
 797
 798     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 799         return;
 800     }
 801
 802     shift = rb->clear_bmap_shift;
 803     /*
 804      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 805      * can make things easier sometimes since then start address
 806      * of the small chunk will always be 64 pages aligned so the
 807      * bitmap will always be aligned to unsigned long. We should
 808      * even be able to remove this restriction but I'm simply
 809      * keeping it.
 810      */
 811     assert(shift >= 6);
 812
 813     size = 1ULL << (TARGET_PAGE_BITS + shift);
 814     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 815     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 816     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 817 }
 818
 819 static void
 820 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 821                                                  unsigned long start,
 822                                                  unsigned long npages)
 823 {
 824     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 825     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 826     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 827
 828     /*
 829      * Clear pages from start to start + npages - 1, so the end boundary is
 830      * exclusive.
 831      */
 832     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 833         migration_clear_memory_region_dirty_bitmap(rb, i);
 834     }
 835 }
 836
 837 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 838                                                 RAMBlock *rb,
 839                                                 unsigned long page)
 840 {
 841     bool ret;
 842
 843     /*
 844      * Clear dirty bitmap if needed.  This _must_ be called before we
 845      * send any of the page in the chunk because we need to make sure
 846      * we can capture further page content changes when we sync dirty
 847      * log the next time.  So as long as we are going to send any of
 848      * the page in the chunk we clear the remote dirty bitmap for all.
 849      * Clearing it earlier won't be a problem, but too late will.
 850      */
 851     migration_clear_memory_region_dirty_bitmap(rb, page);
 852
 853     ret = test_and_clear_bit(page, rb->bmap);
 854     if (ret) {
 855         rs->migration_dirty_pages--;
 856     }
 857
 858     return ret;
 859 }
 860
 861 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 862                                        void *opaque)
 863 {
 864     const hwaddr offset = section->offset_within_region;
 865     const hwaddr size = int128_get64(section->size);
 866     const unsigned long start = offset >> TARGET_PAGE_BITS;
 867     const unsigned long npages = size >> TARGET_PAGE_BITS;
 868     RAMBlock *rb = section->mr->ram_block;
 869     uint64_t *cleared_bits = opaque;
 870
 871     /*
 872      * We don't grab ram_state->bitmap_mutex because we expect to run
 873      * only when starting migration or during postcopy recovery where
 874      * we don't have concurrent access.
 875      */
 876     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 877         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 878     }
 879     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 880     bitmap_clear(rb->bmap, start, npages);
 881 }
 882
 883 /*
 884  * Exclude all dirty pages from migration that fall into a discarded range as
 885  * managed by a RamDiscardManager responsible for the mapped memory region of
 886  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 887  *
 888  * Discarded pages ("logically unplugged") have undefined content and must
 889  * not get migrated, because even reading these pages for migration might
 890  * result in undesired behavior.
 891  *
 892  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 893  *
 894  * Note: The result is only stable while migrating (precopy/postcopy).
 895  */
 896 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 897 {
 898     uint64_t cleared_bits = 0;
 899
 900     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 901         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 902         MemoryRegionSection section = {
 903             .mr = rb->mr,
 904             .offset_within_region = 0,
 905             .size = int128_make64(qemu_ram_get_used_length(rb)),
 906         };
 907
 908         ram_discard_manager_replay_discarded(rdm, &section,
 909                                              dirty_bitmap_clear_section,
 910                                              &cleared_bits);
 911     }
 912     return cleared_bits;
 913 }
 914
 915 /*
 916  * Check if a host-page aligned page falls into a discarded range as managed by
 917  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 918  *
 919  * Note: The result is only stable while migrating (precopy/postcopy).
 920  */
 921 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 922 {
 923     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 924         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 925         MemoryRegionSection section = {
 926             .mr = rb->mr,
 927             .offset_within_region = start,
 928             .size = int128_make64(qemu_ram_pagesize(rb)),
 929         };
 930
 931         return !ram_discard_manager_is_populated(rdm, &section);
 932     }
 933     return false;
 934 }
 935
 936 /* Called with RCU critical section */
 937 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 938 {
 939     uint64_t new_dirty_pages =
 940         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 941
 942     rs->migration_dirty_pages += new_dirty_pages;
 943     rs->num_dirty_pages_period += new_dirty_pages;
 944 }
 945
 946 /**
 947  * ram_pagesize_summary: calculate all the pagesizes of a VM
 948  *
 949  * Returns a summary bitmap of the page sizes of all RAMBlocks
 950  *
 951  * For VMs with just normal pages this is equivalent to the host page
 952  * size. If it's got some huge pages then it's the OR of all the
 953  * different page sizes.
 954  */
 955 uint64_t ram_pagesize_summary(void)
 956 {
 957     RAMBlock *block;
 958     uint64_t summary = 0;
 959
 960     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 961         summary |= block->page_size;
 962     }
 963
 964     return summary;
 965 }
 966
 967 uint64_t ram_get_total_transferred_pages(void)
 968 {
 969     return  ram_counters.normal + ram_counters.duplicate +
 970                 compression_counters.pages + xbzrle_counters.pages;
 971 }
 972
 973 static void migration_update_rates(RAMState *rs, int64_t end_time)
 974 {
 975     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 976     double compressed_size;
 977
 978     /* calculate period counters */
 979     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 980                 / (end_time - rs->time_last_bitmap_sync);
 981
 982     if (!page_count) {
 983         return;
 984     }
 985
 986     if (migrate_use_xbzrle()) {
 987         double encoded_size, unencoded_size;
 988
 989         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 990             rs->xbzrle_cache_miss_prev) / page_count;
 991         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 992         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 993                          TARGET_PAGE_SIZE;
 994         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 995         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 996             xbzrle_counters.encoding_rate = 0;
 997         } else {
 998             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 999         }
1000         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1001         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1002     }
1003
1004     if (migrate_use_compression()) {
1005         compression_counters.busy_rate = (double)(compression_counters.busy -
1006             rs->compress_thread_busy_prev) / page_count;
1007         rs->compress_thread_busy_prev = compression_counters.busy;
1008
1009         compressed_size = compression_counters.compressed_size -
1010                           rs->compressed_size_prev;
1011         if (compressed_size) {
1012             double uncompressed_size = (compression_counters.pages -
1013                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1014
1015             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1016             compression_counters.compression_rate =
1017                                         uncompressed_size / compressed_size;
1018
1019             rs->compress_pages_prev = compression_counters.pages;
1020             rs->compressed_size_prev = compression_counters.compressed_size;
1021         }
1022     }
1023 }
1024
1025 static void migration_trigger_throttle(RAMState *rs)
1026 {
1027     MigrationState *s = migrate_get_current();
1028     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1029
1030     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1031     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1032     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1033
1034     /* During block migration the auto-converge logic incorrectly detects
1035      * that ram migration makes no progress. Avoid this by disabling the
1036      * throttling logic during the bulk phase of block migration. */
1037     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1038         /* The following detection logic can be refined later. For now:
1039            Check to see if the ratio between dirtied bytes and the approx.
1040            amount of bytes that just got transferred since the last time
1041            we were in this routine reaches the threshold. If that happens
1042            twice, start or increase throttling. */
1043
1044         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1045             (++rs->dirty_rate_high_cnt >= 2)) {
1046             trace_migration_throttle();
1047             rs->dirty_rate_high_cnt = 0;
1048             mig_throttle_guest_down(bytes_dirty_period,
1049                                     bytes_dirty_threshold);
1050         }
1051     }
1052 }
1053
1054 static void migration_bitmap_sync(RAMState *rs)
1055 {
1056     RAMBlock *block;
1057     int64_t end_time;
1058
1059     ram_counters.dirty_sync_count++;
1060
1061     if (!rs->time_last_bitmap_sync) {
1062         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1063     }
1064
1065     trace_migration_bitmap_sync_start();
1066     memory_global_dirty_log_sync();
1067
1068     qemu_mutex_lock(&rs->bitmap_mutex);
1069     WITH_RCU_READ_LOCK_GUARD() {
1070         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1071             ramblock_sync_dirty_bitmap(rs, block);
1072         }
1073         ram_counters.remaining = ram_bytes_remaining();
1074     }
1075     qemu_mutex_unlock(&rs->bitmap_mutex);
1076
1077     memory_global_after_dirty_log_sync();
1078     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1079
1080     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1081
1082     /* more than 1 second = 1000 millisecons */
1083     if (end_time > rs->time_last_bitmap_sync + 1000) {
1084         migration_trigger_throttle(rs);
1085
1086         migration_update_rates(rs, end_time);
1087
1088         rs->target_page_count_prev = rs->target_page_count;
1089
1090         /* reset period counters */
1091         rs->time_last_bitmap_sync = end_time;
1092         rs->num_dirty_pages_period = 0;
1093         rs->bytes_xfer_prev = ram_counters.transferred;
1094     }
1095     if (migrate_use_events()) {
1096         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1097     }
1098 }
1099
1100 static void migration_bitmap_sync_precopy(RAMState *rs)
1101 {
1102     Error *local_err = NULL;
1103
1104     /*
1105      * The current notifier usage is just an optimization to migration, so we
1106      * don't stop the normal migration process in the error case.
1107      */
1108     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1109         error_report_err(local_err);
1110         local_err = NULL;
1111     }
1112
1113     migration_bitmap_sync(rs);
1114
1115     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1116         error_report_err(local_err);
1117     }
1118 }
1119
1120 /**
1121  * save_zero_page_to_file: send the zero page to the file
1122  *
1123  * Returns the size of data written to the file, 0 means the page is not
1124  * a zero page
1125  *
1126  * @rs: current RAM state
1127  * @file: the file where the data is saved
1128  * @block: block that contains the page we want to send
1129  * @offset: offset inside the block for the page
1130  */
1131 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1132                                   RAMBlock *block, ram_addr_t offset)
1133 {
1134     uint8_t *p = block->host + offset;
1135     int len = 0;
1136
1137     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1138         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1139         qemu_put_byte(file, 0);
1140         len += 1;
1141     }
1142     return len;
1143 }
1144
1145 /**
1146  * save_zero_page: send the zero page to the stream
1147  *
1148  * Returns the number of pages written.
1149  *
1150  * @rs: current RAM state
1151  * @block: block that contains the page we want to send
1152  * @offset: offset inside the block for the page
1153  */
1154 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1155 {
1156     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1157
1158     if (len) {
1159         ram_counters.duplicate++;
1160         ram_counters.transferred += len;
1161         return 1;
1162     }
1163     return -1;
1164 }
1165
1166 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1167 {
1168     if (!migrate_release_ram() || !migration_in_postcopy()) {
1169         return;
1170     }
1171
1172     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1173 }
1174
1175 /*
1176  * @pages: the number of pages written by the control path,
1177  *        < 0 - error
1178  *        > 0 - number of pages written
1179  *
1180  * Return true if the pages has been saved, otherwise false is returned.
1181  */
1182 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1183                               int *pages)
1184 {
1185     uint64_t bytes_xmit = 0;
1186     int ret;
1187
1188     *pages = -1;
1189     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1190                                 &bytes_xmit);
1191     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1192         return false;
1193     }
1194
1195     if (bytes_xmit) {
1196         ram_counters.transferred += bytes_xmit;
1197         *pages = 1;
1198     }
1199
1200     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1201         return true;
1202     }
1203
1204     if (bytes_xmit > 0) {
1205         ram_counters.normal++;
1206     } else if (bytes_xmit == 0) {
1207         ram_counters.duplicate++;
1208     }
1209
1210     return true;
1211 }
1212
1213 /*
1214  * directly send the page to the stream
1215  *
1216  * Returns the number of pages written.
1217  *
1218  * @rs: current RAM state
1219  * @block: block that contains the page we want to send
1220  * @offset: offset inside the block for the page
1221  * @buf: the page to be sent
1222  * @async: send to page asyncly
1223  */
1224 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1225                             uint8_t *buf, bool async)
1226 {
1227     ram_counters.transferred += save_page_header(rs, rs->f, block,
1228                                                  offset | RAM_SAVE_FLAG_PAGE);
1229     if (async) {
1230         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1231                               migrate_release_ram() &
1232                               migration_in_postcopy());
1233     } else {
1234         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1235     }
1236     ram_counters.transferred += TARGET_PAGE_SIZE;
1237     ram_counters.normal++;
1238     return 1;
1239 }
1240
1241 /**
1242  * ram_save_page: send the given page to the stream
1243  *
1244  * Returns the number of pages written.
1245  *          < 0 - error
1246  *          >=0 - Number of pages written - this might legally be 0
1247  *                if xbzrle noticed the page was the same.
1248  *
1249  * @rs: current RAM state
1250  * @block: block that contains the page we want to send
1251  * @offset: offset inside the block for the page
1252  * @last_stage: if we are at the completion stage
1253  */
1254 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1255 {
1256     int pages = -1;
1257     uint8_t *p;
1258     bool send_async = true;
1259     RAMBlock *block = pss->block;
1260     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1261     ram_addr_t current_addr = block->offset + offset;
1262
1263     p = block->host + offset;
1264     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1265
1266     XBZRLE_cache_lock();
1267     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1268         pages = save_xbzrle_page(rs, &p, current_addr, block,
1269                                  offset, last_stage);
1270         if (!last_stage) {
1271             /* Can't send this cached data async, since the cache page
1272              * might get updated before it gets to the wire
1273              */
1274             send_async = false;
1275         }
1276     }
1277
1278     /* XBZRLE overflow or normal page */
1279     if (pages == -1) {
1280         pages = save_normal_page(rs, block, offset, p, send_async);
1281     }
1282
1283     XBZRLE_cache_unlock();
1284
1285     return pages;
1286 }
1287
1288 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1289                                  ram_addr_t offset)
1290 {
1291     if (multifd_queue_page(rs->f, block, offset) < 0) {
1292         return -1;
1293     }
1294     ram_counters.normal++;
1295
1296     return 1;
1297 }
1298
1299 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1300                                  ram_addr_t offset, uint8_t *source_buf)
1301 {
1302     RAMState *rs = ram_state;
1303     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1304     bool zero_page = false;
1305     int ret;
1306
1307     if (save_zero_page_to_file(rs, f, block, offset)) {
1308         zero_page = true;
1309         goto exit;
1310     }
1311
1312     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1313
1314     /*
1315      * copy it to a internal buffer to avoid it being modified by VM
1316      * so that we can catch up the error during compression and
1317      * decompression
1318      */
1319     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1320     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1321     if (ret < 0) {
1322         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1323         error_report("compressed data failed!");
1324         return false;
1325     }
1326
1327 exit:
1328     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1329     return zero_page;
1330 }
1331
1332 static void
1333 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1334 {
1335     ram_counters.transferred += bytes_xmit;
1336
1337     if (param->zero_page) {
1338         ram_counters.duplicate++;
1339         return;
1340     }
1341
1342     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1343     compression_counters.compressed_size += bytes_xmit - 8;
1344     compression_counters.pages++;
1345 }
1346
1347 static bool save_page_use_compression(RAMState *rs);
1348
1349 static void flush_compressed_data(RAMState *rs)
1350 {
1351     int idx, len, thread_count;
1352
1353     if (!save_page_use_compression(rs)) {
1354         return;
1355     }
1356     thread_count = migrate_compress_threads();
1357
1358     qemu_mutex_lock(&comp_done_lock);
1359     for (idx = 0; idx < thread_count; idx++) {
1360         while (!comp_param[idx].done) {
1361             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1362         }
1363     }
1364     qemu_mutex_unlock(&comp_done_lock);
1365
1366     for (idx = 0; idx < thread_count; idx++) {
1367         qemu_mutex_lock(&comp_param[idx].mutex);
1368         if (!comp_param[idx].quit) {
1369             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1370             /*
1371              * it's safe to fetch zero_page without holding comp_done_lock
1372              * as there is no further request submitted to the thread,
1373              * i.e, the thread should be waiting for a request at this point.
1374              */
1375             update_compress_thread_counts(&comp_param[idx], len);
1376         }
1377         qemu_mutex_unlock(&comp_param[idx].mutex);
1378     }
1379 }
1380
1381 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1382                                        ram_addr_t offset)
1383 {
1384     param->block = block;
1385     param->offset = offset;
1386 }
1387
1388 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1389                                            ram_addr_t offset)
1390 {
1391     int idx, thread_count, bytes_xmit = -1, pages = -1;
1392     bool wait = migrate_compress_wait_thread();
1393
1394     thread_count = migrate_compress_threads();
1395     qemu_mutex_lock(&comp_done_lock);
1396 retry:
1397     for (idx = 0; idx < thread_count; idx++) {
1398         if (comp_param[idx].done) {
1399             comp_param[idx].done = false;
1400             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1401             qemu_mutex_lock(&comp_param[idx].mutex);
1402             set_compress_params(&comp_param[idx], block, offset);
1403             qemu_cond_signal(&comp_param[idx].cond);
1404             qemu_mutex_unlock(&comp_param[idx].mutex);
1405             pages = 1;
1406             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1407             break;
1408         }
1409     }
1410
1411     /*
1412      * wait for the free thread if the user specifies 'compress-wait-thread',
1413      * otherwise we will post the page out in the main thread as normal page.
1414      */
1415     if (pages < 0 && wait) {
1416         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1417         goto retry;
1418     }
1419     qemu_mutex_unlock(&comp_done_lock);
1420
1421     return pages;
1422 }
1423
1424 /**
1425  * find_dirty_block: find the next dirty page and update any state
1426  * associated with the search process.
1427  *
1428  * Returns true if a page is found
1429  *
1430  * @rs: current RAM state
1431  * @pss: data about the state of the current dirty page scan
1432  * @again: set to false if the search has scanned the whole of RAM
1433  */
1434 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1435 {
1436     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1437     if (pss->complete_round && pss->block == rs->last_seen_block &&
1438         pss->page >= rs->last_page) {
1439         /*
1440          * We've been once around the RAM and haven't found anything.
1441          * Give up.
1442          */
1443         *again = false;
1444         return false;
1445     }
1446     if (!offset_in_ramblock(pss->block,
1447                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1448         /* Didn't find anything in this RAM Block */
1449         pss->page = 0;
1450         pss->block = QLIST_NEXT_RCU(pss->block, next);
1451         if (!pss->block) {
1452             /*
1453              * If memory migration starts over, we will meet a dirtied page
1454              * which may still exists in compression threads's ring, so we
1455              * should flush the compressed data to make sure the new page
1456              * is not overwritten by the old one in the destination.
1457              *
1458              * Also If xbzrle is on, stop using the data compression at this
1459              * point. In theory, xbzrle can do better than compression.
1460              */
1461             flush_compressed_data(rs);
1462
1463             /* Hit the end of the list */
1464             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1465             /* Flag that we've looped */
1466             pss->complete_round = true;
1467             /* After the first round, enable XBZRLE. */
1468             if (migrate_use_xbzrle()) {
1469                 rs->xbzrle_enabled = true;
1470             }
1471         }
1472         /* Didn't find anything this time, but try again on the new block */
1473         *again = true;
1474         return false;
1475     } else {
1476         /* Can go around again, but... */
1477         *again = true;
1478         /* We've found something so probably don't need to */
1479         return true;
1480     }
1481 }
1482
1483 /**
1484  * unqueue_page: gets a page of the queue
1485  *
1486  * Helper for 'get_queued_page' - gets a page off the queue
1487  *
1488  * Returns the block of the page (or NULL if none available)
1489  *
1490  * @rs: current RAM state
1491  * @offset: used to return the offset within the RAMBlock
1492  */
1493 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1494 {
1495     RAMBlock *block = NULL;
1496
1497     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1498         return NULL;
1499     }
1500
1501     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1502     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1503         struct RAMSrcPageRequest *entry =
1504                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1505         block = entry->rb;
1506         *offset = entry->offset;
1507
1508         if (entry->len > TARGET_PAGE_SIZE) {
1509             entry->len -= TARGET_PAGE_SIZE;
1510             entry->offset += TARGET_PAGE_SIZE;
1511         } else {
1512             memory_region_unref(block->mr);
1513             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1514             g_free(entry);
1515             migration_consume_urgent_request();
1516         }
1517     }
1518
1519     return block;
1520 }
1521
1522 #if defined(__linux__)
1523 /**
1524  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1525  *   is found, return RAM block pointer and page offset
1526  *
1527  * Returns pointer to the RAMBlock containing faulting page,
1528  *   NULL if no write faults are pending
1529  *
1530  * @rs: current RAM state
1531  * @offset: page offset from the beginning of the block
1532  */
1533 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1534 {
1535     struct uffd_msg uffd_msg;
1536     void *page_address;
1537     RAMBlock *block;
1538     int res;
1539
1540     if (!migrate_background_snapshot()) {
1541         return NULL;
1542     }
1543
1544     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1545     if (res <= 0) {
1546         return NULL;
1547     }
1548
1549     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1550     block = qemu_ram_block_from_host(page_address, false, offset);
1551     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1552     return block;
1553 }
1554
1555 /**
1556  * ram_save_release_protection: release UFFD write protection after
1557  *   a range of pages has been saved
1558  *
1559  * @rs: current RAM state
1560  * @pss: page-search-status structure
1561  * @start_page: index of the first page in the range relative to pss->block
1562  *
1563  * Returns 0 on success, negative value in case of an error
1564 */
1565 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1566         unsigned long start_page)
1567 {
1568     int res = 0;
1569
1570     /* Check if page is from UFFD-managed region. */
1571     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1572         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1573         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1574
1575         /* Flush async buffers before un-protect. */
1576         qemu_fflush(rs->f);
1577         /* Un-protect memory range. */
1578         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1579                 false, false);
1580     }
1581
1582     return res;
1583 }
1584
1585 /* ram_write_tracking_available: check if kernel supports required UFFD features
1586  *
1587  * Returns true if supports, false otherwise
1588  */
1589 bool ram_write_tracking_available(void)
1590 {
1591     uint64_t uffd_features;
1592     int res;
1593
1594     res = uffd_query_features(&uffd_features);
1595     return (res == 0 &&
1596             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1597 }
1598
1599 /* ram_write_tracking_compatible: check if guest configuration is
1600  *   compatible with 'write-tracking'
1601  *
1602  * Returns true if compatible, false otherwise
1603  */
1604 bool ram_write_tracking_compatible(void)
1605 {
1606     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1607     int uffd_fd;
1608     RAMBlock *block;
1609     bool ret = false;
1610
1611     /* Open UFFD file descriptor */
1612     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1613     if (uffd_fd < 0) {
1614         return false;
1615     }
1616
1617     RCU_READ_LOCK_GUARD();
1618
1619     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1620         uint64_t uffd_ioctls;
1621
1622         /* Nothing to do with read-only and MMIO-writable regions */
1623         if (block->mr->readonly || block->mr->rom_device) {
1624             continue;
1625         }
1626         /* Try to register block memory via UFFD-IO to track writes */
1627         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1628                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1629             goto out;
1630         }
1631         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1632             goto out;
1633         }
1634     }
1635     ret = true;
1636
1637 out:
1638     uffd_close_fd(uffd_fd);
1639     return ret;
1640 }
1641
1642 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1643                                        ram_addr_t size)
1644 {
1645     /*
1646      * We read one byte of each page; this will preallocate page tables if
1647      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1648      * where no page was populated yet. This might require adaption when
1649      * supporting other mappings, like shmem.
1650      */
1651     for (; offset < size; offset += block->page_size) {
1652         char tmp = *((char *)block->host + offset);
1653
1654         /* Don't optimize the read out */
1655         asm volatile("" : "+r" (tmp));
1656     }
1657 }
1658
1659 static inline int populate_read_section(MemoryRegionSection *section,
1660                                         void *opaque)
1661 {
1662     const hwaddr size = int128_get64(section->size);
1663     hwaddr offset = section->offset_within_region;
1664     RAMBlock *block = section->mr->ram_block;
1665
1666     populate_read_range(block, offset, size);
1667     return 0;
1668 }
1669
1670 /*
1671  * ram_block_populate_read: preallocate page tables and populate pages in the
1672  *   RAM block by reading a byte of each page.
1673  *
1674  * Since it's solely used for userfault_fd WP feature, here we just
1675  *   hardcode page size to qemu_real_host_page_size.
1676  *
1677  * @block: RAM block to populate
1678  */
1679 static void ram_block_populate_read(RAMBlock *rb)
1680 {
1681     /*
1682      * Skip populating all pages that fall into a discarded range as managed by
1683      * a RamDiscardManager responsible for the mapped memory region of the
1684      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1685      * must not get populated automatically. We don't have to track
1686      * modifications via userfaultfd WP reliably, because these pages will
1687      * not be part of the migration stream either way -- see
1688      * ramblock_dirty_bitmap_exclude_discarded_pages().
1689      *
1690      * Note: The result is only stable while migrating (precopy/postcopy).
1691      */
1692     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1693         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1694         MemoryRegionSection section = {
1695             .mr = rb->mr,
1696             .offset_within_region = 0,
1697             .size = rb->mr->size,
1698         };
1699
1700         ram_discard_manager_replay_populated(rdm, &section,
1701                                              populate_read_section, NULL);
1702     } else {
1703         populate_read_range(rb, 0, rb->used_length);
1704     }
1705 }
1706
1707 /*
1708  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1709  */
1710 void ram_write_tracking_prepare(void)
1711 {
1712     RAMBlock *block;
1713
1714     RCU_READ_LOCK_GUARD();
1715
1716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1717         /* Nothing to do with read-only and MMIO-writable regions */
1718         if (block->mr->readonly || block->mr->rom_device) {
1719             continue;
1720         }
1721
1722         /*
1723          * Populate pages of the RAM block before enabling userfault_fd
1724          * write protection.
1725          *
1726          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1727          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1728          * pages with pte_none() entries in page table.
1729          */
1730         ram_block_populate_read(block);
1731     }
1732 }
1733
1734 /*
1735  * ram_write_tracking_start: start UFFD-WP memory tracking
1736  *
1737  * Returns 0 for success or negative value in case of error
1738  */
1739 int ram_write_tracking_start(void)
1740 {
1741     int uffd_fd;
1742     RAMState *rs = ram_state;
1743     RAMBlock *block;
1744
1745     /* Open UFFD file descriptor */
1746     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1747     if (uffd_fd < 0) {
1748         return uffd_fd;
1749     }
1750     rs->uffdio_fd = uffd_fd;
1751
1752     RCU_READ_LOCK_GUARD();
1753
1754     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1755         /* Nothing to do with read-only and MMIO-writable regions */
1756         if (block->mr->readonly || block->mr->rom_device) {
1757             continue;
1758         }
1759
1760         /* Register block memory with UFFD to track writes */
1761         if (uffd_register_memory(rs->uffdio_fd, block->host,
1762                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1763             goto fail;
1764         }
1765         /* Apply UFFD write protection to the block memory range */
1766         if (uffd_change_protection(rs->uffdio_fd, block->host,
1767                 block->max_length, true, false)) {
1768             goto fail;
1769         }
1770         block->flags |= RAM_UF_WRITEPROTECT;
1771         memory_region_ref(block->mr);
1772
1773         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1774                 block->host, block->max_length);
1775     }
1776
1777     return 0;
1778
1779 fail:
1780     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1781
1782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1783         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1784             continue;
1785         }
1786         /*
1787          * In case some memory block failed to be write-protected
1788          * remove protection and unregister all succeeded RAM blocks
1789          */
1790         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1791                 false, false);
1792         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1793         /* Cleanup flags and remove reference */
1794         block->flags &= ~RAM_UF_WRITEPROTECT;
1795         memory_region_unref(block->mr);
1796     }
1797
1798     uffd_close_fd(uffd_fd);
1799     rs->uffdio_fd = -1;
1800     return -1;
1801 }
1802
1803 /**
1804  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1805  */
1806 void ram_write_tracking_stop(void)
1807 {
1808     RAMState *rs = ram_state;
1809     RAMBlock *block;
1810
1811     RCU_READ_LOCK_GUARD();
1812
1813     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1814         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1815             continue;
1816         }
1817         /* Remove protection and unregister all affected RAM blocks */
1818         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1819                 false, false);
1820         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1821
1822         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1823                 block->host, block->max_length);
1824
1825         /* Cleanup flags and remove reference */
1826         block->flags &= ~RAM_UF_WRITEPROTECT;
1827         memory_region_unref(block->mr);
1828     }
1829
1830     /* Finally close UFFD file descriptor */
1831     uffd_close_fd(rs->uffdio_fd);
1832     rs->uffdio_fd = -1;
1833 }
1834
1835 #else
1836 /* No target OS support, stubs just fail or ignore */
1837
1838 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1839 {
1840     (void) rs;
1841     (void) offset;
1842
1843     return NULL;
1844 }
1845
1846 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1847         unsigned long start_page)
1848 {
1849     (void) rs;
1850     (void) pss;
1851     (void) start_page;
1852
1853     return 0;
1854 }
1855
/* Stub for targets without write-tracking support: always unavailable. */
bool ram_write_tracking_available(void)
{
    const bool available = false;

    return available;
}
1860
/*
 * Stub for targets without write-tracking support.  Reaching it trips
 * the assertion; the return statement only satisfies the signature when
 * assertions are compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);

    return false;
}
1866
/*
 * Stub for targets without write-tracking support.  Reaching it trips
 * the assertion; -1 is returned only when assertions are compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);

    return -1;
}
1872
/*
 * Stub for targets without write-tracking support.  Reaching it trips
 * the assertion.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1877 #endif /* defined(__linux__) */
1878
1879 /**
1880  * get_queued_page: unqueue a page from the postcopy requests
1881  *
1882  * Skips pages that are already sent (!dirty)
1883  *
1884  * Returns true if a queued page is found
1885  *
1886  * @rs: current RAM state
1887  * @pss: data about the state of the current dirty page scan
1888  */
1889 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1890 {
1891     RAMBlock  *block;
1892     ram_addr_t offset;
1893     bool dirty;
1894
1895     do {
1896         block = unqueue_page(rs, &offset);
1897         /*
1898          * We're sending this page, and since it's postcopy nothing else
1899          * will dirty it, and we must make sure it doesn't get sent again
1900          * even if this queue request was received after the background
1901          * search already sent it.
1902          */
1903         if (block) {
1904             unsigned long page;
1905
1906             page = offset >> TARGET_PAGE_BITS;
1907             dirty = test_bit(page, block->bmap);
1908             if (!dirty) {
1909                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1910                                                 page);
1911             } else {
1912                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1913             }
1914         }
1915
1916     } while (block && !dirty);
1917
1918     if (!block) {
1919         /*
1920          * Poll write faults too if background snapshot is enabled; that's
1921          * when we have vcpus got blocked by the write protected pages.
1922          */
1923         block = poll_fault_page(rs, &offset);
1924     }
1925
1926     if (block) {
1927         /*
1928          * We want the background search to continue from the queued page
1929          * since the guest is likely to want other pages near to the page
1930          * it just requested.
1931          */
1932         pss->block = block;
1933         pss->page = offset >> TARGET_PAGE_BITS;
1934
1935         /*
1936          * This unqueued page would break the "one round" check, even is
1937          * really rare.
1938          */
1939         pss->complete_round = false;
1940     }
1941
1942     return !!block;
1943 }
1944
1945 /**
1946  * migration_page_queue_free: drop any remaining pages in the ram
1947  * request queue
1948  *
1949  * It should be empty at the end anyway, but in error cases there may
1950  * be some left.  in case that there is any page left, we drop it.
1951  *
1952  */
1953 static void migration_page_queue_free(RAMState *rs)
1954 {
1955     struct RAMSrcPageRequest *mspr, *next_mspr;
1956     /* This queue generally should be empty - but in the case of a failed
1957      * migration might have some droppings in.
1958      */
1959     RCU_READ_LOCK_GUARD();
1960     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1961         memory_region_unref(mspr->rb->mr);
1962         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1963         g_free(mspr);
1964     }
1965 }
1966
1967 /**
1968  * ram_save_queue_pages: queue the page for transmission
1969  *
1970  * A request from postcopy destination for example.
1971  *
1972  * Returns zero on success or negative on error
1973  *
1974  * @rbname: Name of the RAMBLock of the request. NULL means the
1975  *          same that last one.
1976  * @start: starting address from the start of the RAMBlock
1977  * @len: length (in bytes) to send
1978  */
1979 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1980 {
1981     RAMBlock *ramblock;
1982     RAMState *rs = ram_state;
1983
1984     ram_counters.postcopy_requests++;
1985     RCU_READ_LOCK_GUARD();
1986
1987     if (!rbname) {
1988         /* Reuse last RAMBlock */
1989         ramblock = rs->last_req_rb;
1990
1991         if (!ramblock) {
1992             /*
1993              * Shouldn't happen, we can't reuse the last RAMBlock if
1994              * it's the 1st request.
1995              */
1996             error_report("ram_save_queue_pages no previous block");
1997             return -1;
1998         }
1999     } else {
2000         ramblock = qemu_ram_block_by_name(rbname);
2001
2002         if (!ramblock) {
2003             /* We shouldn't be asked for a non-existent RAMBlock */
2004             error_report("ram_save_queue_pages no block '%s'", rbname);
2005             return -1;
2006         }
2007         rs->last_req_rb = ramblock;
2008     }
2009     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2010     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2011         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2012                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2013                      __func__, start, len, ramblock->used_length);
2014         return -1;
2015     }
2016
2017     struct RAMSrcPageRequest *new_entry =
2018         g_malloc0(sizeof(struct RAMSrcPageRequest));
2019     new_entry->rb = ramblock;
2020     new_entry->offset = start;
2021     new_entry->len = len;
2022
2023     memory_region_ref(ramblock->mr);
2024     qemu_mutex_lock(&rs->src_page_req_mutex);
2025     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2026     migration_make_urgent_request();
2027     qemu_mutex_unlock(&rs->src_page_req_mutex);
2028
2029     return 0;
2030 }
2031
2032 static bool save_page_use_compression(RAMState *rs)
2033 {
2034     if (!migrate_use_compression()) {
2035         return false;
2036     }
2037
2038     /*
2039      * If xbzrle is enabled (e.g., after first round of migration), stop
2040      * using the data compression. In theory, xbzrle can do better than
2041      * compression.
2042      */
2043     if (rs->xbzrle_enabled) {
2044         return false;
2045     }
2046
2047     return true;
2048 }
2049
2050 /*
2051  * try to compress the page before posting it out, return true if the page
2052  * has been properly handled by compression, otherwise needs other
2053  * paths to handle it
2054  */
2055 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2056 {
2057     if (!save_page_use_compression(rs)) {
2058         return false;
2059     }
2060
2061     /*
2062      * When starting the process of a new block, the first page of
2063      * the block should be sent out before other pages in the same
2064      * block, and all the pages in last block should have been sent
2065      * out, keeping this order is important, because the 'cont' flag
2066      * is used to avoid resending the block name.
2067      *
2068      * We post the fist page as normal page as compression will take
2069      * much CPU resource.
2070      */
2071     if (block != rs->last_sent_block) {
2072         flush_compressed_data(rs);
2073         return false;
2074     }
2075
2076     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2077         return true;
2078     }
2079
2080     compression_counters.busy++;
2081     return false;
2082 }
2083
2084 /**
2085  * ram_save_target_page: save one target page
2086  *
2087  * Returns the number of pages written
2088  *
2089  * @rs: current RAM state
2090  * @pss: data about the page we want to send
2091  * @last_stage: if we are at the completion stage
2092  */
2093 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2094                                 bool last_stage)
2095 {
2096     RAMBlock *block = pss->block;
2097     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2098     int res;
2099
2100     if (control_save_page(rs, block, offset, &res)) {
2101         return res;
2102     }
2103
2104     if (save_compress_page(rs, block, offset)) {
2105         return 1;
2106     }
2107
2108     res = save_zero_page(rs, block, offset);
2109     if (res > 0) {
2110         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2111          * page would be stale
2112          */
2113         if (!save_page_use_compression(rs)) {
2114             XBZRLE_cache_lock();
2115             xbzrle_cache_zero_page(rs, block->offset + offset);
2116             XBZRLE_cache_unlock();
2117         }
2118         ram_release_pages(block->idstr, offset, res);
2119         return res;
2120     }
2121
2122     /*
2123      * Do not use multifd for:
2124      * 1. Compression as the first page in the new block should be posted out
2125      *    before sending the compressed page
2126      * 2. In postcopy as one whole host page should be placed
2127      */
2128     if (!save_page_use_compression(rs) && migrate_use_multifd()
2129         && !migration_in_postcopy()) {
2130         return ram_save_multifd_page(rs, block, offset);
2131     }
2132
2133     return ram_save_page(rs, pss, last_stage);
2134 }
2135
2136 /**
2137  * ram_save_host_page: save a whole host page
2138  *
2139  * Starting at *offset send pages up to the end of the current host
2140  * page. It's valid for the initial offset to point into the middle of
2141  * a host page in which case the remainder of the hostpage is sent.
2142  * Only dirty target pages are sent. Note that the host page size may
2143  * be a huge page for this block.
2144  * The saving stops at the boundary of the used_length of the block
2145  * if the RAMBlock isn't a multiple of the host page size.
2146  *
2147  * Returns the number of pages written or negative on error
2148  *
2149  * @rs: current RAM state
2150  * @ms: current migration state
2151  * @pss: data about the page we want to send
2152  * @last_stage: if we are at the completion stage
2153  */
2154 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2155                               bool last_stage)
2156 {
2157     int tmppages, pages = 0;
2158     size_t pagesize_bits =
2159         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2160     unsigned long hostpage_boundary =
2161         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2162     unsigned long start_page = pss->page;
2163     int res;
2164
2165     if (ramblock_is_ignored(pss->block)) {
2166         error_report("block %s should not be migrated !", pss->block->idstr);
2167         return 0;
2168     }
2169
2170     do {
2171         /* Check the pages is dirty and if it is send it */
2172         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2173             tmppages = ram_save_target_page(rs, pss, last_stage);
2174             if (tmppages < 0) {
2175                 return tmppages;
2176             }
2177
2178             pages += tmppages;
2179             /*
2180              * Allow rate limiting to happen in the middle of huge pages if
2181              * something is sent in the current iteration.
2182              */
2183             if (pagesize_bits > 1 && tmppages > 0) {
2184                 migration_rate_limit();
2185             }
2186         }
2187         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2188     } while ((pss->page < hostpage_boundary) &&
2189              offset_in_ramblock(pss->block,
2190                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2191     /* The offset we leave with is the min boundary of host page and block */
2192     pss->page = MIN(pss->page, hostpage_boundary) - 1;
2193
2194     res = ram_save_release_protection(rs, pss, start_page);
2195     return (res < 0 ? res : pages);
2196 }
2197
2198 /**
2199  * ram_find_and_save_block: finds a dirty page and sends it to f
2200  *
2201  * Called within an RCU critical section.
2202  *
2203  * Returns the number of pages written where zero means no dirty pages,
2204  * or negative on error
2205  *
2206  * @rs: current RAM state
2207  * @last_stage: if we are at the completion stage
2208  *
2209  * On systems where host-page-size > target-page-size it will send all the
2210  * pages in a host page that are dirty.
2211  */
2212
2213 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2214 {
2215     PageSearchStatus pss;
2216     int pages = 0;
2217     bool again, found;
2218
2219     /* No dirty page as there is zero RAM */
2220     if (!ram_bytes_total()) {
2221         return pages;
2222     }
2223
2224     pss.block = rs->last_seen_block;
2225     pss.page = rs->last_page;
2226     pss.complete_round = false;
2227
2228     if (!pss.block) {
2229         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2230     }
2231
2232     do {
2233         again = true;
2234         found = get_queued_page(rs, &pss);
2235
2236         if (!found) {
2237             /* priority queue empty, so just search for something dirty */
2238             found = find_dirty_block(rs, &pss, &again);
2239         }
2240
2241         if (found) {
2242             pages = ram_save_host_page(rs, &pss, last_stage);
2243         }
2244     } while (!pages && again);
2245
2246     rs->last_seen_block = pss.block;
2247     rs->last_page = pss.page;
2248
2249     return pages;
2250 }
2251
2252 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2253 {
2254     uint64_t pages = size / TARGET_PAGE_SIZE;
2255
2256     if (zero) {
2257         ram_counters.duplicate += pages;
2258     } else {
2259         ram_counters.normal += pages;
2260         ram_counters.transferred += size;
2261         qemu_update_position(f, size);
2262     }
2263 }
2264
2265 static uint64_t ram_bytes_total_common(bool count_ignored)
2266 {
2267     RAMBlock *block;
2268     uint64_t total = 0;
2269
2270     RCU_READ_LOCK_GUARD();
2271
2272     if (count_ignored) {
2273         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2274             total += block->used_length;
2275         }
2276     } else {
2277         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2278             total += block->used_length;
2279         }
2280     }
2281     return total;
2282 }
2283
/* Total used_length of all RAM blocks, leaving out the ignored ones. */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2288
2289 static void xbzrle_load_setup(void)
2290 {
2291     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2292 }
2293
2294 static void xbzrle_load_cleanup(void)
2295 {
2296     g_free(XBZRLE.decoded_buf);
2297     XBZRLE.decoded_buf = NULL;
2298 }
2299
2300 static void ram_state_cleanup(RAMState **rsp)
2301 {
2302     if (*rsp) {
2303         migration_page_queue_free(*rsp);
2304         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2305         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2306         g_free(*rsp);
2307         *rsp = NULL;
2308     }
2309 }
2310
2311 static void xbzrle_cleanup(void)
2312 {
2313     XBZRLE_cache_lock();
2314     if (XBZRLE.cache) {
2315         cache_fini(XBZRLE.cache);
2316         g_free(XBZRLE.encoded_buf);
2317         g_free(XBZRLE.current_buf);
2318         g_free(XBZRLE.zero_target_page);
2319         XBZRLE.cache = NULL;
2320         XBZRLE.encoded_buf = NULL;
2321         XBZRLE.current_buf = NULL;
2322         XBZRLE.zero_target_page = NULL;
2323     }
2324     XBZRLE_cache_unlock();
2325 }
2326
2327 static void ram_save_cleanup(void *opaque)
2328 {
2329     RAMState **rsp = opaque;
2330     RAMBlock *block;
2331
2332     /* We don't use dirty log with background snapshots */
2333     if (!migrate_background_snapshot()) {
2334         /* caller have hold iothread lock or is in a bh, so there is
2335          * no writing race against the migration bitmap
2336          */
2337         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2338             /*
2339              * do not stop dirty log without starting it, since
2340              * memory_global_dirty_log_stop will assert that
2341              * memory_global_dirty_log_start/stop used in pairs
2342              */
2343             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2344         }
2345     }
2346
2347     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2348         g_free(block->clear_bmap);
2349         block->clear_bmap = NULL;
2350         g_free(block->bmap);
2351         block->bmap = NULL;
2352     }
2353
2354     xbzrle_cleanup();
2355     compress_threads_save_cleanup();
2356     ram_state_cleanup(rsp);
2357 }
2358
2359 static void ram_state_reset(RAMState *rs)
2360 {
2361     rs->last_seen_block = NULL;
2362     rs->last_sent_block = NULL;
2363     rs->last_page = 0;
2364     rs->last_version = ram_list.version;
2365     rs->xbzrle_enabled = false;
2366 }
2367
/* ms, half buffered_file limit */
#define MAX_WAIT 50
2369
/*
 * Dump the first @pages bits of @todump to stderr, 128 bits per line,
 * with '1' for a set bit and '.' for a clear one.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; lines consisting only of this value are not printed.
 * @todump is handed straight to test_bit(); no NULL check is made here.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    char linebuf[129];
    int64_t linelen = 128;
    int64_t base;

    for (base = 0; base < pages; base += linelen) {
        bool differs = false;
        int64_t i;

        /* The last line may be shorter than a full line */
        if (base + linelen > pages) {
            linelen = pages - base;
        }

        for (i = 0; i < linelen; i++) {
            bool bit = test_bit(base + i, todump);

            linebuf[i] = bit ? '1' : '.';
            if (bit != expected) {
                differs = true;
            }
        }

        /* Only lines containing an unexpected bit are worth printing */
        if (differs) {
            linebuf[i] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", base, linebuf);
        }
    }
}
2403
2404 /* **** functions for postcopy ***** */
2405
2406 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2407 {
2408     struct RAMBlock *block;
2409
2410     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2411         unsigned long *bitmap = block->bmap;
2412         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2413         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2414
2415         while (run_start < range) {
2416             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2417             ram_discard_range(block->idstr,
2418                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2419                               ((ram_addr_t)(run_end - run_start))
2420                                 << TARGET_PAGE_BITS);
2421             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2422         }
2423     }
2424 }
2425
2426 /**
2427  * postcopy_send_discard_bm_ram: discard a RAMBlock
2428  *
2429  * Returns zero on success
2430  *
2431  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2432  *
2433  * @ms: current migration state
2434  * @block: RAMBlock to discard
2435  */
2436 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2437 {
2438     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2439     unsigned long current;
2440     unsigned long *bitmap = block->bmap;
2441
2442     for (current = 0; current < end; ) {
2443         unsigned long one = find_next_bit(bitmap, end, current);
2444         unsigned long zero, discard_length;
2445
2446         if (one >= end) {
2447             break;
2448         }
2449
2450         zero = find_next_zero_bit(bitmap, end, one + 1);
2451
2452         if (zero >= end) {
2453             discard_length = end - one;
2454         } else {
2455             discard_length = zero - one;
2456         }
2457         postcopy_discard_send_range(ms, one, discard_length);
2458         current = one + discard_length;
2459     }
2460
2461     return 0;
2462 }
2463
2464 /**
2465  * postcopy_each_ram_send_discard: discard all RAMBlocks
2466  *
2467  * Returns 0 for success or negative for error
2468  *
2469  * Utility for the outgoing postcopy code.
2470  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2471  *   passing it bitmap indexes and name.
2472  * (qemu_ram_foreach_block ends up passing unscaled lengths
2473  *  which would mean postcopy code would have to deal with target page)
2474  *
2475  * @ms: current migration state
2476  */
2477 static int postcopy_each_ram_send_discard(MigrationState *ms)
2478 {
2479     struct RAMBlock *block;
2480     int ret;
2481
2482     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2483         postcopy_discard_send_init(ms, block->idstr);
2484
2485         /*
2486          * Postcopy sends chunks of bitmap over the wire, but it
2487          * just needs indexes at this point, avoids it having
2488          * target page specific code.
2489          */
2490         ret = postcopy_send_discard_bm_ram(ms, block);
2491         postcopy_discard_send_finish(ms);
2492         if (ret) {
2493             return ret;
2494         }
2495     }
2496
2497     return 0;
2498 }
2499
2500 /**
2501  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2502  *
2503  * Helper for postcopy_chunk_hostpages; it's called twice to
2504  * canonicalize the two bitmaps, that are similar, but one is
2505  * inverted.
2506  *
2507  * Postcopy requires that all target pages in a hostpage are dirty or
2508  * clean, not a mix.  This function canonicalizes the bitmaps.
2509  *
2510  * @ms: current migration state
2511  * @block: block that contains the page we want to canonicalize
2512  */
2513 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2514 {
2515     RAMState *rs = ram_state;
2516     unsigned long *bitmap = block->bmap;
2517     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2518     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2519     unsigned long run_start;
2520
2521     if (block->page_size == TARGET_PAGE_SIZE) {
2522         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2523         return;
2524     }
2525
2526     /* Find a dirty page */
2527     run_start = find_next_bit(bitmap, pages, 0);
2528
2529     while (run_start < pages) {
2530
2531         /*
2532          * If the start of this run of pages is in the middle of a host
2533          * page, then we need to fixup this host page.
2534          */
2535         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2536             /* Find the end of this run */
2537             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2538             /*
2539              * If the end isn't at the start of a host page, then the
2540              * run doesn't finish at the end of a host page
2541              * and we need to discard.
2542              */
2543         }
2544
2545         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2546             unsigned long page;
2547             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2548                                                              host_ratio);
2549             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2550
2551             /* Clean up the bitmap */
2552             for (page = fixup_start_addr;
2553                  page < fixup_start_addr + host_ratio; page++) {
2554                 /*
2555                  * Remark them as dirty, updating the count for any pages
2556                  * that weren't previously dirty.
2557                  */
2558                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2559             }
2560         }
2561
2562         /* Find the next dirty page for the next iteration */
2563         run_start = find_next_bit(bitmap, pages, run_start);
2564     }
2565 }
2566
2567 /**
2568  * postcopy_chunk_hostpages: discard any partially sent host page
2569  *
2570  * Utility for the outgoing postcopy code.
2571  *
2572  * Discard any partially sent host-page size chunks, mark any partially
2573  * dirty host-page size chunks as all dirty.  In this case the host-page
2574  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2575  *
2576  * Returns zero on success
2577  *
2578  * @ms: current migration state
2579  * @block: block we want to work with
2580  */
2581 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2582 {
2583     postcopy_discard_send_init(ms, block->idstr);
2584
2585     /*
2586      * Ensure that all partially dirty host pages are made fully dirty.
2587      */
2588     postcopy_chunk_hostpages_pass(ms, block);
2589
2590     postcopy_discard_send_finish(ms);
2591     return 0;
2592 }
2593
2594 /**
2595  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2596  *
2597  * Returns zero on success
2598  *
2599  * Transmit the set of pages to be discarded after precopy to the target
2600  * these are pages that:
2601  *     a) Have been previously transmitted but are now dirty again
2602  *     b) Pages that have never been transmitted, this ensures that
2603  *        any pages on the destination that have been mapped by background
2604  *        tasks get discarded (transparent huge pages is the specific concern)
2605  * Hopefully this is pretty sparse
2606  *
2607  * @ms: current migration state
2608  */
2609 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2610 {
2611     RAMState *rs = ram_state;
2612     RAMBlock *block;
2613     int ret;
2614
2615     RCU_READ_LOCK_GUARD();
2616
2617     /* This should be our last sync, the src is now paused */
2618     migration_bitmap_sync(rs);
2619
2620     /* Easiest way to make sure we don't resume in the middle of a host-page */
2621     rs->last_seen_block = NULL;
2622     rs->last_sent_block = NULL;
2623     rs->last_page = 0;
2624
2625     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2626         /* Deal with TPS != HPS and huge pages */
2627         ret = postcopy_chunk_hostpages(ms, block);
2628         if (ret) {
2629             return ret;
2630         }
2631
2632 #ifdef DEBUG_POSTCOPY
2633         ram_debug_dump_bitmap(block->bmap, true,
2634                               block->used_length >> TARGET_PAGE_BITS);
2635 #endif
2636     }
2637     trace_ram_postcopy_send_discard_bitmap();
2638
2639     return postcopy_each_ram_send_discard(ms);
2640 }
2641
2642 /**
2643  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2644  *
2645  * Returns zero on success
2646  *
2647  * @rbname: name of the RAMBlock of the request. NULL means the
2648  *          same that last one.
2649  * @start: RAMBlock starting page
2650  * @length: RAMBlock size
2651  */
2652 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2653 {
2654     trace_ram_discard_range(rbname, start, length);
2655
2656     RCU_READ_LOCK_GUARD();
2657     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2658
2659     if (!rb) {
2660         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2661         return -1;
2662     }
2663
2664     /*
2665      * On source VM, we don't need to update the received bitmap since
2666      * we don't even have one.
2667      */
2668     if (rb->receivedmap) {
2669         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2670                      length >> qemu_target_page_bits());
2671     }
2672
2673     return ram_block_discard_range(rb, start, length);
2674 }
2675
2676 /*
2677  * For every allocation, we will try not to crash the VM if the
2678  * allocation failed.
2679  */
2680 static int xbzrle_init(void)
2681 {
2682     Error *local_err = NULL;
2683
2684     if (!migrate_use_xbzrle()) {
2685         return 0;
2686     }
2687
2688     XBZRLE_cache_lock();
2689
2690     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2691     if (!XBZRLE.zero_target_page) {
2692         error_report("%s: Error allocating zero page", __func__);
2693         goto err_out;
2694     }
2695
2696     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2697                               TARGET_PAGE_SIZE, &local_err);
2698     if (!XBZRLE.cache) {
2699         error_report_err(local_err);
2700         goto free_zero_page;
2701     }
2702
2703     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2704     if (!XBZRLE.encoded_buf) {
2705         error_report("%s: Error allocating encoded_buf", __func__);
2706         goto free_cache;
2707     }
2708
2709     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2710     if (!XBZRLE.current_buf) {
2711         error_report("%s: Error allocating current_buf", __func__);
2712         goto free_encoded_buf;
2713     }
2714
2715     /* We are all good */
2716     XBZRLE_cache_unlock();
2717     return 0;
2718
2719 free_encoded_buf:
2720     g_free(XBZRLE.encoded_buf);
2721     XBZRLE.encoded_buf = NULL;
2722 free_cache:
2723     cache_fini(XBZRLE.cache);
2724     XBZRLE.cache = NULL;
2725 free_zero_page:
2726     g_free(XBZRLE.zero_target_page);
2727     XBZRLE.zero_target_page = NULL;
2728 err_out:
2729     XBZRLE_cache_unlock();
2730     return -ENOMEM;
2731 }
2732
2733 static int ram_state_init(RAMState **rsp)
2734 {
2735     *rsp = g_try_new0(RAMState, 1);
2736
2737     if (!*rsp) {
2738         error_report("%s: Init ramstate fail", __func__);
2739         return -1;
2740     }
2741
2742     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2743     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2744     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2745
2746     /*
2747      * Count the total number of pages used by ram blocks not including any
2748      * gaps due to alignment or unplugs.
2749      * This must match with the initial values of dirty bitmap.
2750      */
2751     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2752     ram_state_reset(*rsp);
2753
2754     return 0;
2755 }
2756
2757 static void ram_list_init_bitmaps(void)
2758 {
2759     MigrationState *ms = migrate_get_current();
2760     RAMBlock *block;
2761     unsigned long pages;
2762     uint8_t shift;
2763
2764     /* Skip setting bitmap if there is no RAM */
2765     if (ram_bytes_total()) {
2766         shift = ms->clear_bitmap_shift;
2767         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2768             error_report("clear_bitmap_shift (%u) too big, using "
2769                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2770             shift = CLEAR_BITMAP_SHIFT_MAX;
2771         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2772             error_report("clear_bitmap_shift (%u) too small, using "
2773                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2774             shift = CLEAR_BITMAP_SHIFT_MIN;
2775         }
2776
2777         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2778             pages = block->max_length >> TARGET_PAGE_BITS;
2779             /*
2780              * The initial dirty bitmap for migration must be set with all
2781              * ones to make sure we'll migrate every guest RAM page to
2782              * destination.
2783              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2784              * new migration after a failed migration, ram_list.
2785              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2786              * guest memory.
2787              */
2788             block->bmap = bitmap_new(pages);
2789             bitmap_set(block->bmap, 0, pages);
2790             block->clear_bmap_shift = shift;
2791             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2792         }
2793     }
2794 }
2795
2796 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2797 {
2798     unsigned long pages;
2799     RAMBlock *rb;
2800
2801     RCU_READ_LOCK_GUARD();
2802
2803     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2804             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2805             rs->migration_dirty_pages -= pages;
2806     }
2807 }
2808
/*
 * Set up the migration dirty bitmaps for @rs.
 *
 * With the iothread and ramlist locks held, and inside an RCU read-side
 * section, allocate the per-block bitmaps and - unless a background
 * snapshot is being taken - start dirty logging and do a first precopy
 * bitmap sync.  After the locks are dropped, pages that are discarded
 * are removed from the bitmaps.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
            migration_bitmap_sync_precopy(rs);
        }
    }
    /* Release in the reverse order of acquisition. */
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    /*
     * After an eventual first bitmap sync, fixup the initial bitmap
     * containing all 1s to exclude any discarded pages from migration.
     */
    migration_bitmap_clear_discarded_pages(rs);
}
2832
2833 static int ram_init_all(RAMState **rsp)
2834 {
2835     if (ram_state_init(rsp)) {
2836         return -1;
2837     }
2838
2839     if (xbzrle_init()) {
2840         ram_state_cleanup(rsp);
2841         return -1;
2842     }
2843
2844     ram_init_bitmaps(*rsp);
2845
2846     return 0;
2847 }
2848
2849 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2850 {
2851     RAMBlock *block;
2852     uint64_t pages = 0;
2853
2854     /*
2855      * Postcopy is not using xbzrle/compression, so no need for that.
2856      * Also, since source are already halted, we don't need to care
2857      * about dirty page logging as well.
2858      */
2859
2860     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2861         pages += bitmap_count_one(block->bmap,
2862                                   block->used_length >> TARGET_PAGE_BITS);
2863     }
2864
2865     /* This may not be aligned with current bitmaps. Recalculate. */
2866     rs->migration_dirty_pages = pages;
2867
2868     ram_state_reset(rs);
2869
2870     /* Update RAMState cache of output QEMUFile */
2871     rs->f = out;
2872
2873     trace_ram_state_resume_prepare(pages);
2874 }
2875
2876 /*
2877  * This function clears bits of the free pages reported by the caller from the
2878  * migration dirty bitmap. @addr is the host address corresponding to the
2879  * start of the continuous guest free pages, and @len is the total bytes of
2880  * those pages.
2881  */
2882 void qemu_guest_free_page_hint(void *addr, size_t len)
2883 {
2884     RAMBlock *block;
2885     ram_addr_t offset;
2886     size_t used_len, start, npages;
2887     MigrationState *s = migrate_get_current();
2888
2889     /* This function is currently expected to be used during live migration */
2890     if (!migration_is_setup_or_active(s->state)) {
2891         return;
2892     }
2893
2894     for (; len > 0; len -= used_len, addr += used_len) {
2895         block = qemu_ram_block_from_host(addr, false, &offset);
2896         if (unlikely(!block || offset >= block->used_length)) {
2897             /*
2898              * The implementation might not support RAMBlock resize during
2899              * live migration, but it could happen in theory with future
2900              * updates. So we add a check here to capture that case.
2901              */
2902             error_report_once("%s unexpected error", __func__);
2903             return;
2904         }
2905
2906         if (len <= block->used_length - offset) {
2907             used_len = len;
2908         } else {
2909             used_len = block->used_length - offset;
2910         }
2911
2912         start = offset >> TARGET_PAGE_BITS;
2913         npages = used_len >> TARGET_PAGE_BITS;
2914
2915         qemu_mutex_lock(&ram_state->bitmap_mutex);
2916         /*
2917          * The skipped free pages are equavalent to be sent from clear_bmap's
2918          * perspective, so clear the bits from the memory region bitmap which
2919          * are initially set. Otherwise those skipped pages will be sent in
2920          * the next round after syncing from the memory region bitmap.
2921          */
2922         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2923         ram_state->migration_dirty_pages -=
2924                       bitmap_count_one_with_offset(block->bmap, start, npages);
2925         bitmap_clear(block->bmap, start, npages);
2926         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2927     }
2928 }
2929
2930 /*
2931  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2932  * long-running RCU critical section.  When rcu-reclaims in the code
2933  * start to become numerous it will be necessary to reduce the
2934  * granularity of these critical sections.
2935  */
2936
2937 /**
2938  * ram_save_setup: Setup RAM for migration
2939  *
2940  * Returns zero to indicate success and negative for error
2941  *
2942  * @f: QEMUFile where to send the data
2943  * @opaque: RAMState pointer
2944  */
2945 static int ram_save_setup(QEMUFile *f, void *opaque)
2946 {
2947     RAMState **rsp = opaque;
2948     RAMBlock *block;
2949
2950     if (compress_threads_save_setup()) {
2951         return -1;
2952     }
2953
2954     /* migration has already setup the bitmap, reuse it. */
2955     if (!migration_in_colo_state()) {
2956         if (ram_init_all(rsp) != 0) {
2957             compress_threads_save_cleanup();
2958             return -1;
2959         }
2960     }
2961     (*rsp)->f = f;
2962
2963     WITH_RCU_READ_LOCK_GUARD() {
2964         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2965
2966         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2967             qemu_put_byte(f, strlen(block->idstr));
2968             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2969             qemu_put_be64(f, block->used_length);
2970             if (migrate_postcopy_ram() && block->page_size !=
2971                                           qemu_host_page_size) {
2972                 qemu_put_be64(f, block->page_size);
2973             }
2974             if (migrate_ignore_shared()) {
2975                 qemu_put_be64(f, block->mr->addr);
2976             }
2977         }
2978     }
2979
2980     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2981     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2982
2983     multifd_send_sync_main(f);
2984     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2985     qemu_fflush(f);
2986
2987     return 0;
2988 }
2989
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the rate limit is hit (and no page requests
 * are queued), an error occurs, nothing is left to send, or MAX_WAIT
 * milliseconds have elapsed.
 *
 * Returns a negative value on error; otherwise 1 if
 * ram_find_and_save_block() reported that no pages were left to send,
 * and 0 if there may be more to do.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll hold this lock for a little while, but that's okay for two
     * reasons. Firstly, the only other thread that may take it is the one
     * calling qemu_guest_free_page_hint(), which should be rare; secondly,
     * see MAX_WAIT below (if curious, further see commit 4508bd9ed8053ce),
     * which guarantees that we release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep going while we are under the rate limit, or while there
         * are still queued page requests to serve.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check
             * every 64 iterations.
             */
            if ((i & 63) == 0) {
                /* Elapsed time since t0, converted from ns to ms. */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    /* Terminate this round with an EOS marker and pick up any file error. */
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker just written. */
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3105
3106 /**
3107  * ram_save_complete: function called to send the remaining amount of ram
3108  *
3109  * Returns zero to indicate success or negative on error
3110  *
3111  * Called with iothread lock
3112  *
3113  * @f: QEMUFile where to send the data
3114  * @opaque: RAMState pointer
3115  */
3116 static int ram_save_complete(QEMUFile *f, void *opaque)
3117 {
3118     RAMState **temp = opaque;
3119     RAMState *rs = *temp;
3120     int ret = 0;
3121
3122     WITH_RCU_READ_LOCK_GUARD() {
3123         if (!migration_in_postcopy()) {
3124             migration_bitmap_sync_precopy(rs);
3125         }
3126
3127         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3128
3129         /* try transferring iterative blocks of memory */
3130
3131         /* flush all remaining blocks regardless of rate limiting */
3132         while (true) {
3133             int pages;
3134
3135             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3136             /* no more blocks to sent */
3137             if (pages == 0) {
3138                 break;
3139             }
3140             if (pages < 0) {
3141                 ret = pages;
3142                 break;
3143             }
3144         }
3145
3146         flush_compressed_data(rs);
3147         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3148     }
3149
3150     if (ret >= 0) {
3151         multifd_send_sync_main(rs->f);
3152         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3153         qemu_fflush(f);
3154     }
3155
3156     return ret;
3157 }
3158
3159 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3160                              uint64_t *res_precopy_only,
3161                              uint64_t *res_compatible,
3162                              uint64_t *res_postcopy_only)
3163 {
3164     RAMState **temp = opaque;
3165     RAMState *rs = *temp;
3166     uint64_t remaining_size;
3167
3168     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3169
3170     if (!migration_in_postcopy() &&
3171         remaining_size < max_size) {
3172         qemu_mutex_lock_iothread();
3173         WITH_RCU_READ_LOCK_GUARD() {
3174             migration_bitmap_sync_precopy(rs);
3175         }
3176         qemu_mutex_unlock_iothread();
3177         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3178     }
3179
3180     if (migrate_postcopy_ram()) {
3181         /* We can do postcopy, and all the data is postcopiable */
3182         *res_compatible += remaining_size;
3183     } else {
3184         *res_precopy_only += remaining_size;
3185     }
3186 }
3187
3188 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3189 {
3190     unsigned int xh_len;
3191     int xh_flags;
3192     uint8_t *loaded_data;
3193
3194     /* extract RLE header */
3195     xh_flags = qemu_get_byte(f);
3196     xh_len = qemu_get_be16(f);
3197
3198     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3199         error_report("Failed to load XBZRLE page - wrong compression!");
3200         return -1;
3201     }
3202
3203     if (xh_len > TARGET_PAGE_SIZE) {
3204         error_report("Failed to load XBZRLE page - len overflow!");
3205         return -1;
3206     }
3207     loaded_data = XBZRLE.decoded_buf;
3208     /* load data and decode */
3209     /* it can change loaded_data to point to an internal buffer */
3210     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3211
3212     /* decode RLE */
3213     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3214                              TARGET_PAGE_SIZE) == -1) {
3215         error_report("Failed to load XBZRLE page - decode error!");
3216         return -1;
3217     }
3218
3219     return 0;
3220 }
3221
3222 /**
3223  * ram_block_from_stream: read a RAMBlock id from the migration stream
3224  *
3225  * Must be called from within a rcu critical section.
3226  *
3227  * Returns a pointer from within the RCU-protected ram_list.
3228  *
3229  * @f: QEMUFile where to read the data from
3230  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3231  */
3232 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3233 {
3234     static RAMBlock *block;
3235     char id[256];
3236     uint8_t len;
3237
3238     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3239         if (!block) {
3240             error_report("Ack, bad migration stream!");
3241             return NULL;
3242         }
3243         return block;
3244     }
3245
3246     len = qemu_get_byte(f);
3247     qemu_get_buffer(f, (uint8_t *)id, len);
3248     id[len] = 0;
3249
3250     block = qemu_ram_block_by_name(id);
3251     if (!block) {
3252         error_report("Can't find block %s", id);
3253         return NULL;
3254     }
3255
3256     if (ramblock_is_ignored(block)) {
3257         error_report("block %s should not be migrated !", id);
3258         return NULL;
3259     }
3260
3261     return block;
3262 }
3263
3264 static inline void *host_from_ram_block_offset(RAMBlock *block,
3265                                                ram_addr_t offset)
3266 {
3267     if (!offset_in_ramblock(block, offset)) {
3268         return NULL;
3269     }
3270
3271     return block->host + offset;
3272 }
3273
3274 static void *host_page_from_ram_block_offset(RAMBlock *block,
3275                                              ram_addr_t offset)
3276 {
3277     /* Note: Explicitly no check against offset_in_ramblock(). */
3278     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3279                                    block->page_size);
3280 }
3281
3282 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3283                                                          ram_addr_t offset)
3284 {
3285     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3286 }
3287
3288 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3289                              ram_addr_t offset, bool record_bitmap)
3290 {
3291     if (!offset_in_ramblock(block, offset)) {
3292         return NULL;
3293     }
3294     if (!block->colo_cache) {
3295         error_report("%s: colo_cache is NULL in block :%s",
3296                      __func__, block->idstr);
3297         return NULL;
3298     }
3299
3300     /*
3301     * During colo checkpoint, we need bitmap of these migrated pages.
3302     * It help us to decide which pages in ram cache should be flushed
3303     * into VM's RAM later.
3304     */
3305     if (record_bitmap &&
3306         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3307         ram_state->migration_dirty_pages++;
3308     }
3309     return block->colo_cache + offset;
3310 }
3311
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  When @ch is zero and the range
 * already reads as all zeroes, the memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if we want zeroes and already have them. */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3328
3329 /* return the size after decompression, or negative value on error */
3330 static int
3331 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3332                      const uint8_t *source, size_t source_len)
3333 {
3334     int err;
3335
3336     err = inflateReset(stream);
3337     if (err != Z_OK) {
3338         return -1;
3339     }
3340
3341     stream->avail_in = source_len;
3342     stream->next_in = (uint8_t *)source;
3343     stream->avail_out = dest_len;
3344     stream->next_out = dest;
3345
3346     err = inflate(stream, Z_NO_FLUSH);
3347     if (err != Z_STREAM_END) {
3348         return -1;
3349     }
3350
3351     return stream->total_out;
3352 }
3353
/*
 * Thread function of one decompression worker.
 *
 * @opaque: the DecompressParam this worker serves
 *
 * Until param->quit is set, the worker waits on param->cond for a
 * destination to be posted in param->des, inflates param->compbuf
 * (param->len bytes) into one target page at that destination, and then
 * sets param->done under decomp_done_lock and signals decomp_done_cond.
 *
 * Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the request and clear the slot while holding the mutex. */
            des = param->des;
            len = param->len;
            param->des = 0;
            /* The decompression itself runs without param->mutex held. */
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* A failure is only reported when error checking is enabled. */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Announce completion to whoever waits on decomp_done_cond. */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            /* No request pending: sleep until signalled. */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3392
3393 static int wait_for_decompress_done(void)
3394 {
3395     int idx, thread_count;
3396
3397     if (!migrate_use_compression()) {
3398         return 0;
3399     }
3400
3401     thread_count = migrate_decompress_threads();
3402     qemu_mutex_lock(&decomp_done_lock);
3403     for (idx = 0; idx < thread_count; idx++) {
3404         while (!decomp_param[idx].done) {
3405             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3406         }
3407     }
3408     qemu_mutex_unlock(&decomp_done_lock);
3409     return qemu_file_get_error(decomp_file);
3410 }
3411
3412 static void compress_threads_load_cleanup(void)
3413 {
3414     int i, thread_count;
3415
3416     if (!migrate_use_compression()) {
3417         return;
3418     }
3419     thread_count = migrate_decompress_threads();
3420     for (i = 0; i < thread_count; i++) {
3421         /*
3422          * we use it as a indicator which shows if the thread is
3423          * properly init'd or not
3424          */
3425         if (!decomp_param[i].compbuf) {
3426             break;
3427         }
3428
3429         qemu_mutex_lock(&decomp_param[i].mutex);
3430         decomp_param[i].quit = true;
3431         qemu_cond_signal(&decomp_param[i].cond);
3432         qemu_mutex_unlock(&decomp_param[i].mutex);
3433     }
3434     for (i = 0; i < thread_count; i++) {
3435         if (!decomp_param[i].compbuf) {
3436             break;
3437         }
3438
3439         qemu_thread_join(decompress_threads + i);
3440         qemu_mutex_destroy(&decomp_param[i].mutex);
3441         qemu_cond_destroy(&decomp_param[i].cond);
3442         inflateEnd(&decomp_param[i].stream);
3443         g_free(decomp_param[i].compbuf);
3444         decomp_param[i].compbuf = NULL;
3445     }
3446     g_free(decompress_threads);
3447     g_free(decomp_param);
3448     decompress_threads = NULL;
3449     decomp_param = NULL;
3450     decomp_file = NULL;
3451 }
3452
3453 static int compress_threads_load_setup(QEMUFile *f)
3454 {
3455     int i, thread_count;
3456
3457     if (!migrate_use_compression()) {
3458         return 0;
3459     }
3460
3461     thread_count = migrate_decompress_threads();
3462     decompress_threads = g_new0(QemuThread, thread_count);
3463     decomp_param = g_new0(DecompressParam, thread_count);
3464     qemu_mutex_init(&decomp_done_lock);
3465     qemu_cond_init(&decomp_done_cond);
3466     decomp_file = f;
3467     for (i = 0; i < thread_count; i++) {
3468         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3469             goto exit;
3470         }
3471
3472         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3473         qemu_mutex_init(&decomp_param[i].mutex);
3474         qemu_cond_init(&decomp_param[i].cond);
3475         decomp_param[i].done = true;
3476         decomp_param[i].quit = false;
3477         qemu_thread_create(decompress_threads + i, "decompress",
3478                            do_data_decompress, decomp_param + i,
3479                            QEMU_THREAD_JOINABLE);
3480     }
3481     return 0;
3482 exit:
3483     compress_threads_load_cleanup();
3484     return -1;
3485 }
3486
/*
 * Hand one compressed page to an idle decompression worker.
 *
 * @f: QEMUFile to read the compressed data from
 * @host: destination the worker decompresses into
 * @len: number of compressed bytes to read from @f
 *
 * The first worker whose 'done' flag is set is picked: it is marked
 * busy, @len bytes are read from @f into its compbuf, and it is
 * signalled with @host as destination.  If no worker is idle, wait on
 * decomp_done_cond and scan again.
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    /* decomp_done_lock protects the workers' 'done' flags. */
    QEMU_LOCK_GUARD(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                /* Fill in the request under the worker's own mutex. */
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        /* idx < thread_count means a worker took the job. */
        if (idx < thread_count) {
            break;
        } else {
            /* Everyone is busy: wait for a completion, then rescan. */
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
}
3514
3515 static void colo_init_ram_state(void)
3516 {
3517     ram_state_init(&ram_state);
3518 }
3519
3520 /*
3521  * colo cache: this is for secondary VM, we cache the whole
3522  * memory of the secondary VM, it is need to hold the global lock
3523  * to call this helper.
3524  */
3525 int colo_init_ram_cache(void)
3526 {
3527     RAMBlock *block;
3528
3529     WITH_RCU_READ_LOCK_GUARD() {
3530         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3531             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3532                                                     NULL, false, false);
3533             if (!block->colo_cache) {
3534                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3535                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3536                              block->used_length);
3537                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3538                     if (block->colo_cache) {
3539                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3540                         block->colo_cache = NULL;
3541                     }
3542                 }
3543                 return -errno;
3544             }
3545         }
3546     }
3547
3548     /*
3549     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3550     * with to decide which page in cache should be flushed into SVM's RAM. Here
3551     * we use the same name 'ram_bitmap' as for migration.
3552     */
3553     if (ram_bytes_total()) {
3554         RAMBlock *block;
3555
3556         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3557             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3558             block->bmap = bitmap_new(pages);
3559         }
3560     }
3561
3562     colo_init_ram_state();
3563     return 0;
3564 }
3565
3566 /* TODO: duplicated with ram_init_bitmaps */
3567 void colo_incoming_start_dirty_log(void)
3568 {
3569     RAMBlock *block = NULL;
3570     /* For memory_global_dirty_log_start below. */
3571     qemu_mutex_lock_iothread();
3572     qemu_mutex_lock_ramlist();
3573
3574     memory_global_dirty_log_sync();
3575     WITH_RCU_READ_LOCK_GUARD() {
3576         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3577             ramblock_sync_dirty_bitmap(ram_state, block);
3578             /* Discard this dirty bitmap record */
3579             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3580         }
3581         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3582     }
3583     ram_state->migration_dirty_pages = 0;
3584     qemu_mutex_unlock_ramlist();
3585     qemu_mutex_unlock_iothread();
3586 }
3587
3588 /* It is need to hold the global lock to call this helper */
3589 void colo_release_ram_cache(void)
3590 {
3591     RAMBlock *block;
3592
3593     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3594     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3595         g_free(block->bmap);
3596         block->bmap = NULL;
3597     }
3598
3599     WITH_RCU_READ_LOCK_GUARD() {
3600         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3601             if (block->colo_cache) {
3602                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3603                 block->colo_cache = NULL;
3604             }
3605         }
3606     }
3607     ram_state_cleanup(&ram_state);
3608 }
3609
3610 /**
3611  * ram_load_setup: Setup RAM for migration incoming side
3612  *
3613  * Returns zero to indicate success and negative for error
3614  *
3615  * @f: QEMUFile where to receive the data
3616  * @opaque: RAMState pointer
3617  */
3618 static int ram_load_setup(QEMUFile *f, void *opaque)
3619 {
3620     if (compress_threads_load_setup(f)) {
3621         return -1;
3622     }
3623
3624     xbzrle_load_setup();
3625     ramblock_recv_map_init();
3626
3627     return 0;
3628 }
3629
3630 static int ram_load_cleanup(void *opaque)
3631 {
3632     RAMBlock *rb;
3633
3634     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3635         qemu_ram_block_writeback(rb);
3636     }
3637
3638     xbzrle_load_cleanup();
3639     compress_threads_load_cleanup();
3640
3641     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3642         g_free(rb->receivedmap);
3643         rb->receivedmap = NULL;
3644     }
3645
3646     return 0;
3647 }
3648
3649 /**
3650  * ram_postcopy_incoming_init: allocate postcopy data structures
3651  *
3652  * Returns 0 for success and negative if there was one error
3653  *
3654  * @mis: current migration incoming state
3655  *
3656  * Allocate data structures etc needed by incoming migration with
3657  * postcopy-ram. postcopy-ram's similarly names
3658  * postcopy_ram_incoming_init does the work.
3659  */
3660 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3661 {
3662     return postcopy_ram_incoming_init(mis);
3663 }
3664
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Reads records from @f until an error occurs or a record carrying
 * RAM_SAVE_FLAG_EOS is seen.  Page data is gathered one host page at a
 * time and handed to postcopy_place_page()/postcopy_place_page_zero()
 * once every target page of that host page has arrived.
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to read the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    /* Set once the last target page of the current host page was read */
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    /* Destination host page the current run of target pages belongs to */
    void *host_page = NULL;
    /* Stays true while every target page of the host page was zero */
    bool all_zero = true;
    /* Number of target pages received so far for the current host page */
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* The bits below the target page mask carry the record flags */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /*
             * The first target page fixes which host page is being filled;
             * every later one must belong to that same host page.
             */
            if (target_pages == 1) {
                host_page = host_page_from_ram_block_offset(block, addr);
            } else if (host_page != host_page_from_ram_block_offset(block,
                                                                    addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page %p/%p", host_page,
                             host_page_from_ram_block_offset(block, addr));
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            /* Default source; RAM_SAVE_FLAG_PAGE may redirect it below */
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                /* Leaves the switch only; the non-zero ret ends the loop */
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        /* Placement is skipped whenever an error was recorded above */
        if (!ret && place_needed) {
            if (all_zero) {
                ret = postcopy_place_page_zero(mis, host_page, block);
            } else {
                ret = postcopy_place_page(mis, host_page, place_source,
                                          block);
            }
            /* Start collecting the next host page from scratch */
            place_needed = false;
            target_pages = 0;
            /* Assume we have a zero page until we detect something different */
            all_zero = true;
        }
    }

    return ret;
}
3846
3847 static bool postcopy_is_advised(void)
3848 {
3849     PostcopyState ps = postcopy_state_get();
3850     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3851 }
3852
3853 static bool postcopy_is_running(void)
3854 {
3855     PostcopyState ps = postcopy_state_get();
3856     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3857 }
3858
3859 /*
3860  * Flush content of RAM cache into SVM's memory.
3861  * Only flush the pages that be dirtied by PVM or SVM or both.
3862  */
3863 void colo_flush_ram_cache(void)
3864 {
3865     RAMBlock *block = NULL;
3866     void *dst_host;
3867     void *src_host;
3868     unsigned long offset = 0;
3869
3870     memory_global_dirty_log_sync();
3871     qemu_mutex_lock(&ram_state->bitmap_mutex);
3872     WITH_RCU_READ_LOCK_GUARD() {
3873         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3874             ramblock_sync_dirty_bitmap(ram_state, block);
3875         }
3876     }
3877
3878     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3879     WITH_RCU_READ_LOCK_GUARD() {
3880         block = QLIST_FIRST_RCU(&ram_list.blocks);
3881
3882         while (block) {
3883             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3884
3885             if (!offset_in_ramblock(block,
3886                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3887                 offset = 0;
3888                 block = QLIST_NEXT_RCU(block, next);
3889             } else {
3890                 migration_bitmap_clear_dirty(ram_state, block, offset);
3891                 dst_host = block->host
3892                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3893                 src_host = block->colo_cache
3894                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3895                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3896             }
3897         }
3898     }
3899     trace_colo_flush_ram_cache_end();
3900     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3901 }
3902
3903 /**
3904  * ram_load_precopy: load pages in precopy case
3905  *
3906  * Returns 0 for success or -errno in case of error
3907  *
3908  * Called in precopy mode by ram_load().
3909  * rcu_read_lock is taken prior to this being called.
3910  *
3911  * @f: QEMUFile where to send the data
3912  */
3913 static int ram_load_precopy(QEMUFile *f)
3914 {
3915     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3916     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3917     bool postcopy_advised = postcopy_is_advised();
3918     if (!migrate_use_compression()) {
3919         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3920     }
3921
3922     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3923         ram_addr_t addr, total_ram_bytes;
3924         void *host = NULL, *host_bak = NULL;
3925         uint8_t ch;
3926
3927         /*
3928          * Yield periodically to let main loop run, but an iteration of
3929          * the main loop is expensive, so do it each some iterations
3930          */
3931         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3932             aio_co_schedule(qemu_get_current_aio_context(),
3933                             qemu_coroutine_self());
3934             qemu_coroutine_yield();
3935         }
3936         i++;
3937
3938         addr = qemu_get_be64(f);
3939         flags = addr & ~TARGET_PAGE_MASK;
3940         addr &= TARGET_PAGE_MASK;
3941
3942         if (flags & invalid_flags) {
3943             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3944                 error_report("Received an unexpected compressed page");
3945             }
3946
3947             ret = -EINVAL;
3948             break;
3949         }
3950
3951         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3952                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3953             RAMBlock *block = ram_block_from_stream(f, flags);
3954
3955             host = host_from_ram_block_offset(block, addr);
3956             /*
3957              * After going into COLO stage, we should not load the page
3958              * into SVM's memory directly, we put them into colo_cache firstly.
3959              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3960              * Previously, we copied all these memory in preparing stage of COLO
3961              * while we need to stop VM, which is a time-consuming process.
3962              * Here we optimize it by a trick, back-up every page while in
3963              * migration process while COLO is enabled, though it affects the
3964              * speed of the migration, but it obviously reduce the downtime of
3965              * back-up all SVM'S memory in COLO preparing stage.
3966              */
3967             if (migration_incoming_colo_enabled()) {
3968                 if (migration_incoming_in_colo_state()) {
3969                     /* In COLO stage, put all pages into cache temporarily */
3970                     host = colo_cache_from_block_offset(block, addr, true);
3971                 } else {
3972                    /*
3973                     * In migration stage but before COLO stage,
3974                     * Put all pages into both cache and SVM's memory.
3975                     */
3976                     host_bak = colo_cache_from_block_offset(block, addr, false);
3977                 }
3978             }
3979             if (!host) {
3980                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3981                 ret = -EINVAL;
3982                 break;
3983             }
3984             if (!migration_incoming_in_colo_state()) {
3985                 ramblock_recv_bitmap_set(block, host);
3986             }
3987
3988             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3989         }
3990
3991         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3992         case RAM_SAVE_FLAG_MEM_SIZE:
3993             /* Synchronize RAM block list */
3994             total_ram_bytes = addr;
3995             while (!ret && total_ram_bytes) {
3996                 RAMBlock *block;
3997                 char id[256];
3998                 ram_addr_t length;
3999
4000                 len = qemu_get_byte(f);
4001                 qemu_get_buffer(f, (uint8_t *)id, len);
4002                 id[len] = 0;
4003                 length = qemu_get_be64(f);
4004
4005                 block = qemu_ram_block_by_name(id);
4006                 if (block && !qemu_ram_is_migratable(block)) {
4007                     error_report("block %s should not be migrated !", id);
4008                     ret = -EINVAL;
4009                 } else if (block) {
4010                     if (length != block->used_length) {
4011                         Error *local_err = NULL;
4012
4013                         ret = qemu_ram_resize(block, length,
4014                                               &local_err);
4015                         if (local_err) {
4016                             error_report_err(local_err);
4017                         }
4018                     }
4019                     /* For postcopy we need to check hugepage sizes match */
4020                     if (postcopy_advised && migrate_postcopy_ram() &&
4021                         block->page_size != qemu_host_page_size) {
4022                         uint64_t remote_page_size = qemu_get_be64(f);
4023                         if (remote_page_size != block->page_size) {
4024                             error_report("Mismatched RAM page size %s "
4025                                          "(local) %zd != %" PRId64,
4026                                          id, block->page_size,
4027                                          remote_page_size);
4028                             ret = -EINVAL;
4029                         }
4030                     }
4031                     if (migrate_ignore_shared()) {
4032                         hwaddr addr = qemu_get_be64(f);
4033                         if (ramblock_is_ignored(block) &&
4034                             block->mr->addr != addr) {
4035                             error_report("Mismatched GPAs for block %s "
4036                                          "%" PRId64 "!= %" PRId64,
4037                                          id, (uint64_t)addr,
4038                                          (uint64_t)block->mr->addr);
4039                             ret = -EINVAL;
4040                         }
4041                     }
4042                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4043                                           block->idstr);
4044                 } else {
4045                     error_report("Unknown ramblock \"%s\", cannot "
4046                                  "accept migration", id);
4047                     ret = -EINVAL;
4048                 }
4049
4050                 total_ram_bytes -= length;
4051             }
4052             break;
4053
4054         case RAM_SAVE_FLAG_ZERO:
4055             ch = qemu_get_byte(f);
4056             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4057             break;
4058
4059         case RAM_SAVE_FLAG_PAGE:
4060             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4061             break;
4062
4063         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4064             len = qemu_get_be32(f);
4065             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4066                 error_report("Invalid compressed data length: %d", len);
4067                 ret = -EINVAL;
4068                 break;
4069             }
4070             decompress_data_with_multi_threads(f, host, len);
4071             break;
4072
4073         case RAM_SAVE_FLAG_XBZRLE:
4074             if (load_xbzrle(f, addr, host) < 0) {
4075                 error_report("Failed to decompress XBZRLE page at "
4076                              RAM_ADDR_FMT, addr);
4077                 ret = -EINVAL;
4078                 break;
4079             }
4080             break;
4081         case RAM_SAVE_FLAG_EOS:
4082             /* normal exit */
4083             multifd_recv_sync_main();
4084             break;
4085         default:
4086             if (flags & RAM_SAVE_FLAG_HOOK) {
4087                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4088             } else {
4089                 error_report("Unknown combination of migration flags: 0x%x",
4090                              flags);
4091                 ret = -EINVAL;
4092             }
4093         }
4094         if (!ret) {
4095             ret = qemu_file_get_error(f);
4096         }
4097         if (!ret && host_bak) {
4098             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4099         }
4100     }
4101
4102     ret |= wait_for_decompress_done();
4103     return ret;
4104 }
4105
4106 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4107 {
4108     int ret = 0;
4109     static uint64_t seq_iter;
4110     /*
4111      * If system is running in postcopy mode, page inserts to host memory must
4112      * be atomic
4113      */
4114     bool postcopy_running = postcopy_is_running();
4115
4116     seq_iter++;
4117
4118     if (version_id != 4) {
4119         return -EINVAL;
4120     }
4121
4122     /*
4123      * This RCU critical section can be very long running.
4124      * When RCU reclaims in the code start to become numerous,
4125      * it will be necessary to reduce the granularity of this
4126      * critical section.
4127      */
4128     WITH_RCU_READ_LOCK_GUARD() {
4129         if (postcopy_running) {
4130             ret = ram_load_postcopy(f);
4131         } else {
4132             ret = ram_load_precopy(f);
4133         }
4134     }
4135     trace_ram_load_complete(ret, seq_iter);
4136
4137     return ret;
4138 }
4139
4140 static bool ram_has_postcopy(void *opaque)
4141 {
4142     RAMBlock *rb;
4143     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4144         if (ramblock_is_pmem(rb)) {
4145             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4146                          "is not supported now!", rb->idstr, rb->host);
4147             return false;
4148         }
4149     }
4150
4151     return migrate_postcopy_ram();
4152 }
4153
4154 /* Sync all the dirty bitmap with destination VM.  */
4155 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4156 {
4157     RAMBlock *block;
4158     QEMUFile *file = s->to_dst_file;
4159     int ramblock_count = 0;
4160
4161     trace_ram_dirty_bitmap_sync_start();
4162
4163     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4164         qemu_savevm_send_recv_bitmap(file, block->idstr);
4165         trace_ram_dirty_bitmap_request(block->idstr);
4166         ramblock_count++;
4167     }
4168
4169     trace_ram_dirty_bitmap_sync_wait();
4170
4171     /* Wait until all the ramblocks' dirty bitmap synced */
4172     while (ramblock_count--) {
4173         qemu_sem_wait(&s->rp_state.rp_sem);
4174     }
4175
4176     trace_ram_dirty_bitmap_sync_complete();
4177
4178     return 0;
4179 }
4180
4181 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4182 {
4183     qemu_sem_post(&s->rp_state.rp_sem);
4184 }
4185
4186 /*
4187  * Read the received bitmap, revert it as the initial dirty bitmap.
4188  * This is only used when the postcopy migration is paused but wants
4189  * to resume from a middle point.
4190  */
4191 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4192 {
4193     int ret = -EINVAL;
4194     /* from_dst_file is always valid because we're within rp_thread */
4195     QEMUFile *file = s->rp_state.from_dst_file;
4196     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4197     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4198     uint64_t size, end_mark;
4199
4200     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4201
4202     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4203         error_report("%s: incorrect state %s", __func__,
4204                      MigrationStatus_str(s->state));
4205         return -EINVAL;
4206     }
4207
4208     /*
4209      * Note: see comments in ramblock_recv_bitmap_send() on why we
4210      * need the endianness conversion, and the paddings.
4211      */
4212     local_size = ROUND_UP(local_size, 8);
4213
4214     /* Add paddings */
4215     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4216
4217     size = qemu_get_be64(file);
4218
4219     /* The size of the bitmap should match with our ramblock */
4220     if (size != local_size) {
4221         error_report("%s: ramblock '%s' bitmap size mismatch "
4222                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4223                      block->idstr, size, local_size);
4224         ret = -EINVAL;
4225         goto out;
4226     }
4227
4228     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4229     end_mark = qemu_get_be64(file);
4230
4231     ret = qemu_file_get_error(file);
4232     if (ret || size != local_size) {
4233         error_report("%s: read bitmap failed for ramblock '%s': %d"
4234                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4235                      __func__, block->idstr, ret, local_size, size);
4236         ret = -EIO;
4237         goto out;
4238     }
4239
4240     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4241         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4242                      __func__, block->idstr, end_mark);
4243         ret = -EINVAL;
4244         goto out;
4245     }
4246
4247     /*
4248      * Endianness conversion. We are during postcopy (though paused).
4249      * The dirty bitmap won't change. We can directly modify it.
4250      */
4251     bitmap_from_le(block->bmap, le_bitmap, nbits);
4252
4253     /*
4254      * What we received is "received bitmap". Revert it as the initial
4255      * dirty bitmap for this ramblock.
4256      */
4257     bitmap_complement(block->bmap, block->bmap, nbits);
4258
4259     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4260     ramblock_dirty_bitmap_clear_discarded_pages(block);
4261
4262     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4263     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4264
4265     /*
4266      * We succeeded to sync bitmap for current ramblock. If this is
4267      * the last one to sync, we need to notify the main send thread.
4268      */
4269     ram_dirty_bitmap_reload_notify(s);
4270
4271     ret = 0;
4272 out:
4273     g_free(le_bitmap);
4274     return ret;
4275 }
4276
4277 static int ram_resume_prepare(MigrationState *s, void *opaque)
4278 {
4279     RAMState *rs = *(RAMState **)opaque;
4280     int ret;
4281
4282     ret = ram_dirty_bitmap_sync_all(s, rs);
4283     if (ret) {
4284         return ret;
4285     }
4286
4287     ram_state_resume_prepare(rs, s->to_dst_file);
4288
4289     return 0;
4290 }
4291
4292 static SaveVMHandlers savevm_ram_handlers = {
4293     .save_setup = ram_save_setup,
4294     .save_live_iterate = ram_save_iterate,
4295     .save_live_complete_postcopy = ram_save_complete,
4296     .save_live_complete_precopy = ram_save_complete,
4297     .has_postcopy = ram_has_postcopy,
4298     .save_live_pending = ram_save_pending,
4299     .load_state = ram_load,
4300     .save_cleanup = ram_save_cleanup,
4301     .load_setup = ram_load_setup,
4302     .load_cleanup = ram_load_cleanup,
4303     .resume_prepare = ram_resume_prepare,
4304 };
4305
4306 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4307                                       size_t old_size, size_t new_size)
4308 {
4309     PostcopyState ps = postcopy_state_get();
4310     ram_addr_t offset;
4311     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4312     Error *err = NULL;
4313
4314     if (ramblock_is_ignored(rb)) {
4315         return;
4316     }
4317
4318     if (!migration_is_idle()) {
4319         /*
4320          * Precopy code on the source cannot deal with the size of RAM blocks
4321          * changing at random points in time - especially after sending the
4322          * RAM block sizes in the migration stream, they must no longer change.
4323          * Abort and indicate a proper reason.
4324          */
4325         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4326         migrate_set_error(migrate_get_current(), err);
4327         error_free(err);
4328         migration_cancel();
4329     }
4330
4331     switch (ps) {
4332     case POSTCOPY_INCOMING_ADVISE:
4333         /*
4334          * Update what ram_postcopy_incoming_init()->init_range() does at the
4335          * time postcopy was advised. Syncing RAM blocks with the source will
4336          * result in RAM resizes.
4337          */
4338         if (old_size < new_size) {
4339             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4340                 error_report("RAM block '%s' discard of resized RAM failed",
4341                              rb->idstr);
4342             }
4343         }
4344         rb->postcopy_length = new_size;
4345         break;
4346     case POSTCOPY_INCOMING_NONE:
4347     case POSTCOPY_INCOMING_RUNNING:
4348     case POSTCOPY_INCOMING_END:
4349         /*
4350          * Once our guest is running, postcopy does no longer care about
4351          * resizes. When growing, the new memory was not available on the
4352          * source, no handler needed.
4353          */
4354         break;
4355     default:
4356         error_report("RAM block '%s' resized during postcopy state: %d",
4357                      rb->idstr, ps);
4358         exit(-1);
4359     }
4360 }
4361
/*
 * RAM block notifier added by ram_mig_init(); only the resize callback
 * is set.
 */
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
4365
/*
 * ram_mig_init: one-time setup of RAM migration.
 *
 * Initialises the XBZRLE lock, registers the "ram" live savevm section
 * (version 4) with &ram_state as the handlers' opaque pointer, and adds
 * the RAM block resize notifier.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}