migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
  63 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  64  * worked for pages that where filled with the same char.  We switched
  65  * it to only search for the zero value.  And to avoid confusion with
  66  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  67  */
  68
  69 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  70 #define RAM_SAVE_FLAG_ZERO     0x02
  71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  72 #define RAM_SAVE_FLAG_PAGE     0x08
  73 #define RAM_SAVE_FLAG_EOS      0x10
  74 #define RAM_SAVE_FLAG_CONTINUE 0x20
  75 #define RAM_SAVE_FLAG_XBZRLE   0x40
  76 /* 0x80 is reserved in migration.h start with 0x100 next */
  77 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
  84 XBZRLECacheStats xbzrle_counters;
  85
  86 /* struct contains XBZRLE cache and a static page
  87    used by the compression */
  88 static struct {
  89     /* buffer used for XBZRLE encoding */
  90     uint8_t *encoded_buf;
  91     /* buffer for storing page content */
  92     uint8_t *current_buf;
  93     /* Cache for XBZRLE, Protected by lock. */
  94     PageCache *cache;
  95     QemuMutex lock;
  96     /* it will store a page full of zeros */
  97     uint8_t *zero_target_page;
  98     /* buffer used for XBZRLE decoding */
  99     uint8_t *decoded_buf;
 100 } XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
 161 static bool ramblock_is_ignored(RAMBlock *block)
 162 {
 163     return !qemu_ram_is_migratable(block) ||
 164            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 165 }
 166
 167 /* Should be holding either ram_list.mutex, or the RCU lock. */
 168 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
 169     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 170         if (ramblock_is_ignored(block)) {} else
 171
 172 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 173     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 174         if (!qemu_ram_is_migratable(block)) {} else
 175
 176 #undef RAMBLOCK_FOREACH
 177
 178 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 179 {
 180     RAMBlock *block;
 181     int ret = 0;
 182
 183     RCU_READ_LOCK_GUARD();
 184
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     return ret;
 192 }
 193
 194 static void ramblock_recv_map_init(void)
 195 {
 196     RAMBlock *rb;
 197
 198     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 199         assert(!rb->receivedmap);
 200         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 201     }
 202 }
 203
 204 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 205 {
 206     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 207                     rb->receivedmap);
 208 }
 209
 210 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 211 {
 212     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 213 }
 214
 215 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 216 {
 217     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 218 }
 219
 220 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 221                                     size_t nr)
 222 {
 223     bitmap_set_atomic(rb->receivedmap,
 224                       ramblock_recv_bitmap_offset(host_addr, rb),
 225                       nr);
 226 }
 227
 228 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 229
 230 /*
 231  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 232  *
 233  * Returns >0 if success with sent bytes, or <0 if error.
 234  */
 235 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 236                                   const char *block_name)
 237 {
 238     RAMBlock *block = qemu_ram_block_by_name(block_name);
 239     unsigned long *le_bitmap, nbits;
 240     uint64_t size;
 241
 242     if (!block) {
 243         error_report("%s: invalid block name: %s", __func__, block_name);
 244         return -1;
 245     }
 246
 247     nbits = block->used_length >> TARGET_PAGE_BITS;
 248
 249     /*
 250      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 251      * machines we may need 4 more bytes for padding (see below
 252      * comment). So extend it a bit before hand.
 253      */
 254     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 255
 256     /*
 257      * Always use little endian when sending the bitmap. This is
 258      * required that when source and destination VMs are not using the
 259      * same endianess. (Note: big endian won't work.)
 260      */
 261     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 262
 263     /* Size of the bitmap, in bytes */
 264     size = DIV_ROUND_UP(nbits, 8);
 265
 266     /*
 267      * size is always aligned to 8 bytes for 64bit machines, but it
 268      * may not be true for 32bit machines. We need this padding to
 269      * make sure the migration can survive even between 32bit and
 270      * 64bit machines.
 271      */
 272     size = ROUND_UP(size, 8);
 273
 274     qemu_put_be64(file, size);
 275     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 276     /*
 277      * Mark as an end, in case the middle part is screwed up due to
 278      * some "misterious" reason.
 279      */
 280     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 281     qemu_fflush(file);
 282
 283     g_free(le_bitmap);
 284
 285     if (qemu_file_get_error(file)) {
 286         return qemu_file_get_error(file);
 287     }
 288
 289     return size + sizeof(size);
 290 }
 291
 292 /*
 293  * An outstanding page request, on the source, having been received
 294  * and queued
 295  */
 296 struct RAMSrcPageRequest {
 297     RAMBlock *rb;
 298     hwaddr    offset;
 299     hwaddr    len;
 300
 301     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 302 };
 303
 304 /* State of RAM for migration */
 305 struct RAMState {
 306     /* QEMUFile used for this migration */
 307     QEMUFile *f;
 308     /* Last block that we have visited searching for dirty pages */
 309     RAMBlock *last_seen_block;
 310     /* Last block from where we have sent data */
 311     RAMBlock *last_sent_block;
 312     /* Last dirty target page we have sent */
 313     ram_addr_t last_page;
 314     /* last ram version we have seen */
 315     uint32_t last_version;
 316     /* We are in the first round */
 317     bool ram_bulk_stage;
 318     /* The free page optimization is enabled */
 319     bool fpo_enabled;
 320     /* How many times we have dirty too many pages */
 321     int dirty_rate_high_cnt;
 322     /* these variables are used for bitmap sync */
 323     /* last time we did a full bitmap_sync */
 324     int64_t time_last_bitmap_sync;
 325     /* bytes transferred at start_time */
 326     uint64_t bytes_xfer_prev;
 327     /* number of dirty pages since start_time */
 328     uint64_t num_dirty_pages_period;
 329     /* xbzrle misses since the beginning of the period */
 330     uint64_t xbzrle_cache_miss_prev;
 331     /* Amount of xbzrle pages since the beginning of the period */
 332     uint64_t xbzrle_pages_prev;
 333     /* Amount of xbzrle encoded bytes since the beginning of the period */
 334     uint64_t xbzrle_bytes_prev;
 335
 336     /* compression statistics since the beginning of the period */
 337     /* amount of count that no free thread to compress data */
 338     uint64_t compress_thread_busy_prev;
 339     /* amount bytes after compression */
 340     uint64_t compressed_size_prev;
 341     /* amount of compressed pages */
 342     uint64_t compress_pages_prev;
 343
 344     /* total handled target pages at the beginning of period */
 345     uint64_t target_page_count_prev;
 346     /* total handled target pages since start */
 347     uint64_t target_page_count;
 348     /* number of dirty bits in the bitmap */
 349     uint64_t migration_dirty_pages;
 350     /* Protects modification of the bitmap and migration dirty pages */
 351     QemuMutex bitmap_mutex;
 352     /* The RAMBlock used in the last src_page_requests */
 353     RAMBlock *last_req_rb;
 354     /* Queue of outstanding page requests from the destination */
 355     QemuMutex src_page_req_mutex;
 356     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 357 };
 358 typedef struct RAMState RAMState;
 359
 360 static RAMState *ram_state;
 361
 362 static NotifierWithReturnList precopy_notifier_list;
 363
 364 void precopy_infrastructure_init(void)
 365 {
 366     notifier_with_return_list_init(&precopy_notifier_list);
 367 }
 368
 369 void precopy_add_notifier(NotifierWithReturn *n)
 370 {
 371     notifier_with_return_list_add(&precopy_notifier_list, n);
 372 }
 373
 374 void precopy_remove_notifier(NotifierWithReturn *n)
 375 {
 376     notifier_with_return_remove(n);
 377 }
 378
 379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 380 {
 381     PrecopyNotifyData pnd;
 382     pnd.reason = reason;
 383     pnd.errp = errp;
 384
 385     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 386 }
 387
 388 void precopy_enable_free_page_optimization(void)
 389 {
 390     if (!ram_state) {
 391         return;
 392     }
 393
 394     ram_state->fpo_enabled = true;
 395 }
 396
 397 uint64_t ram_bytes_remaining(void)
 398 {
 399     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 400                        0;
 401 }
 402
 403 MigrationStats ram_counters;
 404
 405 /* used by the search for pages to send */
 406 struct PageSearchStatus {
 407     /* Current block being searched */
 408     RAMBlock    *block;
 409     /* Current page to search from */
 410     unsigned long page;
 411     /* Set once we wrap around */
 412     bool         complete_round;
 413 };
 414 typedef struct PageSearchStatus PageSearchStatus;
 415
 416 CompressionStats compression_counters;
 417
 418 struct CompressParam {
 419     bool done;
 420     bool quit;
 421     bool zero_page;
 422     QEMUFile *file;
 423     QemuMutex mutex;
 424     QemuCond cond;
 425     RAMBlock *block;
 426     ram_addr_t offset;
 427
 428     /* internally used fields */
 429     z_stream stream;
 430     uint8_t *originbuf;
 431 };
 432 typedef struct CompressParam CompressParam;
 433
 434 struct DecompressParam {
 435     bool done;
 436     bool quit;
 437     QemuMutex mutex;
 438     QemuCond cond;
 439     void *des;
 440     uint8_t *compbuf;
 441     int len;
 442     z_stream stream;
 443 };
 444 typedef struct DecompressParam DecompressParam;
 445
 446 static CompressParam *comp_param;
 447 static QemuThread *compress_threads;
 448 /* comp_done_cond is used to wake up the migration thread when
 449  * one of the compression threads has finished the compression.
 450  * comp_done_lock is used to co-work with comp_done_cond.
 451  */
 452 static QemuMutex comp_done_lock;
 453 static QemuCond comp_done_cond;
 454 /* The empty QEMUFileOps will be used by file in CompressParam */
 455 static const QEMUFileOps empty_ops = { };
 456
 457 static QEMUFile *decomp_file;
 458 static DecompressParam *decomp_param;
 459 static QemuThread *decompress_threads;
 460 static QemuMutex decomp_done_lock;
 461 static QemuCond decomp_done_cond;
 462
 463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 464                                  ram_addr_t offset, uint8_t *source_buf);
 465
 466 static void *do_data_compress(void *opaque)
 467 {
 468     CompressParam *param = opaque;
 469     RAMBlock *block;
 470     ram_addr_t offset;
 471     bool zero_page;
 472
 473     qemu_mutex_lock(&param->mutex);
 474     while (!param->quit) {
 475         if (param->block) {
 476             block = param->block;
 477             offset = param->offset;
 478             param->block = NULL;
 479             qemu_mutex_unlock(&param->mutex);
 480
 481             zero_page = do_compress_ram_page(param->file, &param->stream,
 482                                              block, offset, param->originbuf);
 483
 484             qemu_mutex_lock(&comp_done_lock);
 485             param->done = true;
 486             param->zero_page = zero_page;
 487             qemu_cond_signal(&comp_done_cond);
 488             qemu_mutex_unlock(&comp_done_lock);
 489
 490             qemu_mutex_lock(&param->mutex);
 491         } else {
 492             qemu_cond_wait(&param->cond, &param->mutex);
 493         }
 494     }
 495     qemu_mutex_unlock(&param->mutex);
 496
 497     return NULL;
 498 }
 499
 500 static void compress_threads_save_cleanup(void)
 501 {
 502     int i, thread_count;
 503
 504     if (!migrate_use_compression() || !comp_param) {
 505         return;
 506     }
 507
 508     thread_count = migrate_compress_threads();
 509     for (i = 0; i < thread_count; i++) {
 510         /*
 511          * we use it as a indicator which shows if the thread is
 512          * properly init'd or not
 513          */
 514         if (!comp_param[i].file) {
 515             break;
 516         }
 517
 518         qemu_mutex_lock(&comp_param[i].mutex);
 519         comp_param[i].quit = true;
 520         qemu_cond_signal(&comp_param[i].cond);
 521         qemu_mutex_unlock(&comp_param[i].mutex);
 522
 523         qemu_thread_join(compress_threads + i);
 524         qemu_mutex_destroy(&comp_param[i].mutex);
 525         qemu_cond_destroy(&comp_param[i].cond);
 526         deflateEnd(&comp_param[i].stream);
 527         g_free(comp_param[i].originbuf);
 528         qemu_fclose(comp_param[i].file);
 529         comp_param[i].file = NULL;
 530     }
 531     qemu_mutex_destroy(&comp_done_lock);
 532     qemu_cond_destroy(&comp_done_cond);
 533     g_free(compress_threads);
 534     g_free(comp_param);
 535     compress_threads = NULL;
 536     comp_param = NULL;
 537 }
 538
 539 static int compress_threads_save_setup(void)
 540 {
 541     int i, thread_count;
 542
 543     if (!migrate_use_compression()) {
 544         return 0;
 545     }
 546     thread_count = migrate_compress_threads();
 547     compress_threads = g_new0(QemuThread, thread_count);
 548     comp_param = g_new0(CompressParam, thread_count);
 549     qemu_cond_init(&comp_done_cond);
 550     qemu_mutex_init(&comp_done_lock);
 551     for (i = 0; i < thread_count; i++) {
 552         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 553         if (!comp_param[i].originbuf) {
 554             goto exit;
 555         }
 556
 557         if (deflateInit(&comp_param[i].stream,
 558                         migrate_compress_level()) != Z_OK) {
 559             g_free(comp_param[i].originbuf);
 560             goto exit;
 561         }
 562
 563         /* comp_param[i].file is just used as a dummy buffer to save data,
 564          * set its ops to empty.
 565          */
 566         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 567         comp_param[i].done = true;
 568         comp_param[i].quit = false;
 569         qemu_mutex_init(&comp_param[i].mutex);
 570         qemu_cond_init(&comp_param[i].cond);
 571         qemu_thread_create(compress_threads + i, "compress",
 572                            do_data_compress, comp_param + i,
 573                            QEMU_THREAD_JOINABLE);
 574     }
 575     return 0;
 576
 577 exit:
 578     compress_threads_save_cleanup();
 579     return -1;
 580 }
 581
 582 /**
 583  * save_page_header: write page header to wire
 584  *
 585  * If this is the 1st block, it also writes the block identification
 586  *
 587  * Returns the number of bytes written
 588  *
 589  * @f: QEMUFile where to send the data
 590  * @block: block that contains the page we want to send
 591  * @offset: offset inside the block for the page
 592  *          in the lower bits, it contains flags
 593  */
 594 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 595                                ram_addr_t offset)
 596 {
 597     size_t size, len;
 598
 599     if (block == rs->last_sent_block) {
 600         offset |= RAM_SAVE_FLAG_CONTINUE;
 601     }
 602     qemu_put_be64(f, offset);
 603     size = 8;
 604
 605     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 606         len = strlen(block->idstr);
 607         qemu_put_byte(f, len);
 608         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 609         size += 1 + len;
 610         rs->last_sent_block = block;
 611     }
 612     return size;
 613 }
 614
 615 /**
 616  * mig_throttle_guest_down: throotle down the guest
 617  *
 618  * Reduce amount of guest cpu execution to hopefully slow down memory
 619  * writes. If guest dirty memory rate is reduced below the rate at
 620  * which we can transfer pages to the destination then we should be
 621  * able to complete migration. Some workloads dirty memory way too
 622  * fast and will not effectively converge, even with auto-converge.
 623  */
 624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 625                                     uint64_t bytes_dirty_threshold)
 626 {
 627     MigrationState *s = migrate_get_current();
 628     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 629     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 630     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 631     int pct_max = s->parameters.max_cpu_throttle;
 632
 633     uint64_t throttle_now = cpu_throttle_get_percentage();
 634     uint64_t cpu_now, cpu_ideal, throttle_inc;
 635
 636     /* We have not started throttling yet. Let's start it. */
 637     if (!cpu_throttle_active()) {
 638         cpu_throttle_set(pct_initial);
 639     } else {
 640         /* Throttling already on, just increase the rate */
 641         if (!pct_tailslow) {
 642             throttle_inc = pct_increment;
 643         } else {
 644             /* Compute the ideal CPU percentage used by Guest, which may
 645              * make the dirty rate match the dirty rate threshold. */
 646             cpu_now = 100 - throttle_now;
 647             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 648                         bytes_dirty_period);
 649             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 650         }
 651         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 652     }
 653 }
 654
 655 /**
 656  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 657  *
 658  * @rs: current RAM state
 659  * @current_addr: address for the zero page
 660  *
 661  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 662  * The important thing is that a stale (not-yet-0'd) page be replaced
 663  * by the new data.
 664  * As a bonus, if the page wasn't in the cache it gets added so that
 665  * when a small write is made into the 0'd page it gets XBZRLE sent.
 666  */
 667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 668 {
 669     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 670         return;
 671     }
 672
 673     /* We don't care if this fails to allocate a new cache page
 674      * as long as it updated an old one */
 675     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 676                  ram_counters.dirty_sync_count);
 677 }
 678
 679 #define ENCODING_FLAG_XBZRLE 0x1
 680
 681 /**
 682  * save_xbzrle_page: compress and send current page
 683  *
 684  * Returns: 1 means that we wrote the page
 685  *          0 means that page is identical to the one already sent
 686  *          -1 means that xbzrle would be longer than normal
 687  *
 688  * @rs: current RAM state
 689  * @current_data: pointer to the address of the page contents
 690  * @current_addr: addr of the page
 691  * @block: block that contains the page we want to send
 692  * @offset: offset inside the block for the page
 693  * @last_stage: if we are at the completion stage
 694  */
 695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 696                             ram_addr_t current_addr, RAMBlock *block,
 697                             ram_addr_t offset, bool last_stage)
 698 {
 699     int encoded_len = 0, bytes_xbzrle;
 700     uint8_t *prev_cached_page;
 701
 702     if (!cache_is_cached(XBZRLE.cache, current_addr,
 703                          ram_counters.dirty_sync_count)) {
 704         xbzrle_counters.cache_miss++;
 705         if (!last_stage) {
 706             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 707                              ram_counters.dirty_sync_count) == -1) {
 708                 return -1;
 709             } else {
 710                 /* update *current_data when the page has been
 711                    inserted into cache */
 712                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 713             }
 714         }
 715         return -1;
 716     }
 717
 718     /*
 719      * Reaching here means the page has hit the xbzrle cache, no matter what
 720      * encoding result it is (normal encoding, overflow or skipping the page),
 721      * count the page as encoded. This is used to caculate the encoding rate.
 722      *
 723      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 724      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 725      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 726      * skipped page included. In this way, the encoding rate can tell if the
 727      * guest page is good for xbzrle encoding.
 728      */
 729     xbzrle_counters.pages++;
 730     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 731
 732     /* save current buffer into memory */
 733     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 734
 735     /* XBZRLE encoding (if there is no overflow) */
 736     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 737                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 738                                        TARGET_PAGE_SIZE);
 739
 740     /*
 741      * Update the cache contents, so that it corresponds to the data
 742      * sent, in all cases except where we skip the page.
 743      */
 744     if (!last_stage && encoded_len != 0) {
 745         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 746         /*
 747          * In the case where we couldn't compress, ensure that the caller
 748          * sends the data from the cache, since the guest might have
 749          * changed the RAM since we copied it.
 750          */
 751         *current_data = prev_cached_page;
 752     }
 753
 754     if (encoded_len == 0) {
 755         trace_save_xbzrle_page_skipping();
 756         return 0;
 757     } else if (encoded_len == -1) {
 758         trace_save_xbzrle_page_overflow();
 759         xbzrle_counters.overflow++;
 760         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 761         return -1;
 762     }
 763
 764     /* Send XBZRLE based compressed page */
 765     bytes_xbzrle = save_page_header(rs, rs->f, block,
 766                                     offset | RAM_SAVE_FLAG_XBZRLE);
 767     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 768     qemu_put_be16(rs->f, encoded_len);
 769     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 770     bytes_xbzrle += encoded_len + 1 + 2;
 771     /*
 772      * Like compressed_size (please see update_compress_thread_counts),
 773      * the xbzrle encoded bytes don't count the 8 byte header with
 774      * RAM_SAVE_FLAG_CONTINUE.
 775      */
 776     xbzrle_counters.bytes += bytes_xbzrle - 8;
 777     ram_counters.transferred += bytes_xbzrle;
 778
 779     return 1;
 780 }
 781
 782 /**
 783  * migration_bitmap_find_dirty: find the next dirty page from start
 784  *
 785  * Returns the page offset within memory region of the start of a dirty page
 786  *
 787  * @rs: current RAM state
 788  * @rb: RAMBlock where to search for dirty pages
 789  * @start: page where we start the search
 790  */
 791 static inline
 792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 793                                           unsigned long start)
 794 {
 795     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 796     unsigned long *bitmap = rb->bmap;
 797     unsigned long next;
 798
 799     if (ramblock_is_ignored(rb)) {
 800         return size;
 801     }
 802
 803     /*
 804      * When the free page optimization is enabled, we need to check the bitmap
 805      * to send the non-free pages rather than all the pages in the bulk stage.
 806      */
 807     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 808         next = start + 1;
 809     } else {
 810         next = find_next_bit(bitmap, size, start);
 811     }
 812
 813     return next;
 814 }
 815
 816 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 817                                                 RAMBlock *rb,
 818                                                 unsigned long page)
 819 {
 820     bool ret;
 821
 822     qemu_mutex_lock(&rs->bitmap_mutex);
 823
 824     /*
 825      * Clear dirty bitmap if needed.  This _must_ be called before we
 826      * send any of the page in the chunk because we need to make sure
 827      * we can capture further page content changes when we sync dirty
 828      * log the next time.  So as long as we are going to send any of
 829      * the page in the chunk we clear the remote dirty bitmap for all.
 830      * Clearing it earlier won't be a problem, but too late will.
 831      */
 832     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 833         uint8_t shift = rb->clear_bmap_shift;
 834         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 835         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 836
 837         /*
 838          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 839          * can make things easier sometimes since then start address
 840          * of the small chunk will always be 64 pages aligned so the
 841          * bitmap will always be aligned to unsigned long.  We should
 842          * even be able to remove this restriction but I'm simply
 843          * keeping it.
 844          */
 845         assert(shift >= 6);
 846         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 847         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 848     }
 849
 850     ret = test_and_clear_bit(page, rb->bmap);
 851
 852     if (ret) {
 853         rs->migration_dirty_pages--;
 854     }
 855     qemu_mutex_unlock(&rs->bitmap_mutex);
 856
 857     return ret;
 858 }
 859
 860 /* Called with RCU critical section */
 861 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 862 {
 863     uint64_t new_dirty_pages =
 864         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 865
 866     rs->migration_dirty_pages += new_dirty_pages;
 867     rs->num_dirty_pages_period += new_dirty_pages;
 868 }
 869
 870 /**
 871  * ram_pagesize_summary: calculate all the pagesizes of a VM
 872  *
 873  * Returns a summary bitmap of the page sizes of all RAMBlocks
 874  *
 875  * For VMs with just normal pages this is equivalent to the host page
 876  * size. If it's got some huge pages then it's the OR of all the
 877  * different page sizes.
 878  */
 879 uint64_t ram_pagesize_summary(void)
 880 {
 881     RAMBlock *block;
 882     uint64_t summary = 0;
 883
 884     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 885         summary |= block->page_size;
 886     }
 887
 888     return summary;
 889 }
 890
 891 uint64_t ram_get_total_transferred_pages(void)
 892 {
 893     return  ram_counters.normal + ram_counters.duplicate +
 894                 compression_counters.pages + xbzrle_counters.pages;
 895 }
 896
 897 static void migration_update_rates(RAMState *rs, int64_t end_time)
 898 {
 899     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 900     double compressed_size;
 901
 902     /* calculate period counters */
 903     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 904                 / (end_time - rs->time_last_bitmap_sync);
 905
 906     if (!page_count) {
 907         return;
 908     }
 909
 910     if (migrate_use_xbzrle()) {
 911         double encoded_size, unencoded_size;
 912
 913         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 914             rs->xbzrle_cache_miss_prev) / page_count;
 915         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 916         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 917                          TARGET_PAGE_SIZE;
 918         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 919         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 920             xbzrle_counters.encoding_rate = 0;
 921         } else {
 922             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 923         }
 924         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 925         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 926     }
 927
 928     if (migrate_use_compression()) {
 929         compression_counters.busy_rate = (double)(compression_counters.busy -
 930             rs->compress_thread_busy_prev) / page_count;
 931         rs->compress_thread_busy_prev = compression_counters.busy;
 932
 933         compressed_size = compression_counters.compressed_size -
 934                           rs->compressed_size_prev;
 935         if (compressed_size) {
 936             double uncompressed_size = (compression_counters.pages -
 937                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 938
 939             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 940             compression_counters.compression_rate =
 941                                         uncompressed_size / compressed_size;
 942
 943             rs->compress_pages_prev = compression_counters.pages;
 944             rs->compressed_size_prev = compression_counters.compressed_size;
 945         }
 946     }
 947 }
 948
 949 static void migration_trigger_throttle(RAMState *rs)
 950 {
 951     MigrationState *s = migrate_get_current();
 952     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 953
 954     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 955     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 956     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 957
 958     /* During block migration the auto-converge logic incorrectly detects
 959      * that ram migration makes no progress. Avoid this by disabling the
 960      * throttling logic during the bulk phase of block migration. */
 961     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 962         /* The following detection logic can be refined later. For now:
 963            Check to see if the ratio between dirtied bytes and the approx.
 964            amount of bytes that just got transferred since the last time
 965            we were in this routine reaches the threshold. If that happens
 966            twice, start or increase throttling. */
 967
 968         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 969             (++rs->dirty_rate_high_cnt >= 2)) {
 970             trace_migration_throttle();
 971             rs->dirty_rate_high_cnt = 0;
 972             mig_throttle_guest_down(bytes_dirty_period,
 973                                     bytes_dirty_threshold);
 974         }
 975     }
 976 }
 977
 978 static void migration_bitmap_sync(RAMState *rs)
 979 {
 980     RAMBlock *block;
 981     int64_t end_time;
 982
 983     ram_counters.dirty_sync_count++;
 984
 985     if (!rs->time_last_bitmap_sync) {
 986         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 987     }
 988
 989     trace_migration_bitmap_sync_start();
 990     memory_global_dirty_log_sync();
 991
 992     qemu_mutex_lock(&rs->bitmap_mutex);
 993     WITH_RCU_READ_LOCK_GUARD() {
 994         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 995             ramblock_sync_dirty_bitmap(rs, block);
 996         }
 997         ram_counters.remaining = ram_bytes_remaining();
 998     }
 999     qemu_mutex_unlock(&rs->bitmap_mutex);
1000
1001     memory_global_after_dirty_log_sync();
1002     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1003
1004     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1005
1006     /* more than 1 second = 1000 millisecons */
1007     if (end_time > rs->time_last_bitmap_sync + 1000) {
1008         migration_trigger_throttle(rs);
1009
1010         migration_update_rates(rs, end_time);
1011
1012         rs->target_page_count_prev = rs->target_page_count;
1013
1014         /* reset period counters */
1015         rs->time_last_bitmap_sync = end_time;
1016         rs->num_dirty_pages_period = 0;
1017         rs->bytes_xfer_prev = ram_counters.transferred;
1018     }
1019     if (migrate_use_events()) {
1020         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1021     }
1022 }
1023
1024 static void migration_bitmap_sync_precopy(RAMState *rs)
1025 {
1026     Error *local_err = NULL;
1027
1028     /*
1029      * The current notifier usage is just an optimization to migration, so we
1030      * don't stop the normal migration process in the error case.
1031      */
1032     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1033         error_report_err(local_err);
1034         local_err = NULL;
1035     }
1036
1037     migration_bitmap_sync(rs);
1038
1039     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1040         error_report_err(local_err);
1041     }
1042 }
1043
1044 /**
1045  * save_zero_page_to_file: send the zero page to the file
1046  *
1047  * Returns the size of data written to the file, 0 means the page is not
1048  * a zero page
1049  *
1050  * @rs: current RAM state
1051  * @file: the file where the data is saved
1052  * @block: block that contains the page we want to send
1053  * @offset: offset inside the block for the page
1054  */
1055 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1056                                   RAMBlock *block, ram_addr_t offset)
1057 {
1058     uint8_t *p = block->host + offset;
1059     int len = 0;
1060
1061     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1062         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1063         qemu_put_byte(file, 0);
1064         len += 1;
1065     }
1066     return len;
1067 }
1068
1069 /**
1070  * save_zero_page: send the zero page to the stream
1071  *
1072  * Returns the number of pages written.
1073  *
1074  * @rs: current RAM state
1075  * @block: block that contains the page we want to send
1076  * @offset: offset inside the block for the page
1077  */
1078 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1079 {
1080     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1081
1082     if (len) {
1083         ram_counters.duplicate++;
1084         ram_counters.transferred += len;
1085         return 1;
1086     }
1087     return -1;
1088 }
1089
1090 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1091 {
1092     if (!migrate_release_ram() || !migration_in_postcopy()) {
1093         return;
1094     }
1095
1096     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1097 }
1098
1099 /*
1100  * @pages: the number of pages written by the control path,
1101  *        < 0 - error
1102  *        > 0 - number of pages written
1103  *
1104  * Return true if the pages has been saved, otherwise false is returned.
1105  */
1106 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1107                               int *pages)
1108 {
1109     uint64_t bytes_xmit = 0;
1110     int ret;
1111
1112     *pages = -1;
1113     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1114                                 &bytes_xmit);
1115     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1116         return false;
1117     }
1118
1119     if (bytes_xmit) {
1120         ram_counters.transferred += bytes_xmit;
1121         *pages = 1;
1122     }
1123
1124     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1125         return true;
1126     }
1127
1128     if (bytes_xmit > 0) {
1129         ram_counters.normal++;
1130     } else if (bytes_xmit == 0) {
1131         ram_counters.duplicate++;
1132     }
1133
1134     return true;
1135 }
1136
1137 /*
1138  * directly send the page to the stream
1139  *
1140  * Returns the number of pages written.
1141  *
1142  * @rs: current RAM state
1143  * @block: block that contains the page we want to send
1144  * @offset: offset inside the block for the page
1145  * @buf: the page to be sent
1146  * @async: send to page asyncly
1147  */
1148 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1149                             uint8_t *buf, bool async)
1150 {
1151     ram_counters.transferred += save_page_header(rs, rs->f, block,
1152                                                  offset | RAM_SAVE_FLAG_PAGE);
1153     if (async) {
1154         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1155                               migrate_release_ram() &
1156                               migration_in_postcopy());
1157     } else {
1158         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1159     }
1160     ram_counters.transferred += TARGET_PAGE_SIZE;
1161     ram_counters.normal++;
1162     return 1;
1163 }
1164
1165 /**
1166  * ram_save_page: send the given page to the stream
1167  *
1168  * Returns the number of pages written.
1169  *          < 0 - error
1170  *          >=0 - Number of pages written - this might legally be 0
1171  *                if xbzrle noticed the page was the same.
1172  *
1173  * @rs: current RAM state
1174  * @block: block that contains the page we want to send
1175  * @offset: offset inside the block for the page
1176  * @last_stage: if we are at the completion stage
1177  */
1178 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1179 {
1180     int pages = -1;
1181     uint8_t *p;
1182     bool send_async = true;
1183     RAMBlock *block = pss->block;
1184     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1185     ram_addr_t current_addr = block->offset + offset;
1186
1187     p = block->host + offset;
1188     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1189
1190     XBZRLE_cache_lock();
1191     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1192         migrate_use_xbzrle()) {
1193         pages = save_xbzrle_page(rs, &p, current_addr, block,
1194                                  offset, last_stage);
1195         if (!last_stage) {
1196             /* Can't send this cached data async, since the cache page
1197              * might get updated before it gets to the wire
1198              */
1199             send_async = false;
1200         }
1201     }
1202
1203     /* XBZRLE overflow or normal page */
1204     if (pages == -1) {
1205         pages = save_normal_page(rs, block, offset, p, send_async);
1206     }
1207
1208     XBZRLE_cache_unlock();
1209
1210     return pages;
1211 }
1212
1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1214                                  ram_addr_t offset)
1215 {
1216     if (multifd_queue_page(rs->f, block, offset) < 0) {
1217         return -1;
1218     }
1219     ram_counters.normal++;
1220
1221     return 1;
1222 }
1223
1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1225                                  ram_addr_t offset, uint8_t *source_buf)
1226 {
1227     RAMState *rs = ram_state;
1228     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1229     bool zero_page = false;
1230     int ret;
1231
1232     if (save_zero_page_to_file(rs, f, block, offset)) {
1233         zero_page = true;
1234         goto exit;
1235     }
1236
1237     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1238
1239     /*
1240      * copy it to a internal buffer to avoid it being modified by VM
1241      * so that we can catch up the error during compression and
1242      * decompression
1243      */
1244     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1245     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1246     if (ret < 0) {
1247         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1248         error_report("compressed data failed!");
1249         return false;
1250     }
1251
1252 exit:
1253     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1254     return zero_page;
1255 }
1256
1257 static void
1258 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1259 {
1260     ram_counters.transferred += bytes_xmit;
1261
1262     if (param->zero_page) {
1263         ram_counters.duplicate++;
1264         return;
1265     }
1266
1267     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268     compression_counters.compressed_size += bytes_xmit - 8;
1269     compression_counters.pages++;
1270 }
1271
1272 static bool save_page_use_compression(RAMState *rs);
1273
1274 static void flush_compressed_data(RAMState *rs)
1275 {
1276     int idx, len, thread_count;
1277
1278     if (!save_page_use_compression(rs)) {
1279         return;
1280     }
1281     thread_count = migrate_compress_threads();
1282
1283     qemu_mutex_lock(&comp_done_lock);
1284     for (idx = 0; idx < thread_count; idx++) {
1285         while (!comp_param[idx].done) {
1286             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1287         }
1288     }
1289     qemu_mutex_unlock(&comp_done_lock);
1290
1291     for (idx = 0; idx < thread_count; idx++) {
1292         qemu_mutex_lock(&comp_param[idx].mutex);
1293         if (!comp_param[idx].quit) {
1294             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1295             /*
1296              * it's safe to fetch zero_page without holding comp_done_lock
1297              * as there is no further request submitted to the thread,
1298              * i.e, the thread should be waiting for a request at this point.
1299              */
1300             update_compress_thread_counts(&comp_param[idx], len);
1301         }
1302         qemu_mutex_unlock(&comp_param[idx].mutex);
1303     }
1304 }
1305
1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1307                                        ram_addr_t offset)
1308 {
1309     param->block = block;
1310     param->offset = offset;
1311 }
1312
1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1314                                            ram_addr_t offset)
1315 {
1316     int idx, thread_count, bytes_xmit = -1, pages = -1;
1317     bool wait = migrate_compress_wait_thread();
1318
1319     thread_count = migrate_compress_threads();
1320     qemu_mutex_lock(&comp_done_lock);
1321 retry:
1322     for (idx = 0; idx < thread_count; idx++) {
1323         if (comp_param[idx].done) {
1324             comp_param[idx].done = false;
1325             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1326             qemu_mutex_lock(&comp_param[idx].mutex);
1327             set_compress_params(&comp_param[idx], block, offset);
1328             qemu_cond_signal(&comp_param[idx].cond);
1329             qemu_mutex_unlock(&comp_param[idx].mutex);
1330             pages = 1;
1331             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1332             break;
1333         }
1334     }
1335
1336     /*
1337      * wait for the free thread if the user specifies 'compress-wait-thread',
1338      * otherwise we will post the page out in the main thread as normal page.
1339      */
1340     if (pages < 0 && wait) {
1341         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1342         goto retry;
1343     }
1344     qemu_mutex_unlock(&comp_done_lock);
1345
1346     return pages;
1347 }
1348
1349 /**
1350  * find_dirty_block: find the next dirty page and update any state
1351  * associated with the search process.
1352  *
1353  * Returns true if a page is found
1354  *
1355  * @rs: current RAM state
1356  * @pss: data about the state of the current dirty page scan
1357  * @again: set to false if the search has scanned the whole of RAM
1358  */
1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1360 {
1361     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1362     if (pss->complete_round && pss->block == rs->last_seen_block &&
1363         pss->page >= rs->last_page) {
1364         /*
1365          * We've been once around the RAM and haven't found anything.
1366          * Give up.
1367          */
1368         *again = false;
1369         return false;
1370     }
1371     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1372         >= pss->block->used_length) {
1373         /* Didn't find anything in this RAM Block */
1374         pss->page = 0;
1375         pss->block = QLIST_NEXT_RCU(pss->block, next);
1376         if (!pss->block) {
1377             /*
1378              * If memory migration starts over, we will meet a dirtied page
1379              * which may still exists in compression threads's ring, so we
1380              * should flush the compressed data to make sure the new page
1381              * is not overwritten by the old one in the destination.
1382              *
1383              * Also If xbzrle is on, stop using the data compression at this
1384              * point. In theory, xbzrle can do better than compression.
1385              */
1386             flush_compressed_data(rs);
1387
1388             /* Hit the end of the list */
1389             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1390             /* Flag that we've looped */
1391             pss->complete_round = true;
1392             rs->ram_bulk_stage = false;
1393         }
1394         /* Didn't find anything this time, but try again on the new block */
1395         *again = true;
1396         return false;
1397     } else {
1398         /* Can go around again, but... */
1399         *again = true;
1400         /* We've found something so probably don't need to */
1401         return true;
1402     }
1403 }
1404
1405 /**
1406  * unqueue_page: gets a page of the queue
1407  *
1408  * Helper for 'get_queued_page' - gets a page off the queue
1409  *
1410  * Returns the block of the page (or NULL if none available)
1411  *
1412  * @rs: current RAM state
1413  * @offset: used to return the offset within the RAMBlock
1414  */
1415 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1416 {
1417     RAMBlock *block = NULL;
1418
1419     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1420         return NULL;
1421     }
1422
1423     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1424     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1425         struct RAMSrcPageRequest *entry =
1426                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1427         block = entry->rb;
1428         *offset = entry->offset;
1429
1430         if (entry->len > TARGET_PAGE_SIZE) {
1431             entry->len -= TARGET_PAGE_SIZE;
1432             entry->offset += TARGET_PAGE_SIZE;
1433         } else {
1434             memory_region_unref(block->mr);
1435             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1436             g_free(entry);
1437             migration_consume_urgent_request();
1438         }
1439     }
1440
1441     return block;
1442 }
1443
1444 /**
1445  * get_queued_page: unqueue a page from the postcopy requests
1446  *
1447  * Skips pages that are already sent (!dirty)
1448  *
1449  * Returns true if a queued page is found
1450  *
1451  * @rs: current RAM state
1452  * @pss: data about the state of the current dirty page scan
1453  */
1454 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1455 {
1456     RAMBlock  *block;
1457     ram_addr_t offset;
1458     bool dirty;
1459
1460     do {
1461         block = unqueue_page(rs, &offset);
1462         /*
1463          * We're sending this page, and since it's postcopy nothing else
1464          * will dirty it, and we must make sure it doesn't get sent again
1465          * even if this queue request was received after the background
1466          * search already sent it.
1467          */
1468         if (block) {
1469             unsigned long page;
1470
1471             page = offset >> TARGET_PAGE_BITS;
1472             dirty = test_bit(page, block->bmap);
1473             if (!dirty) {
1474                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1475                                                 page);
1476             } else {
1477                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1478             }
1479         }
1480
1481     } while (block && !dirty);
1482
1483     if (block) {
1484         /*
1485          * As soon as we start servicing pages out of order, then we have
1486          * to kill the bulk stage, since the bulk stage assumes
1487          * in (migration_bitmap_find_and_reset_dirty) that every page is
1488          * dirty, that's no longer true.
1489          */
1490         rs->ram_bulk_stage = false;
1491
1492         /*
1493          * We want the background search to continue from the queued page
1494          * since the guest is likely to want other pages near to the page
1495          * it just requested.
1496          */
1497         pss->block = block;
1498         pss->page = offset >> TARGET_PAGE_BITS;
1499
1500         /*
1501          * This unqueued page would break the "one round" check, even is
1502          * really rare.
1503          */
1504         pss->complete_round = false;
1505     }
1506
1507     return !!block;
1508 }
1509
1510 /**
1511  * migration_page_queue_free: drop any remaining pages in the ram
1512  * request queue
1513  *
1514  * It should be empty at the end anyway, but in error cases there may
1515  * be some left.  in case that there is any page left, we drop it.
1516  *
1517  */
1518 static void migration_page_queue_free(RAMState *rs)
1519 {
1520     struct RAMSrcPageRequest *mspr, *next_mspr;
1521     /* This queue generally should be empty - but in the case of a failed
1522      * migration might have some droppings in.
1523      */
1524     RCU_READ_LOCK_GUARD();
1525     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1526         memory_region_unref(mspr->rb->mr);
1527         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1528         g_free(mspr);
1529     }
1530 }
1531
1532 /**
1533  * ram_save_queue_pages: queue the page for transmission
1534  *
1535  * A request from postcopy destination for example.
1536  *
1537  * Returns zero on success or negative on error
1538  *
1539  * @rbname: Name of the RAMBLock of the request. NULL means the
1540  *          same that last one.
1541  * @start: starting address from the start of the RAMBlock
1542  * @len: length (in bytes) to send
1543  */
1544 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1545 {
1546     RAMBlock *ramblock;
1547     RAMState *rs = ram_state;
1548
1549     ram_counters.postcopy_requests++;
1550     RCU_READ_LOCK_GUARD();
1551
1552     if (!rbname) {
1553         /* Reuse last RAMBlock */
1554         ramblock = rs->last_req_rb;
1555
1556         if (!ramblock) {
1557             /*
1558              * Shouldn't happen, we can't reuse the last RAMBlock if
1559              * it's the 1st request.
1560              */
1561             error_report("ram_save_queue_pages no previous block");
1562             return -1;
1563         }
1564     } else {
1565         ramblock = qemu_ram_block_by_name(rbname);
1566
1567         if (!ramblock) {
1568             /* We shouldn't be asked for a non-existent RAMBlock */
1569             error_report("ram_save_queue_pages no block '%s'", rbname);
1570             return -1;
1571         }
1572         rs->last_req_rb = ramblock;
1573     }
1574     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1575     if (start+len > ramblock->used_length) {
1576         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1577                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1578                      __func__, start, len, ramblock->used_length);
1579         return -1;
1580     }
1581
1582     struct RAMSrcPageRequest *new_entry =
1583         g_malloc0(sizeof(struct RAMSrcPageRequest));
1584     new_entry->rb = ramblock;
1585     new_entry->offset = start;
1586     new_entry->len = len;
1587
1588     memory_region_ref(ramblock->mr);
1589     qemu_mutex_lock(&rs->src_page_req_mutex);
1590     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1591     migration_make_urgent_request();
1592     qemu_mutex_unlock(&rs->src_page_req_mutex);
1593
1594     return 0;
1595 }
1596
1597 static bool save_page_use_compression(RAMState *rs)
1598 {
1599     if (!migrate_use_compression()) {
1600         return false;
1601     }
1602
1603     /*
1604      * If xbzrle is on, stop using the data compression after first
1605      * round of migration even if compression is enabled. In theory,
1606      * xbzrle can do better than compression.
1607      */
1608     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1609         return true;
1610     }
1611
1612     return false;
1613 }
1614
1615 /*
1616  * try to compress the page before posting it out, return true if the page
1617  * has been properly handled by compression, otherwise needs other
1618  * paths to handle it
1619  */
1620 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1621 {
1622     if (!save_page_use_compression(rs)) {
1623         return false;
1624     }
1625
1626     /*
1627      * When starting the process of a new block, the first page of
1628      * the block should be sent out before other pages in the same
1629      * block, and all the pages in last block should have been sent
1630      * out, keeping this order is important, because the 'cont' flag
1631      * is used to avoid resending the block name.
1632      *
1633      * We post the fist page as normal page as compression will take
1634      * much CPU resource.
1635      */
1636     if (block != rs->last_sent_block) {
1637         flush_compressed_data(rs);
1638         return false;
1639     }
1640
1641     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1642         return true;
1643     }
1644
1645     compression_counters.busy++;
1646     return false;
1647 }
1648
1649 /**
1650  * ram_save_target_page: save one target page
1651  *
1652  * Returns the number of pages written
1653  *
1654  * @rs: current RAM state
1655  * @pss: data about the page we want to send
1656  * @last_stage: if we are at the completion stage
1657  */
1658 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1659                                 bool last_stage)
1660 {
1661     RAMBlock *block = pss->block;
1662     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1663     int res;
1664
1665     if (control_save_page(rs, block, offset, &res)) {
1666         return res;
1667     }
1668
1669     if (save_compress_page(rs, block, offset)) {
1670         return 1;
1671     }
1672
1673     res = save_zero_page(rs, block, offset);
1674     if (res > 0) {
1675         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1676          * page would be stale
1677          */
1678         if (!save_page_use_compression(rs)) {
1679             XBZRLE_cache_lock();
1680             xbzrle_cache_zero_page(rs, block->offset + offset);
1681             XBZRLE_cache_unlock();
1682         }
1683         ram_release_pages(block->idstr, offset, res);
1684         return res;
1685     }
1686
1687     /*
1688      * Do not use multifd for:
1689      * 1. Compression as the first page in the new block should be posted out
1690      *    before sending the compressed page
1691      * 2. In postcopy as one whole host page should be placed
1692      */
1693     if (!save_page_use_compression(rs) && migrate_use_multifd()
1694         && !migration_in_postcopy()) {
1695         return ram_save_multifd_page(rs, block, offset);
1696     }
1697
1698     return ram_save_page(rs, pss, last_stage);
1699 }
1700
1701 /**
1702  * ram_save_host_page: save a whole host page
1703  *
1704  * Starting at *offset send pages up to the end of the current host
1705  * page. It's valid for the initial offset to point into the middle of
1706  * a host page in which case the remainder of the hostpage is sent.
1707  * Only dirty target pages are sent. Note that the host page size may
1708  * be a huge page for this block.
1709  * The saving stops at the boundary of the used_length of the block
1710  * if the RAMBlock isn't a multiple of the host page size.
1711  *
1712  * Returns the number of pages written or negative on error
1713  *
1714  * @rs: current RAM state
1715  * @ms: current migration state
1716  * @pss: data about the page we want to send
1717  * @last_stage: if we are at the completion stage
1718  */
1719 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1720                               bool last_stage)
1721 {
1722     int tmppages, pages = 0;
1723     size_t pagesize_bits =
1724         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1725
1726     if (ramblock_is_ignored(pss->block)) {
1727         error_report("block %s should not be migrated !", pss->block->idstr);
1728         return 0;
1729     }
1730
1731     do {
1732         /* Check the pages is dirty and if it is send it */
1733         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1734             pss->page++;
1735             continue;
1736         }
1737
1738         tmppages = ram_save_target_page(rs, pss, last_stage);
1739         if (tmppages < 0) {
1740             return tmppages;
1741         }
1742
1743         pages += tmppages;
1744         pss->page++;
1745         /* Allow rate limiting to happen in the middle of huge pages */
1746         migration_rate_limit();
1747     } while ((pss->page & (pagesize_bits - 1)) &&
1748              offset_in_ramblock(pss->block,
1749                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1750
1751     /* The offset we leave with is the last one we looked at */
1752     pss->page--;
1753     return pages;
1754 }
1755
1756 /**
1757  * ram_find_and_save_block: finds a dirty page and sends it to f
1758  *
1759  * Called within an RCU critical section.
1760  *
1761  * Returns the number of pages written where zero means no dirty pages,
1762  * or negative on error
1763  *
1764  * @rs: current RAM state
1765  * @last_stage: if we are at the completion stage
1766  *
1767  * On systems where host-page-size > target-page-size it will send all the
1768  * pages in a host page that are dirty.
1769  */
1770
1771 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1772 {
1773     PageSearchStatus pss;
1774     int pages = 0;
1775     bool again, found;
1776
1777     /* No dirty page as there is zero RAM */
1778     if (!ram_bytes_total()) {
1779         return pages;
1780     }
1781
1782     pss.block = rs->last_seen_block;
1783     pss.page = rs->last_page;
1784     pss.complete_round = false;
1785
1786     if (!pss.block) {
1787         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1788     }
1789
1790     do {
1791         again = true;
1792         found = get_queued_page(rs, &pss);
1793
1794         if (!found) {
1795             /* priority queue empty, so just search for something dirty */
1796             found = find_dirty_block(rs, &pss, &again);
1797         }
1798
1799         if (found) {
1800             pages = ram_save_host_page(rs, &pss, last_stage);
1801         }
1802     } while (!pages && again);
1803
1804     rs->last_seen_block = pss.block;
1805     rs->last_page = pss.page;
1806
1807     return pages;
1808 }
1809
1810 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1811 {
1812     uint64_t pages = size / TARGET_PAGE_SIZE;
1813
1814     if (zero) {
1815         ram_counters.duplicate += pages;
1816     } else {
1817         ram_counters.normal += pages;
1818         ram_counters.transferred += size;
1819         qemu_update_position(f, size);
1820     }
1821 }
1822
1823 static uint64_t ram_bytes_total_common(bool count_ignored)
1824 {
1825     RAMBlock *block;
1826     uint64_t total = 0;
1827
1828     RCU_READ_LOCK_GUARD();
1829
1830     if (count_ignored) {
1831         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1832             total += block->used_length;
1833         }
1834     } else {
1835         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1836             total += block->used_length;
1837         }
1838     }
1839     return total;
1840 }
1841
/* Total used_length of all RAM blocks that are not ignored. */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
1846
1847 static void xbzrle_load_setup(void)
1848 {
1849     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1850 }
1851
1852 static void xbzrle_load_cleanup(void)
1853 {
1854     g_free(XBZRLE.decoded_buf);
1855     XBZRLE.decoded_buf = NULL;
1856 }
1857
1858 static void ram_state_cleanup(RAMState **rsp)
1859 {
1860     if (*rsp) {
1861         migration_page_queue_free(*rsp);
1862         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1863         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1864         g_free(*rsp);
1865         *rsp = NULL;
1866     }
1867 }
1868
1869 static void xbzrle_cleanup(void)
1870 {
1871     XBZRLE_cache_lock();
1872     if (XBZRLE.cache) {
1873         cache_fini(XBZRLE.cache);
1874         g_free(XBZRLE.encoded_buf);
1875         g_free(XBZRLE.current_buf);
1876         g_free(XBZRLE.zero_target_page);
1877         XBZRLE.cache = NULL;
1878         XBZRLE.encoded_buf = NULL;
1879         XBZRLE.current_buf = NULL;
1880         XBZRLE.zero_target_page = NULL;
1881     }
1882     XBZRLE_cache_unlock();
1883 }
1884
1885 static void ram_save_cleanup(void *opaque)
1886 {
1887     RAMState **rsp = opaque;
1888     RAMBlock *block;
1889
1890     /* caller have hold iothread lock or is in a bh, so there is
1891      * no writing race against the migration bitmap
1892      */
1893     memory_global_dirty_log_stop();
1894
1895     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1896         g_free(block->clear_bmap);
1897         block->clear_bmap = NULL;
1898         g_free(block->bmap);
1899         block->bmap = NULL;
1900     }
1901
1902     xbzrle_cleanup();
1903     compress_threads_save_cleanup();
1904     ram_state_cleanup(rsp);
1905 }
1906
1907 static void ram_state_reset(RAMState *rs)
1908 {
1909     rs->last_seen_block = NULL;
1910     rs->last_sent_block = NULL;
1911     rs->last_page = 0;
1912     rs->last_version = ram_list.version;
1913     rs->ram_bulk_stage = true;
1914     rs->fpo_enabled = false;
1915 }
1916
/*
 * Time budget in ms (half buffered_file limit): ram_save_iterate() leaves
 * its send loop once the elapsed time exceeds this value.
 */
#define MAX_WAIT 50
1918
/*
 * Dump a bitmap to stderr, up to 128 bits per line, as '1' (set) and
 * '.' (clear) characters prefixed with the index of the first bit on
 * the line.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' is the bitmap to dump and 'pages' the number of bits in it.
 * There is no bitmap to fall back to here, so a NULL 'todump' dumps
 * nothing.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; avoid dereferencing a NULL bitmap */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            /* PRIx64 expects an unsigned 64-bit argument */
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1952
1953 /* **** functions for postcopy ***** */
1954
1955 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1956 {
1957     struct RAMBlock *block;
1958
1959     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1960         unsigned long *bitmap = block->bmap;
1961         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1962         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1963
1964         while (run_start < range) {
1965             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1966             ram_discard_range(block->idstr,
1967                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1968                               ((ram_addr_t)(run_end - run_start))
1969                                 << TARGET_PAGE_BITS);
1970             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1971         }
1972     }
1973 }
1974
1975 /**
1976  * postcopy_send_discard_bm_ram: discard a RAMBlock
1977  *
1978  * Returns zero on success
1979  *
1980  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1981  *
1982  * @ms: current migration state
1983  * @block: RAMBlock to discard
1984  */
1985 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1986 {
1987     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1988     unsigned long current;
1989     unsigned long *bitmap = block->bmap;
1990
1991     for (current = 0; current < end; ) {
1992         unsigned long one = find_next_bit(bitmap, end, current);
1993         unsigned long zero, discard_length;
1994
1995         if (one >= end) {
1996             break;
1997         }
1998
1999         zero = find_next_zero_bit(bitmap, end, one + 1);
2000
2001         if (zero >= end) {
2002             discard_length = end - one;
2003         } else {
2004             discard_length = zero - one;
2005         }
2006         postcopy_discard_send_range(ms, one, discard_length);
2007         current = one + discard_length;
2008     }
2009
2010     return 0;
2011 }
2012
2013 /**
2014  * postcopy_each_ram_send_discard: discard all RAMBlocks
2015  *
2016  * Returns 0 for success or negative for error
2017  *
2018  * Utility for the outgoing postcopy code.
2019  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2020  *   passing it bitmap indexes and name.
2021  * (qemu_ram_foreach_block ends up passing unscaled lengths
2022  *  which would mean postcopy code would have to deal with target page)
2023  *
2024  * @ms: current migration state
2025  */
2026 static int postcopy_each_ram_send_discard(MigrationState *ms)
2027 {
2028     struct RAMBlock *block;
2029     int ret;
2030
2031     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2032         postcopy_discard_send_init(ms, block->idstr);
2033
2034         /*
2035          * Postcopy sends chunks of bitmap over the wire, but it
2036          * just needs indexes at this point, avoids it having
2037          * target page specific code.
2038          */
2039         ret = postcopy_send_discard_bm_ram(ms, block);
2040         postcopy_discard_send_finish(ms);
2041         if (ret) {
2042             return ret;
2043         }
2044     }
2045
2046     return 0;
2047 }
2048
2049 /**
2050  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2051  *
2052  * Helper for postcopy_chunk_hostpages; it's called twice to
2053  * canonicalize the two bitmaps, that are similar, but one is
2054  * inverted.
2055  *
2056  * Postcopy requires that all target pages in a hostpage are dirty or
2057  * clean, not a mix.  This function canonicalizes the bitmaps.
2058  *
2059  * @ms: current migration state
2060  * @block: block that contains the page we want to canonicalize
2061  */
2062 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2063 {
2064     RAMState *rs = ram_state;
2065     unsigned long *bitmap = block->bmap;
2066     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2067     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2068     unsigned long run_start;
2069
2070     if (block->page_size == TARGET_PAGE_SIZE) {
2071         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2072         return;
2073     }
2074
2075     /* Find a dirty page */
2076     run_start = find_next_bit(bitmap, pages, 0);
2077
2078     while (run_start < pages) {
2079
2080         /*
2081          * If the start of this run of pages is in the middle of a host
2082          * page, then we need to fixup this host page.
2083          */
2084         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2085             /* Find the end of this run */
2086             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2087             /*
2088              * If the end isn't at the start of a host page, then the
2089              * run doesn't finish at the end of a host page
2090              * and we need to discard.
2091              */
2092         }
2093
2094         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2095             unsigned long page;
2096             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2097                                                              host_ratio);
2098             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2099
2100             /* Clean up the bitmap */
2101             for (page = fixup_start_addr;
2102                  page < fixup_start_addr + host_ratio; page++) {
2103                 /*
2104                  * Remark them as dirty, updating the count for any pages
2105                  * that weren't previously dirty.
2106                  */
2107                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2108             }
2109         }
2110
2111         /* Find the next dirty page for the next iteration */
2112         run_start = find_next_bit(bitmap, pages, run_start);
2113     }
2114 }
2115
2116 /**
2117  * postcopy_chunk_hostpages: discard any partially sent host page
2118  *
2119  * Utility for the outgoing postcopy code.
2120  *
2121  * Discard any partially sent host-page size chunks, mark any partially
2122  * dirty host-page size chunks as all dirty.  In this case the host-page
2123  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2124  *
2125  * Returns zero on success
2126  *
2127  * @ms: current migration state
2128  * @block: block we want to work with
2129  */
2130 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2131 {
2132     postcopy_discard_send_init(ms, block->idstr);
2133
2134     /*
2135      * Ensure that all partially dirty host pages are made fully dirty.
2136      */
2137     postcopy_chunk_hostpages_pass(ms, block);
2138
2139     postcopy_discard_send_finish(ms);
2140     return 0;
2141 }
2142
2143 /**
2144  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2145  *
2146  * Returns zero on success
2147  *
2148  * Transmit the set of pages to be discarded after precopy to the target
2149  * these are pages that:
2150  *     a) Have been previously transmitted but are now dirty again
2151  *     b) Pages that have never been transmitted, this ensures that
2152  *        any pages on the destination that have been mapped by background
2153  *        tasks get discarded (transparent huge pages is the specific concern)
2154  * Hopefully this is pretty sparse
2155  *
2156  * @ms: current migration state
2157  */
2158 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2159 {
2160     RAMState *rs = ram_state;
2161     RAMBlock *block;
2162     int ret;
2163
2164     RCU_READ_LOCK_GUARD();
2165
2166     /* This should be our last sync, the src is now paused */
2167     migration_bitmap_sync(rs);
2168
2169     /* Easiest way to make sure we don't resume in the middle of a host-page */
2170     rs->last_seen_block = NULL;
2171     rs->last_sent_block = NULL;
2172     rs->last_page = 0;
2173
2174     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2175         /* Deal with TPS != HPS and huge pages */
2176         ret = postcopy_chunk_hostpages(ms, block);
2177         if (ret) {
2178             return ret;
2179         }
2180
2181 #ifdef DEBUG_POSTCOPY
2182         ram_debug_dump_bitmap(block->bmap, true,
2183                               block->used_length >> TARGET_PAGE_BITS);
2184 #endif
2185     }
2186     trace_ram_postcopy_send_discard_bitmap();
2187
2188     return postcopy_each_ram_send_discard(ms);
2189 }
2190
2191 /**
2192  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2193  *
2194  * Returns zero on success
2195  *
2196  * @rbname: name of the RAMBlock of the request. NULL means the
2197  *          same that last one.
2198  * @start: RAMBlock starting page
2199  * @length: RAMBlock size
2200  */
2201 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2202 {
2203     trace_ram_discard_range(rbname, start, length);
2204
2205     RCU_READ_LOCK_GUARD();
2206     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2207
2208     if (!rb) {
2209         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2210         return -1;
2211     }
2212
2213     /*
2214      * On source VM, we don't need to update the received bitmap since
2215      * we don't even have one.
2216      */
2217     if (rb->receivedmap) {
2218         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2219                      length >> qemu_target_page_bits());
2220     }
2221
2222     return ram_block_discard_range(rb, start, length);
2223 }
2224
2225 /*
2226  * For every allocation, we will try not to crash the VM if the
2227  * allocation failed.
2228  */
2229 static int xbzrle_init(void)
2230 {
2231     Error *local_err = NULL;
2232
2233     if (!migrate_use_xbzrle()) {
2234         return 0;
2235     }
2236
2237     XBZRLE_cache_lock();
2238
2239     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2240     if (!XBZRLE.zero_target_page) {
2241         error_report("%s: Error allocating zero page", __func__);
2242         goto err_out;
2243     }
2244
2245     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2246                               TARGET_PAGE_SIZE, &local_err);
2247     if (!XBZRLE.cache) {
2248         error_report_err(local_err);
2249         goto free_zero_page;
2250     }
2251
2252     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2253     if (!XBZRLE.encoded_buf) {
2254         error_report("%s: Error allocating encoded_buf", __func__);
2255         goto free_cache;
2256     }
2257
2258     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2259     if (!XBZRLE.current_buf) {
2260         error_report("%s: Error allocating current_buf", __func__);
2261         goto free_encoded_buf;
2262     }
2263
2264     /* We are all good */
2265     XBZRLE_cache_unlock();
2266     return 0;
2267
2268 free_encoded_buf:
2269     g_free(XBZRLE.encoded_buf);
2270     XBZRLE.encoded_buf = NULL;
2271 free_cache:
2272     cache_fini(XBZRLE.cache);
2273     XBZRLE.cache = NULL;
2274 free_zero_page:
2275     g_free(XBZRLE.zero_target_page);
2276     XBZRLE.zero_target_page = NULL;
2277 err_out:
2278     XBZRLE_cache_unlock();
2279     return -ENOMEM;
2280 }
2281
2282 static int ram_state_init(RAMState **rsp)
2283 {
2284     *rsp = g_try_new0(RAMState, 1);
2285
2286     if (!*rsp) {
2287         error_report("%s: Init ramstate fail", __func__);
2288         return -1;
2289     }
2290
2291     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2292     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2293     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2294
2295     /*
2296      * Count the total number of pages used by ram blocks not including any
2297      * gaps due to alignment or unplugs.
2298      * This must match with the initial values of dirty bitmap.
2299      */
2300     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2301     ram_state_reset(*rsp);
2302
2303     return 0;
2304 }
2305
2306 static void ram_list_init_bitmaps(void)
2307 {
2308     MigrationState *ms = migrate_get_current();
2309     RAMBlock *block;
2310     unsigned long pages;
2311     uint8_t shift;
2312
2313     /* Skip setting bitmap if there is no RAM */
2314     if (ram_bytes_total()) {
2315         shift = ms->clear_bitmap_shift;
2316         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2317             error_report("clear_bitmap_shift (%u) too big, using "
2318                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2319             shift = CLEAR_BITMAP_SHIFT_MAX;
2320         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2321             error_report("clear_bitmap_shift (%u) too small, using "
2322                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2323             shift = CLEAR_BITMAP_SHIFT_MIN;
2324         }
2325
2326         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2327             pages = block->max_length >> TARGET_PAGE_BITS;
2328             /*
2329              * The initial dirty bitmap for migration must be set with all
2330              * ones to make sure we'll migrate every guest RAM page to
2331              * destination.
2332              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2333              * new migration after a failed migration, ram_list.
2334              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2335              * guest memory.
2336              */
2337             block->bmap = bitmap_new(pages);
2338             bitmap_set(block->bmap, 0, pages);
2339             block->clear_bmap_shift = shift;
2340             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2341         }
2342     }
2343 }
2344
2345 static void ram_init_bitmaps(RAMState *rs)
2346 {
2347     /* For memory_global_dirty_log_start below.  */
2348     qemu_mutex_lock_iothread();
2349     qemu_mutex_lock_ramlist();
2350
2351     WITH_RCU_READ_LOCK_GUARD() {
2352         ram_list_init_bitmaps();
2353         memory_global_dirty_log_start();
2354         migration_bitmap_sync_precopy(rs);
2355     }
2356     qemu_mutex_unlock_ramlist();
2357     qemu_mutex_unlock_iothread();
2358 }
2359
2360 static int ram_init_all(RAMState **rsp)
2361 {
2362     if (ram_state_init(rsp)) {
2363         return -1;
2364     }
2365
2366     if (xbzrle_init()) {
2367         ram_state_cleanup(rsp);
2368         return -1;
2369     }
2370
2371     ram_init_bitmaps(*rsp);
2372
2373     return 0;
2374 }
2375
2376 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2377 {
2378     RAMBlock *block;
2379     uint64_t pages = 0;
2380
2381     /*
2382      * Postcopy is not using xbzrle/compression, so no need for that.
2383      * Also, since source are already halted, we don't need to care
2384      * about dirty page logging as well.
2385      */
2386
2387     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2388         pages += bitmap_count_one(block->bmap,
2389                                   block->used_length >> TARGET_PAGE_BITS);
2390     }
2391
2392     /* This may not be aligned with current bitmaps. Recalculate. */
2393     rs->migration_dirty_pages = pages;
2394
2395     rs->last_seen_block = NULL;
2396     rs->last_sent_block = NULL;
2397     rs->last_page = 0;
2398     rs->last_version = ram_list.version;
2399     /*
2400      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2401      * matter what we have sent.
2402      */
2403     rs->ram_bulk_stage = false;
2404
2405     /* Update RAMState cache of output QEMUFile */
2406     rs->f = out;
2407
2408     trace_ram_state_resume_prepare(pages);
2409 }
2410
2411 /*
2412  * This function clears bits of the free pages reported by the caller from the
2413  * migration dirty bitmap. @addr is the host address corresponding to the
2414  * start of the continuous guest free pages, and @len is the total bytes of
2415  * those pages.
2416  */
2417 void qemu_guest_free_page_hint(void *addr, size_t len)
2418 {
2419     RAMBlock *block;
2420     ram_addr_t offset;
2421     size_t used_len, start, npages;
2422     MigrationState *s = migrate_get_current();
2423
2424     /* This function is currently expected to be used during live migration */
2425     if (!migration_is_setup_or_active(s->state)) {
2426         return;
2427     }
2428
2429     for (; len > 0; len -= used_len, addr += used_len) {
2430         block = qemu_ram_block_from_host(addr, false, &offset);
2431         if (unlikely(!block || offset >= block->used_length)) {
2432             /*
2433              * The implementation might not support RAMBlock resize during
2434              * live migration, but it could happen in theory with future
2435              * updates. So we add a check here to capture that case.
2436              */
2437             error_report_once("%s unexpected error", __func__);
2438             return;
2439         }
2440
2441         if (len <= block->used_length - offset) {
2442             used_len = len;
2443         } else {
2444             used_len = block->used_length - offset;
2445         }
2446
2447         start = offset >> TARGET_PAGE_BITS;
2448         npages = used_len >> TARGET_PAGE_BITS;
2449
2450         qemu_mutex_lock(&ram_state->bitmap_mutex);
2451         ram_state->migration_dirty_pages -=
2452                       bitmap_count_one_with_offset(block->bmap, start, npages);
2453         bitmap_clear(block->bmap, start, npages);
2454         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2455     }
2456 }
2457
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
2464
2465 /**
2466  * ram_save_setup: Setup RAM for migration
2467  *
2468  * Returns zero to indicate success and negative for error
2469  *
2470  * @f: QEMUFile where to send the data
2471  * @opaque: RAMState pointer
2472  */
2473 static int ram_save_setup(QEMUFile *f, void *opaque)
2474 {
2475     RAMState **rsp = opaque;
2476     RAMBlock *block;
2477
2478     if (compress_threads_save_setup()) {
2479         return -1;
2480     }
2481
2482     /* migration has already setup the bitmap, reuse it. */
2483     if (!migration_in_colo_state()) {
2484         if (ram_init_all(rsp) != 0) {
2485             compress_threads_save_cleanup();
2486             return -1;
2487         }
2488     }
2489     (*rsp)->f = f;
2490
2491     WITH_RCU_READ_LOCK_GUARD() {
2492         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2493
2494         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2495             qemu_put_byte(f, strlen(block->idstr));
2496             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2497             qemu_put_be64(f, block->used_length);
2498             if (migrate_postcopy_ram() && block->page_size !=
2499                                           qemu_host_page_size) {
2500                 qemu_put_be64(f, block->page_size);
2501             }
2502             if (migrate_ignore_shared()) {
2503                 qemu_put_be64(f, block->mr->addr);
2504             }
2505         }
2506     }
2507
2508     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2509     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2510
2511     multifd_send_sync_main(f);
2512     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2513     qemu_fflush(f);
2514
2515     return 0;
2516 }
2517
2518 /**
2519  * ram_save_iterate: iterative stage for migration
2520  *
2521  * Returns zero to indicate success and negative for error
2522  *
2523  * @f: QEMUFile where to send the data
2524  * @opaque: RAMState pointer
2525  */
2526 static int ram_save_iterate(QEMUFile *f, void *opaque)
2527 {
2528     RAMState **temp = opaque;
2529     RAMState *rs = *temp;
2530     int ret = 0;
2531     int i;
2532     int64_t t0;
2533     int done = 0;
2534
2535     if (blk_mig_bulk_active()) {
2536         /* Avoid transferring ram during bulk phase of block migration as
2537          * the bulk phase will usually take a long time and transferring
2538          * ram updates during that time is pointless. */
2539         goto out;
2540     }
2541
2542     WITH_RCU_READ_LOCK_GUARD() {
2543         if (ram_list.version != rs->last_version) {
2544             ram_state_reset(rs);
2545         }
2546
2547         /* Read version before ram_list.blocks */
2548         smp_rmb();
2549
2550         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2551
2552         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2553         i = 0;
2554         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2555                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2556             int pages;
2557
2558             if (qemu_file_get_error(f)) {
2559                 break;
2560             }
2561
2562             pages = ram_find_and_save_block(rs, false);
2563             /* no more pages to sent */
2564             if (pages == 0) {
2565                 done = 1;
2566                 break;
2567             }
2568
2569             if (pages < 0) {
2570                 qemu_file_set_error(f, pages);
2571                 break;
2572             }
2573
2574             rs->target_page_count += pages;
2575
2576             /*
2577              * During postcopy, it is necessary to make sure one whole host
2578              * page is sent in one chunk.
2579              */
2580             if (migrate_postcopy_ram()) {
2581                 flush_compressed_data(rs);
2582             }
2583
2584             /*
2585              * we want to check in the 1st loop, just in case it was the 1st
2586              * time and we had to sync the dirty bitmap.
2587              * qemu_clock_get_ns() is a bit expensive, so we only check each
2588              * some iterations
2589              */
2590             if ((i & 63) == 0) {
2591                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2592                               1000000;
2593                 if (t1 > MAX_WAIT) {
2594                     trace_ram_save_iterate_big_wait(t1, i);
2595                     break;
2596                 }
2597             }
2598             i++;
2599         }
2600     }
2601
2602     /*
2603      * Must occur before EOS (or any QEMUFile operation)
2604      * because of RDMA protocol.
2605      */
2606     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2607
2608 out:
2609     if (ret >= 0
2610         && migration_is_setup_or_active(migrate_get_current()->state)) {
2611         multifd_send_sync_main(rs->f);
2612         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2613         qemu_fflush(f);
2614         ram_counters.transferred += 8;
2615
2616         ret = qemu_file_get_error(f);
2617     }
2618     if (ret < 0) {
2619         return ret;
2620     }
2621
2622     return done;
2623 }
2624
2625 /**
2626  * ram_save_complete: function called to send the remaining amount of ram
2627  *
2628  * Returns zero to indicate success or negative on error
2629  *
2630  * Called with iothread lock
2631  *
2632  * @f: QEMUFile where to send the data
2633  * @opaque: RAMState pointer
2634  */
2635 static int ram_save_complete(QEMUFile *f, void *opaque)
2636 {
2637     RAMState **temp = opaque;
2638     RAMState *rs = *temp;
2639     int ret = 0;
2640
2641     WITH_RCU_READ_LOCK_GUARD() {
2642         if (!migration_in_postcopy()) {
2643             migration_bitmap_sync_precopy(rs);
2644         }
2645
2646         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2647
2648         /* try transferring iterative blocks of memory */
2649
2650         /* flush all remaining blocks regardless of rate limiting */
2651         while (true) {
2652             int pages;
2653
2654             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2655             /* no more blocks to sent */
2656             if (pages == 0) {
2657                 break;
2658             }
2659             if (pages < 0) {
2660                 ret = pages;
2661                 break;
2662             }
2663         }
2664
2665         flush_compressed_data(rs);
2666         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2667     }
2668
2669     if (ret >= 0) {
2670         multifd_send_sync_main(rs->f);
2671         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2672         qemu_fflush(f);
2673     }
2674
2675     return ret;
2676 }
2677
2678 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2679                              uint64_t *res_precopy_only,
2680                              uint64_t *res_compatible,
2681                              uint64_t *res_postcopy_only)
2682 {
2683     RAMState **temp = opaque;
2684     RAMState *rs = *temp;
2685     uint64_t remaining_size;
2686
2687     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2688
2689     if (!migration_in_postcopy() &&
2690         remaining_size < max_size) {
2691         qemu_mutex_lock_iothread();
2692         WITH_RCU_READ_LOCK_GUARD() {
2693             migration_bitmap_sync_precopy(rs);
2694         }
2695         qemu_mutex_unlock_iothread();
2696         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2697     }
2698
2699     if (migrate_postcopy_ram()) {
2700         /* We can do postcopy, and all the data is postcopiable */
2701         *res_compatible += remaining_size;
2702     } else {
2703         *res_precopy_only += remaining_size;
2704     }
2705 }
2706
2707 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2708 {
2709     unsigned int xh_len;
2710     int xh_flags;
2711     uint8_t *loaded_data;
2712
2713     /* extract RLE header */
2714     xh_flags = qemu_get_byte(f);
2715     xh_len = qemu_get_be16(f);
2716
2717     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2718         error_report("Failed to load XBZRLE page - wrong compression!");
2719         return -1;
2720     }
2721
2722     if (xh_len > TARGET_PAGE_SIZE) {
2723         error_report("Failed to load XBZRLE page - len overflow!");
2724         return -1;
2725     }
2726     loaded_data = XBZRLE.decoded_buf;
2727     /* load data and decode */
2728     /* it can change loaded_data to point to an internal buffer */
2729     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2730
2731     /* decode RLE */
2732     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2733                              TARGET_PAGE_SIZE) == -1) {
2734         error_report("Failed to load XBZRLE page - decode error!");
2735         return -1;
2736     }
2737
2738     return 0;
2739 }
2740
2741 /**
2742  * ram_block_from_stream: read a RAMBlock id from the migration stream
2743  *
2744  * Must be called from within a rcu critical section.
2745  *
2746  * Returns a pointer from within the RCU-protected ram_list.
2747  *
2748  * @f: QEMUFile where to read the data from
2749  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2750  */
2751 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2752 {
2753     static RAMBlock *block = NULL;
2754     char id[256];
2755     uint8_t len;
2756
2757     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2758         if (!block) {
2759             error_report("Ack, bad migration stream!");
2760             return NULL;
2761         }
2762         return block;
2763     }
2764
2765     len = qemu_get_byte(f);
2766     qemu_get_buffer(f, (uint8_t *)id, len);
2767     id[len] = 0;
2768
2769     block = qemu_ram_block_by_name(id);
2770     if (!block) {
2771         error_report("Can't find block %s", id);
2772         return NULL;
2773     }
2774
2775     if (ramblock_is_ignored(block)) {
2776         error_report("block %s should not be migrated !", id);
2777         return NULL;
2778     }
2779
2780     return block;
2781 }
2782
2783 static inline void *host_from_ram_block_offset(RAMBlock *block,
2784                                                ram_addr_t offset)
2785 {
2786     if (!offset_in_ramblock(block, offset)) {
2787         return NULL;
2788     }
2789
2790     return block->host + offset;
2791 }
2792
2793 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2794                              ram_addr_t offset, bool record_bitmap)
2795 {
2796     if (!offset_in_ramblock(block, offset)) {
2797         return NULL;
2798     }
2799     if (!block->colo_cache) {
2800         error_report("%s: colo_cache is NULL in block :%s",
2801                      __func__, block->idstr);
2802         return NULL;
2803     }
2804
2805     /*
2806     * During colo checkpoint, we need bitmap of these migrated pages.
2807     * It help us to decide which pages in ram cache should be flushed
2808     * into VM's RAM later.
2809     */
2810     if (record_bitmap &&
2811         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2812         ram_state->migration_dirty_pages++;
2813     }
2814     return block->colo_cache + offset;
2815 }
2816
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with the byte it was determined
 * to consist of.  The write is skipped when the fill byte is zero and
 * the range already reads as all zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to do: asked for zeroes and the range is zero already */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2833
2834 /* return the size after decompression, or negative value on error */
2835 static int
2836 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2837                      const uint8_t *source, size_t source_len)
2838 {
2839     int err;
2840
2841     err = inflateReset(stream);
2842     if (err != Z_OK) {
2843         return -1;
2844     }
2845
2846     stream->avail_in = source_len;
2847     stream->next_in = (uint8_t *)source;
2848     stream->avail_out = dest_len;
2849     stream->next_out = dest;
2850
2851     err = inflate(stream, Z_NO_FLUSH);
2852     if (err != Z_STREAM_END) {
2853         return -1;
2854     }
2855
2856     return stream->total_out;
2857 }
2858
2859 static void *do_data_decompress(void *opaque)
2860 {
2861     DecompressParam *param = opaque;
2862     unsigned long pagesize;
2863     uint8_t *des;
2864     int len, ret;
2865
2866     qemu_mutex_lock(&param->mutex);
2867     while (!param->quit) {
2868         if (param->des) {
2869             des = param->des;
2870             len = param->len;
2871             param->des = 0;
2872             qemu_mutex_unlock(&param->mutex);
2873
2874             pagesize = TARGET_PAGE_SIZE;
2875
2876             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2877                                        param->compbuf, len);
2878             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2879                 error_report("decompress data failed");
2880                 qemu_file_set_error(decomp_file, ret);
2881             }
2882
2883             qemu_mutex_lock(&decomp_done_lock);
2884             param->done = true;
2885             qemu_cond_signal(&decomp_done_cond);
2886             qemu_mutex_unlock(&decomp_done_lock);
2887
2888             qemu_mutex_lock(&param->mutex);
2889         } else {
2890             qemu_cond_wait(&param->cond, &param->mutex);
2891         }
2892     }
2893     qemu_mutex_unlock(&param->mutex);
2894
2895     return NULL;
2896 }
2897
2898 static int wait_for_decompress_done(void)
2899 {
2900     int idx, thread_count;
2901
2902     if (!migrate_use_compression()) {
2903         return 0;
2904     }
2905
2906     thread_count = migrate_decompress_threads();
2907     qemu_mutex_lock(&decomp_done_lock);
2908     for (idx = 0; idx < thread_count; idx++) {
2909         while (!decomp_param[idx].done) {
2910             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2911         }
2912     }
2913     qemu_mutex_unlock(&decomp_done_lock);
2914     return qemu_file_get_error(decomp_file);
2915 }
2916
2917 static void compress_threads_load_cleanup(void)
2918 {
2919     int i, thread_count;
2920
2921     if (!migrate_use_compression()) {
2922         return;
2923     }
2924     thread_count = migrate_decompress_threads();
2925     for (i = 0; i < thread_count; i++) {
2926         /*
2927          * we use it as a indicator which shows if the thread is
2928          * properly init'd or not
2929          */
2930         if (!decomp_param[i].compbuf) {
2931             break;
2932         }
2933
2934         qemu_mutex_lock(&decomp_param[i].mutex);
2935         decomp_param[i].quit = true;
2936         qemu_cond_signal(&decomp_param[i].cond);
2937         qemu_mutex_unlock(&decomp_param[i].mutex);
2938     }
2939     for (i = 0; i < thread_count; i++) {
2940         if (!decomp_param[i].compbuf) {
2941             break;
2942         }
2943
2944         qemu_thread_join(decompress_threads + i);
2945         qemu_mutex_destroy(&decomp_param[i].mutex);
2946         qemu_cond_destroy(&decomp_param[i].cond);
2947         inflateEnd(&decomp_param[i].stream);
2948         g_free(decomp_param[i].compbuf);
2949         decomp_param[i].compbuf = NULL;
2950     }
2951     g_free(decompress_threads);
2952     g_free(decomp_param);
2953     decompress_threads = NULL;
2954     decomp_param = NULL;
2955     decomp_file = NULL;
2956 }
2957
2958 static int compress_threads_load_setup(QEMUFile *f)
2959 {
2960     int i, thread_count;
2961
2962     if (!migrate_use_compression()) {
2963         return 0;
2964     }
2965
2966     thread_count = migrate_decompress_threads();
2967     decompress_threads = g_new0(QemuThread, thread_count);
2968     decomp_param = g_new0(DecompressParam, thread_count);
2969     qemu_mutex_init(&decomp_done_lock);
2970     qemu_cond_init(&decomp_done_cond);
2971     decomp_file = f;
2972     for (i = 0; i < thread_count; i++) {
2973         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2974             goto exit;
2975         }
2976
2977         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2978         qemu_mutex_init(&decomp_param[i].mutex);
2979         qemu_cond_init(&decomp_param[i].cond);
2980         decomp_param[i].done = true;
2981         decomp_param[i].quit = false;
2982         qemu_thread_create(decompress_threads + i, "decompress",
2983                            do_data_decompress, decomp_param + i,
2984                            QEMU_THREAD_JOINABLE);
2985     }
2986     return 0;
2987 exit:
2988     compress_threads_load_cleanup();
2989     return -1;
2990 }
2991
2992 static void decompress_data_with_multi_threads(QEMUFile *f,
2993                                                void *host, int len)
2994 {
2995     int idx, thread_count;
2996
2997     thread_count = migrate_decompress_threads();
2998     qemu_mutex_lock(&decomp_done_lock);
2999     while (true) {
3000         for (idx = 0; idx < thread_count; idx++) {
3001             if (decomp_param[idx].done) {
3002                 decomp_param[idx].done = false;
3003                 qemu_mutex_lock(&decomp_param[idx].mutex);
3004                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3005                 decomp_param[idx].des = host;
3006                 decomp_param[idx].len = len;
3007                 qemu_cond_signal(&decomp_param[idx].cond);
3008                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3009                 break;
3010             }
3011         }
3012         if (idx < thread_count) {
3013             break;
3014         } else {
3015             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3016         }
3017     }
3018     qemu_mutex_unlock(&decomp_done_lock);
3019 }
3020
3021 /*
3022  * colo cache: this is for secondary VM, we cache the whole
3023  * memory of the secondary VM, it is need to hold the global lock
3024  * to call this helper.
3025  */
3026 int colo_init_ram_cache(void)
3027 {
3028     RAMBlock *block;
3029
3030     WITH_RCU_READ_LOCK_GUARD() {
3031         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3032             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3033                                                     NULL,
3034                                                     false);
3035             if (!block->colo_cache) {
3036                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3037                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3038                              block->used_length);
3039                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3040                     if (block->colo_cache) {
3041                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3042                         block->colo_cache = NULL;
3043                     }
3044                 }
3045                 return -errno;
3046             }
3047         }
3048     }
3049
3050     /*
3051     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3052     * with to decide which page in cache should be flushed into SVM's RAM. Here
3053     * we use the same name 'ram_bitmap' as for migration.
3054     */
3055     if (ram_bytes_total()) {
3056         RAMBlock *block;
3057
3058         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3059             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3060             block->bmap = bitmap_new(pages);
3061         }
3062     }
3063
3064     ram_state_init(&ram_state);
3065     return 0;
3066 }
3067
3068 /* TODO: duplicated with ram_init_bitmaps */
3069 void colo_incoming_start_dirty_log(void)
3070 {
3071     RAMBlock *block = NULL;
3072     /* For memory_global_dirty_log_start below. */
3073     qemu_mutex_lock_iothread();
3074     qemu_mutex_lock_ramlist();
3075
3076     memory_global_dirty_log_sync();
3077     WITH_RCU_READ_LOCK_GUARD() {
3078         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3079             ramblock_sync_dirty_bitmap(ram_state, block);
3080             /* Discard this dirty bitmap record */
3081             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3082         }
3083         memory_global_dirty_log_start();
3084     }
3085     ram_state->migration_dirty_pages = 0;
3086     qemu_mutex_unlock_ramlist();
3087     qemu_mutex_unlock_iothread();
3088 }
3089
3090 /* It is need to hold the global lock to call this helper */
3091 void colo_release_ram_cache(void)
3092 {
3093     RAMBlock *block;
3094
3095     memory_global_dirty_log_stop();
3096     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3097         g_free(block->bmap);
3098         block->bmap = NULL;
3099     }
3100
3101     WITH_RCU_READ_LOCK_GUARD() {
3102         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3103             if (block->colo_cache) {
3104                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3105                 block->colo_cache = NULL;
3106             }
3107         }
3108     }
3109     ram_state_cleanup(&ram_state);
3110 }
3111
3112 /**
3113  * ram_load_setup: Setup RAM for migration incoming side
3114  *
3115  * Returns zero to indicate success and negative for error
3116  *
3117  * @f: QEMUFile where to receive the data
3118  * @opaque: RAMState pointer
3119  */
3120 static int ram_load_setup(QEMUFile *f, void *opaque)
3121 {
3122     if (compress_threads_load_setup(f)) {
3123         return -1;
3124     }
3125
3126     xbzrle_load_setup();
3127     ramblock_recv_map_init();
3128
3129     return 0;
3130 }
3131
3132 static int ram_load_cleanup(void *opaque)
3133 {
3134     RAMBlock *rb;
3135
3136     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3137         qemu_ram_block_writeback(rb);
3138     }
3139
3140     xbzrle_load_cleanup();
3141     compress_threads_load_cleanup();
3142
3143     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3144         g_free(rb->receivedmap);
3145         rb->receivedmap = NULL;
3146     }
3147
3148     return 0;
3149 }
3150
3151 /**
3152  * ram_postcopy_incoming_init: allocate postcopy data structures
3153  *
3154  * Returns 0 for success and negative if there was one error
3155  *
3156  * @mis: current migration incoming state
3157  *
3158  * Allocate data structures etc needed by incoming migration with
3159  * postcopy-ram. postcopy-ram's similarly names
3160  * postcopy_ram_incoming_init does the work.
3161  */
3162 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3163 {
3164     return postcopy_ram_incoming_init(mis);
3165 }
3166
3167 /**
3168  * ram_load_postcopy: load a page in postcopy case
3169  *
3170  * Returns 0 for success or -errno in case of error
3171  *
3172  * Called in postcopy mode by ram_load().
3173  * rcu_read_lock is taken prior to this being called.
3174  *
3175  * @f: QEMUFile where to send the data
3176  */
3177 static int ram_load_postcopy(QEMUFile *f)
3178 {
3179     int flags = 0, ret = 0;
3180     bool place_needed = false;
3181     bool matches_target_page_size = false;
3182     MigrationIncomingState *mis = migration_incoming_get_current();
3183     /* Temporary page that is later 'placed' */
3184     void *postcopy_host_page = mis->postcopy_tmp_page;
3185     void *this_host = NULL;
3186     bool all_zero = true;
3187     int target_pages = 0;
3188
3189     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3190         ram_addr_t addr;
3191         void *host = NULL;
3192         void *page_buffer = NULL;
3193         void *place_source = NULL;
3194         RAMBlock *block = NULL;
3195         uint8_t ch;
3196         int len;
3197
3198         addr = qemu_get_be64(f);
3199
3200         /*
3201          * If qemu file error, we should stop here, and then "addr"
3202          * may be invalid
3203          */
3204         ret = qemu_file_get_error(f);
3205         if (ret) {
3206             break;
3207         }
3208
3209         flags = addr & ~TARGET_PAGE_MASK;
3210         addr &= TARGET_PAGE_MASK;
3211
3212         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3213         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3214                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3215             block = ram_block_from_stream(f, flags);
3216
3217             host = host_from_ram_block_offset(block, addr);
3218             if (!host) {
3219                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3220                 ret = -EINVAL;
3221                 break;
3222             }
3223             target_pages++;
3224             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3225             /*
3226              * Postcopy requires that we place whole host pages atomically;
3227              * these may be huge pages for RAMBlocks that are backed by
3228              * hugetlbfs.
3229              * To make it atomic, the data is read into a temporary page
3230              * that's moved into place later.
3231              * The migration protocol uses,  possibly smaller, target-pages
3232              * however the source ensures it always sends all the components
3233              * of a host page in one chunk.
3234              */
3235             page_buffer = postcopy_host_page +
3236                           ((uintptr_t)host & (block->page_size - 1));
3237             if (target_pages == 1) {
3238                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3239                                                     block->page_size);
3240             } else {
3241                 /* not the 1st TP within the HP */
3242                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3243                     (uintptr_t)this_host) {
3244                     error_report("Non-same host page %p/%p",
3245                                   host, this_host);
3246                     ret = -EINVAL;
3247                     break;
3248                 }
3249             }
3250
3251             /*
3252              * If it's the last part of a host page then we place the host
3253              * page
3254              */
3255             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3256                 place_needed = true;
3257             }
3258             place_source = postcopy_host_page;
3259         }
3260
3261         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3262         case RAM_SAVE_FLAG_ZERO:
3263             ch = qemu_get_byte(f);
3264             /*
3265              * Can skip to set page_buffer when
3266              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3267              */
3268             if (ch || !matches_target_page_size) {
3269                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3270             }
3271             if (ch) {
3272                 all_zero = false;
3273             }
3274             break;
3275
3276         case RAM_SAVE_FLAG_PAGE:
3277             all_zero = false;
3278             if (!matches_target_page_size) {
3279                 /* For huge pages, we always use temporary buffer */
3280                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3281             } else {
3282                 /*
3283                  * For small pages that matches target page size, we
3284                  * avoid the qemu_file copy.  Instead we directly use
3285                  * the buffer of QEMUFile to place the page.  Note: we
3286                  * cannot do any QEMUFile operation before using that
3287                  * buffer to make sure the buffer is valid when
3288                  * placing the page.
3289                  */
3290                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3291                                          TARGET_PAGE_SIZE);
3292             }
3293             break;
3294         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3295             all_zero = false;
3296             len = qemu_get_be32(f);
3297             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3298                 error_report("Invalid compressed data length: %d", len);
3299                 ret = -EINVAL;
3300                 break;
3301             }
3302             decompress_data_with_multi_threads(f, page_buffer, len);
3303             break;
3304
3305         case RAM_SAVE_FLAG_EOS:
3306             /* normal exit */
3307             multifd_recv_sync_main();
3308             break;
3309         default:
3310             error_report("Unknown combination of migration flags: %#x"
3311                          " (postcopy mode)", flags);
3312             ret = -EINVAL;
3313             break;
3314         }
3315
3316         /* Got the whole host page, wait for decompress before placing. */
3317         if (place_needed) {
3318             ret |= wait_for_decompress_done();
3319         }
3320
3321         /* Detect for any possible file errors */
3322         if (!ret && qemu_file_get_error(f)) {
3323             ret = qemu_file_get_error(f);
3324         }
3325
3326         if (!ret && place_needed) {
3327             /* This gets called at the last target page in the host page */
3328             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3329                                                        block->page_size);
3330
3331             if (all_zero) {
3332                 ret = postcopy_place_page_zero(mis, place_dest,
3333                                                block);
3334             } else {
3335                 ret = postcopy_place_page(mis, place_dest,
3336                                           place_source, block);
3337             }
3338             place_needed = false;
3339             target_pages = 0;
3340             /* Assume we have a zero page until we detect something different */
3341             all_zero = true;
3342         }
3343     }
3344
3345     return ret;
3346 }
3347
3348 static bool postcopy_is_advised(void)
3349 {
3350     PostcopyState ps = postcopy_state_get();
3351     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3352 }
3353
3354 static bool postcopy_is_running(void)
3355 {
3356     PostcopyState ps = postcopy_state_get();
3357     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3358 }
3359
3360 /*
3361  * Flush content of RAM cache into SVM's memory.
3362  * Only flush the pages that be dirtied by PVM or SVM or both.
3363  */
3364 void colo_flush_ram_cache(void)
3365 {
3366     RAMBlock *block = NULL;
3367     void *dst_host;
3368     void *src_host;
3369     unsigned long offset = 0;
3370
3371     memory_global_dirty_log_sync();
3372     WITH_RCU_READ_LOCK_GUARD() {
3373         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3374             ramblock_sync_dirty_bitmap(ram_state, block);
3375         }
3376     }
3377
3378     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3379     WITH_RCU_READ_LOCK_GUARD() {
3380         block = QLIST_FIRST_RCU(&ram_list.blocks);
3381
3382         while (block) {
3383             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3384
3385             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3386                 >= block->used_length) {
3387                 offset = 0;
3388                 block = QLIST_NEXT_RCU(block, next);
3389             } else {
3390                 migration_bitmap_clear_dirty(ram_state, block, offset);
3391                 dst_host = block->host
3392                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3393                 src_host = block->colo_cache
3394                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3395                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3396             }
3397         }
3398     }
3399     trace_colo_flush_ram_cache_end();
3400 }
3401
3402 /**
3403  * ram_load_precopy: load pages in precopy case
3404  *
3405  * Returns 0 for success or -errno in case of error
3406  *
3407  * Called in precopy mode by ram_load().
3408  * rcu_read_lock is taken prior to this being called.
3409  *
3410  * @f: QEMUFile where to send the data
3411  */
3412 static int ram_load_precopy(QEMUFile *f)
3413 {
3414     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3415     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3416     bool postcopy_advised = postcopy_is_advised();
3417     if (!migrate_use_compression()) {
3418         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3419     }
3420
3421     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3422         ram_addr_t addr, total_ram_bytes;
3423         void *host = NULL, *host_bak = NULL;
3424         uint8_t ch;
3425
3426         /*
3427          * Yield periodically to let main loop run, but an iteration of
3428          * the main loop is expensive, so do it each some iterations
3429          */
3430         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3431             aio_co_schedule(qemu_get_current_aio_context(),
3432                             qemu_coroutine_self());
3433             qemu_coroutine_yield();
3434         }
3435         i++;
3436
3437         addr = qemu_get_be64(f);
3438         flags = addr & ~TARGET_PAGE_MASK;
3439         addr &= TARGET_PAGE_MASK;
3440
3441         if (flags & invalid_flags) {
3442             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3443                 error_report("Received an unexpected compressed page");
3444             }
3445
3446             ret = -EINVAL;
3447             break;
3448         }
3449
3450         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3451                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3452             RAMBlock *block = ram_block_from_stream(f, flags);
3453
3454             host = host_from_ram_block_offset(block, addr);
3455             /*
3456              * After going into COLO stage, we should not load the page
3457              * into SVM's memory directly, we put them into colo_cache firstly.
3458              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3459              * Previously, we copied all these memory in preparing stage of COLO
3460              * while we need to stop VM, which is a time-consuming process.
3461              * Here we optimize it by a trick, back-up every page while in
3462              * migration process while COLO is enabled, though it affects the
3463              * speed of the migration, but it obviously reduce the downtime of
3464              * back-up all SVM'S memory in COLO preparing stage.
3465              */
3466             if (migration_incoming_colo_enabled()) {
3467                 if (migration_incoming_in_colo_state()) {
3468                     /* In COLO stage, put all pages into cache temporarily */
3469                     host = colo_cache_from_block_offset(block, addr, true);
3470                 } else {
3471                    /*
3472                     * In migration stage but before COLO stage,
3473                     * Put all pages into both cache and SVM's memory.
3474                     */
3475                     host_bak = colo_cache_from_block_offset(block, addr, false);
3476                 }
3477             }
3478             if (!host) {
3479                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3480                 ret = -EINVAL;
3481                 break;
3482             }
3483             if (!migration_incoming_in_colo_state()) {
3484                 ramblock_recv_bitmap_set(block, host);
3485             }
3486
3487             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3488         }
3489
3490         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3491         case RAM_SAVE_FLAG_MEM_SIZE:
3492             /* Synchronize RAM block list */
3493             total_ram_bytes = addr;
3494             while (!ret && total_ram_bytes) {
3495                 RAMBlock *block;
3496                 char id[256];
3497                 ram_addr_t length;
3498
3499                 len = qemu_get_byte(f);
3500                 qemu_get_buffer(f, (uint8_t *)id, len);
3501                 id[len] = 0;
3502                 length = qemu_get_be64(f);
3503
3504                 block = qemu_ram_block_by_name(id);
3505                 if (block && !qemu_ram_is_migratable(block)) {
3506                     error_report("block %s should not be migrated !", id);
3507                     ret = -EINVAL;
3508                 } else if (block) {
3509                     if (length != block->used_length) {
3510                         Error *local_err = NULL;
3511
3512                         ret = qemu_ram_resize(block, length,
3513                                               &local_err);
3514                         if (local_err) {
3515                             error_report_err(local_err);
3516                         }
3517                     }
3518                     /* For postcopy we need to check hugepage sizes match */
3519                     if (postcopy_advised &&
3520                         block->page_size != qemu_host_page_size) {
3521                         uint64_t remote_page_size = qemu_get_be64(f);
3522                         if (remote_page_size != block->page_size) {
3523                             error_report("Mismatched RAM page size %s "
3524                                          "(local) %zd != %" PRId64,
3525                                          id, block->page_size,
3526                                          remote_page_size);
3527                             ret = -EINVAL;
3528                         }
3529                     }
3530                     if (migrate_ignore_shared()) {
3531                         hwaddr addr = qemu_get_be64(f);
3532                         if (ramblock_is_ignored(block) &&
3533                             block->mr->addr != addr) {
3534                             error_report("Mismatched GPAs for block %s "
3535                                          "%" PRId64 "!= %" PRId64,
3536                                          id, (uint64_t)addr,
3537                                          (uint64_t)block->mr->addr);
3538                             ret = -EINVAL;
3539                         }
3540                     }
3541                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3542                                           block->idstr);
3543                 } else {
3544                     error_report("Unknown ramblock \"%s\", cannot "
3545                                  "accept migration", id);
3546                     ret = -EINVAL;
3547                 }
3548
3549                 total_ram_bytes -= length;
3550             }
3551             break;
3552
3553         case RAM_SAVE_FLAG_ZERO:
3554             ch = qemu_get_byte(f);
3555             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3556             break;
3557
3558         case RAM_SAVE_FLAG_PAGE:
3559             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3560             break;
3561
3562         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3563             len = qemu_get_be32(f);
3564             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3565                 error_report("Invalid compressed data length: %d", len);
3566                 ret = -EINVAL;
3567                 break;
3568             }
3569             decompress_data_with_multi_threads(f, host, len);
3570             break;
3571
3572         case RAM_SAVE_FLAG_XBZRLE:
3573             if (load_xbzrle(f, addr, host) < 0) {
3574                 error_report("Failed to decompress XBZRLE page at "
3575                              RAM_ADDR_FMT, addr);
3576                 ret = -EINVAL;
3577                 break;
3578             }
3579             break;
3580         case RAM_SAVE_FLAG_EOS:
3581             /* normal exit */
3582             multifd_recv_sync_main();
3583             break;
3584         default:
3585             if (flags & RAM_SAVE_FLAG_HOOK) {
3586                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3587             } else {
3588                 error_report("Unknown combination of migration flags: %#x",
3589                              flags);
3590                 ret = -EINVAL;
3591             }
3592         }
3593         if (!ret) {
3594             ret = qemu_file_get_error(f);
3595         }
3596         if (!ret && host_bak) {
3597             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3598         }
3599     }
3600
3601     ret |= wait_for_decompress_done();
3602     return ret;
3603 }
3604
3605 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3606 {
3607     int ret = 0;
3608     static uint64_t seq_iter;
3609     /*
3610      * If system is running in postcopy mode, page inserts to host memory must
3611      * be atomic
3612      */
3613     bool postcopy_running = postcopy_is_running();
3614
3615     seq_iter++;
3616
3617     if (version_id != 4) {
3618         return -EINVAL;
3619     }
3620
3621     /*
3622      * This RCU critical section can be very long running.
3623      * When RCU reclaims in the code start to become numerous,
3624      * it will be necessary to reduce the granularity of this
3625      * critical section.
3626      */
3627     WITH_RCU_READ_LOCK_GUARD() {
3628         if (postcopy_running) {
3629             ret = ram_load_postcopy(f);
3630         } else {
3631             ret = ram_load_precopy(f);
3632         }
3633     }
3634     trace_ram_load_complete(ret, seq_iter);
3635
3636     return ret;
3637 }
3638
3639 static bool ram_has_postcopy(void *opaque)
3640 {
3641     RAMBlock *rb;
3642     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3643         if (ramblock_is_pmem(rb)) {
3644             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3645                          "is not supported now!", rb->idstr, rb->host);
3646             return false;
3647         }
3648     }
3649
3650     return migrate_postcopy_ram();
3651 }
3652
3653 /* Sync all the dirty bitmap with destination VM.  */
3654 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3655 {
3656     RAMBlock *block;
3657     QEMUFile *file = s->to_dst_file;
3658     int ramblock_count = 0;
3659
3660     trace_ram_dirty_bitmap_sync_start();
3661
3662     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3663         qemu_savevm_send_recv_bitmap(file, block->idstr);
3664         trace_ram_dirty_bitmap_request(block->idstr);
3665         ramblock_count++;
3666     }
3667
3668     trace_ram_dirty_bitmap_sync_wait();
3669
3670     /* Wait until all the ramblocks' dirty bitmap synced */
3671     while (ramblock_count--) {
3672         qemu_sem_wait(&s->rp_state.rp_sem);
3673     }
3674
3675     trace_ram_dirty_bitmap_sync_complete();
3676
3677     return 0;
3678 }
3679
3680 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3681 {
3682     qemu_sem_post(&s->rp_state.rp_sem);
3683 }
3684
3685 /*
3686  * Read the received bitmap, revert it as the initial dirty bitmap.
3687  * This is only used when the postcopy migration is paused but wants
3688  * to resume from a middle point.
3689  */
3690 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3691 {
3692     int ret = -EINVAL;
3693     QEMUFile *file = s->rp_state.from_dst_file;
3694     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3695     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3696     uint64_t size, end_mark;
3697
3698     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3699
3700     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3701         error_report("%s: incorrect state %s", __func__,
3702                      MigrationStatus_str(s->state));
3703         return -EINVAL;
3704     }
3705
3706     /*
3707      * Note: see comments in ramblock_recv_bitmap_send() on why we
3708      * need the endianess convertion, and the paddings.
3709      */
3710     local_size = ROUND_UP(local_size, 8);
3711
3712     /* Add paddings */
3713     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3714
3715     size = qemu_get_be64(file);
3716
3717     /* The size of the bitmap should match with our ramblock */
3718     if (size != local_size) {
3719         error_report("%s: ramblock '%s' bitmap size mismatch "
3720                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3721                      block->idstr, size, local_size);
3722         ret = -EINVAL;
3723         goto out;
3724     }
3725
3726     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3727     end_mark = qemu_get_be64(file);
3728
3729     ret = qemu_file_get_error(file);
3730     if (ret || size != local_size) {
3731         error_report("%s: read bitmap failed for ramblock '%s': %d"
3732                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3733                      __func__, block->idstr, ret, local_size, size);
3734         ret = -EIO;
3735         goto out;
3736     }
3737
3738     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3739         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3740                      __func__, block->idstr, end_mark);
3741         ret = -EINVAL;
3742         goto out;
3743     }
3744
3745     /*
3746      * Endianess convertion. We are during postcopy (though paused).
3747      * The dirty bitmap won't change. We can directly modify it.
3748      */
3749     bitmap_from_le(block->bmap, le_bitmap, nbits);
3750
3751     /*
3752      * What we received is "received bitmap". Revert it as the initial
3753      * dirty bitmap for this ramblock.
3754      */
3755     bitmap_complement(block->bmap, block->bmap, nbits);
3756
3757     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3758
3759     /*
3760      * We succeeded to sync bitmap for current ramblock. If this is
3761      * the last one to sync, we need to notify the main send thread.
3762      */
3763     ram_dirty_bitmap_reload_notify(s);
3764
3765     ret = 0;
3766 out:
3767     g_free(le_bitmap);
3768     return ret;
3769 }
3770
3771 static int ram_resume_prepare(MigrationState *s, void *opaque)
3772 {
3773     RAMState *rs = *(RAMState **)opaque;
3774     int ret;
3775
3776     ret = ram_dirty_bitmap_sync_all(s, rs);
3777     if (ret) {
3778         return ret;
3779     }
3780
3781     ram_state_resume_prepare(rs, s->to_dst_file);
3782
3783     return 0;
3784 }
3785
3786 static SaveVMHandlers savevm_ram_handlers = {
3787     .save_setup = ram_save_setup,
3788     .save_live_iterate = ram_save_iterate,
3789     .save_live_complete_postcopy = ram_save_complete,
3790     .save_live_complete_precopy = ram_save_complete,
3791     .has_postcopy = ram_has_postcopy,
3792     .save_live_pending = ram_save_pending,
3793     .load_state = ram_load,
3794     .save_cleanup = ram_save_cleanup,
3795     .load_setup = ram_load_setup,
3796     .load_cleanup = ram_load_cleanup,
3797     .resume_prepare = ram_resume_prepare,
3798 };
3799
3800 void ram_mig_init(void)
3801 {
3802     qemu_mutex_init(&XBZRLE.lock);
3803     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3804 }