migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 *
 * These flags are OR-ed into the low bits of the page offset that
 * save_page_header() writes to the stream.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  77
/*
 * is_zero_range: thin wrapper around buffer_is_zero()
 *
 * Returns whatever buffer_is_zero() reports for the @size bytes at @p.
 */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}
  82
/* XBZRLE statistics; updated by save_xbzrle_page() and the rate code below */
XBZRLECacheStats xbzrle_counters;
  84
/*
 * XBZRLE working state: the page cache plus the scratch buffers and
 * the static zero page used while encoding and decoding.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken by XBZRLE_cache_lock(), released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 100
 101 static void XBZRLE_cache_lock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_lock(&XBZRLE.lock);
 105 }
 106
 107 static void XBZRLE_cache_unlock(void)
 108 {
 109     if (migrate_use_xbzrle())
 110         qemu_mutex_unlock(&XBZRLE.lock);
 111 }
 112
 113 /**
 114  * xbzrle_cache_resize: resize the xbzrle cache
 115  *
 116  * This function is called from qmp_migrate_set_cache_size in main
 117  * thread, possibly while a migration is in progress.  A running
 118  * migration may be using the cache and might finish during this call,
 119  * hence changes to the cache are protected by XBZRLE.lock().
 120  *
 121  * Returns 0 for success or -1 for error
 122  *
 123  * @new_size: new cache size
 124  * @errp: set *errp if the check failed, with reason
 125  */
 126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127 {
 128     PageCache *new_cache;
 129     int64_t ret = 0;
 130
 131     /* Check for truncation */
 132     if (new_size != (size_t)new_size) {
 133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                    "exceeding address space");
 135         return -1;
 136     }
 137
 138     if (new_size == migrate_xbzrle_cache_size()) {
 139         /* nothing to do */
 140         return 0;
 141     }
 142
 143     XBZRLE_cache_lock();
 144
 145     if (XBZRLE.cache != NULL) {
 146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147         if (!new_cache) {
 148             ret = -1;
 149             goto out;
 150         }
 151
 152         cache_fini(XBZRLE.cache);
 153         XBZRLE.cache = new_cache;
 154     }
 155 out:
 156     XBZRLE_cache_unlock();
 157     return ret;
 158 }
 159
 160 static bool ramblock_is_ignored(RAMBlock *block)
 161 {
 162     return !qemu_ram_is_migratable(block) ||
 163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 164 }
 165
/*
 * Filtered RAMBlock iterators.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 *
 * Each macro expands to INTERNAL_RAMBLOCK_FOREACH() followed by
 * "if (<skip condition>) {} else", so the statement written after the
 * macro becomes the else-branch and only runs for blocks that pass the
 * filter.
 */
/* Visit every block for which ramblock_is_ignored() is false */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Visit every block for which qemu_ram_is_migratable() is true */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably removed so that code below must pick one of
 * the filtered iterators above - confirm against exec/ramlist.h.
 */
#undef RAMBLOCK_FOREACH
 176
 177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 178 {
 179     RAMBlock *block;
 180     int ret = 0;
 181
 182     RCU_READ_LOCK_GUARD();
 183
 184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 185         ret = func(block, opaque);
 186         if (ret) {
 187             break;
 188         }
 189     }
 190     return ret;
 191 }
 192
 193 static void ramblock_recv_map_init(void)
 194 {
 195     RAMBlock *rb;
 196
 197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 198         assert(!rb->receivedmap);
 199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 200     }
 201 }
 202
 203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 204 {
 205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 206                     rb->receivedmap);
 207 }
 208
 209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 210 {
 211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 212 }
 213
 214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 215 {
 216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 217 }
 218
 219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 220                                     size_t nr)
 221 {
 222     bitmap_set_atomic(rb->receivedmap,
 223                       ramblock_recv_bitmap_offset(host_addr, rb),
 224                       nr);
 225 }
 226
 227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 228
 229 /*
 230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 231  *
 232  * Returns >0 if success with sent bytes, or <0 if error.
 233  */
 234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 235                                   const char *block_name)
 236 {
 237     RAMBlock *block = qemu_ram_block_by_name(block_name);
 238     unsigned long *le_bitmap, nbits;
 239     uint64_t size;
 240
 241     if (!block) {
 242         error_report("%s: invalid block name: %s", __func__, block_name);
 243         return -1;
 244     }
 245
 246     nbits = block->used_length >> TARGET_PAGE_BITS;
 247
 248     /*
 249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 250      * machines we may need 4 more bytes for padding (see below
 251      * comment). So extend it a bit before hand.
 252      */
 253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 254
 255     /*
 256      * Always use little endian when sending the bitmap. This is
 257      * required that when source and destination VMs are not using the
 258      * same endianess. (Note: big endian won't work.)
 259      */
 260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 261
 262     /* Size of the bitmap, in bytes */
 263     size = DIV_ROUND_UP(nbits, 8);
 264
 265     /*
 266      * size is always aligned to 8 bytes for 64bit machines, but it
 267      * may not be true for 32bit machines. We need this padding to
 268      * make sure the migration can survive even between 32bit and
 269      * 64bit machines.
 270      */
 271     size = ROUND_UP(size, 8);
 272
 273     qemu_put_be64(file, size);
 274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 275     /*
 276      * Mark as an end, in case the middle part is screwed up due to
 277      * some "misterious" reason.
 278      */
 279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 280     qemu_fflush(file);
 281
 282     g_free(le_bitmap);
 283
 284     if (qemu_file_get_error(file)) {
 285         return qemu_file_get_error(file);
 286     }
 287
 288     return size + sizeof(size);
 289 }
 290
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start offset inside rb - confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the requested range - confirm */
    hwaddr    len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 302
/*
 * State of RAM for migration.
 *
 * A single instance is reachable through the file-scope ram_state
 * pointer.
 */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 358
/* Current RAM migration state; readers below treat NULL as "no state" */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 362
/* Initialise the (empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 367
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 372
/* Unregister @n via notifier_with_return_remove(). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 377
 378 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 379 {
 380     PrecopyNotifyData pnd;
 381     pnd.reason = reason;
 382     pnd.errp = errp;
 383
 384     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 385 }
 386
 387 void precopy_enable_free_page_optimization(void)
 388 {
 389     if (!ram_state) {
 390         return;
 391     }
 392
 393     ram_state->fpo_enabled = true;
 394 }
 395
 396 uint64_t ram_bytes_remaining(void)
 397 {
 398     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 399                        0;
 400 }
 401
/* Global RAM migration statistics (transferred, normal, duplicate, ...) */
MigrationStats ram_counters;
 403
/*
 * Cursor used by the search for pages to send: which block and page
 * the search is at, and whether it has wrapped around.
 */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 414
/* Global statistics for multi-thread compression (pages, busy, ...) */
CompressionStats compression_counters;
 416
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* set under mutex to ask the worker to exit */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* dummy QEMUFile used as the output buffer; NULL until set up */
    QEMUFile *file;
    /* mutex/cond protect quit, block and offset and wake the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress; the worker resets block to NULL when it takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    /* zlib stream, initialised with deflateInit() */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 432
/*
 * Per-thread state of one decompression worker.
 *
 * NOTE(review): the users of this struct are outside this part of the
 * file; the field notes below are assumptions based on CompressParam
 * and should be confirmed.
 */
struct DecompressParam {
    /* presumably: worker is idle / finished with its last page */
    bool done;
    /* presumably: request for the worker to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination of the decompressed page */
    void *des;
    /* presumably: compressed input data, len bytes long */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 444
/* Arrays with one entry per compression thread; NULL when not set up */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/*
 * Decompression counterparts.
 * NOTE(review): their users are outside this part of the file.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration; called by do_data_compress() below */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 464
 465 static void *do_data_compress(void *opaque)
 466 {
 467     CompressParam *param = opaque;
 468     RAMBlock *block;
 469     ram_addr_t offset;
 470     bool zero_page;
 471
 472     qemu_mutex_lock(&param->mutex);
 473     while (!param->quit) {
 474         if (param->block) {
 475             block = param->block;
 476             offset = param->offset;
 477             param->block = NULL;
 478             qemu_mutex_unlock(&param->mutex);
 479
 480             zero_page = do_compress_ram_page(param->file, &param->stream,
 481                                              block, offset, param->originbuf);
 482
 483             qemu_mutex_lock(&comp_done_lock);
 484             param->done = true;
 485             param->zero_page = zero_page;
 486             qemu_cond_signal(&comp_done_cond);
 487             qemu_mutex_unlock(&comp_done_lock);
 488
 489             qemu_mutex_lock(&param->mutex);
 490         } else {
 491             qemu_cond_wait(&param->cond, &param->mutex);
 492         }
 493     }
 494     qemu_mutex_unlock(&param->mutex);
 495
 496     return NULL;
 497 }
 498
 499 static void compress_threads_save_cleanup(void)
 500 {
 501     int i, thread_count;
 502
 503     if (!migrate_use_compression() || !comp_param) {
 504         return;
 505     }
 506
 507     thread_count = migrate_compress_threads();
 508     for (i = 0; i < thread_count; i++) {
 509         /*
 510          * we use it as a indicator which shows if the thread is
 511          * properly init'd or not
 512          */
 513         if (!comp_param[i].file) {
 514             break;
 515         }
 516
 517         qemu_mutex_lock(&comp_param[i].mutex);
 518         comp_param[i].quit = true;
 519         qemu_cond_signal(&comp_param[i].cond);
 520         qemu_mutex_unlock(&comp_param[i].mutex);
 521
 522         qemu_thread_join(compress_threads + i);
 523         qemu_mutex_destroy(&comp_param[i].mutex);
 524         qemu_cond_destroy(&comp_param[i].cond);
 525         deflateEnd(&comp_param[i].stream);
 526         g_free(comp_param[i].originbuf);
 527         qemu_fclose(comp_param[i].file);
 528         comp_param[i].file = NULL;
 529     }
 530     qemu_mutex_destroy(&comp_done_lock);
 531     qemu_cond_destroy(&comp_done_cond);
 532     g_free(compress_threads);
 533     g_free(comp_param);
 534     compress_threads = NULL;
 535     comp_param = NULL;
 536 }
 537
 538 static int compress_threads_save_setup(void)
 539 {
 540     int i, thread_count;
 541
 542     if (!migrate_use_compression()) {
 543         return 0;
 544     }
 545     thread_count = migrate_compress_threads();
 546     compress_threads = g_new0(QemuThread, thread_count);
 547     comp_param = g_new0(CompressParam, thread_count);
 548     qemu_cond_init(&comp_done_cond);
 549     qemu_mutex_init(&comp_done_lock);
 550     for (i = 0; i < thread_count; i++) {
 551         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 552         if (!comp_param[i].originbuf) {
 553             goto exit;
 554         }
 555
 556         if (deflateInit(&comp_param[i].stream,
 557                         migrate_compress_level()) != Z_OK) {
 558             g_free(comp_param[i].originbuf);
 559             goto exit;
 560         }
 561
 562         /* comp_param[i].file is just used as a dummy buffer to save data,
 563          * set its ops to empty.
 564          */
 565         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 566         comp_param[i].done = true;
 567         comp_param[i].quit = false;
 568         qemu_mutex_init(&comp_param[i].mutex);
 569         qemu_cond_init(&comp_param[i].cond);
 570         qemu_thread_create(compress_threads + i, "compress",
 571                            do_data_compress, comp_param + i,
 572                            QEMU_THREAD_JOINABLE);
 573     }
 574     return 0;
 575
 576 exit:
 577     compress_threads_save_cleanup();
 578     return -1;
 579 }
 580
 581 /**
 582  * save_page_header: write page header to wire
 583  *
 584  * If this is the 1st block, it also writes the block identification
 585  *
 586  * Returns the number of bytes written
 587  *
 588  * @f: QEMUFile where to send the data
 589  * @block: block that contains the page we want to send
 590  * @offset: offset inside the block for the page
 591  *          in the lower bits, it contains flags
 592  */
 593 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 594                                ram_addr_t offset)
 595 {
 596     size_t size, len;
 597
 598     if (block == rs->last_sent_block) {
 599         offset |= RAM_SAVE_FLAG_CONTINUE;
 600     }
 601     qemu_put_be64(f, offset);
 602     size = 8;
 603
 604     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 605         len = strlen(block->idstr);
 606         qemu_put_byte(f, len);
 607         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 608         size += 1 + len;
 609         rs->last_sent_block = block;
 610     }
 611     return size;
 612 }
 613
 614 /**
 615  * mig_throttle_guest_down: throotle down the guest
 616  *
 617  * Reduce amount of guest cpu execution to hopefully slow down memory
 618  * writes. If guest dirty memory rate is reduced below the rate at
 619  * which we can transfer pages to the destination then we should be
 620  * able to complete migration. Some workloads dirty memory way too
 621  * fast and will not effectively converge, even with auto-converge.
 622  */
 623 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 624                                     uint64_t bytes_dirty_threshold)
 625 {
 626     MigrationState *s = migrate_get_current();
 627     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 628     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 629     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 630     int pct_max = s->parameters.max_cpu_throttle;
 631
 632     uint64_t throttle_now = cpu_throttle_get_percentage();
 633     uint64_t cpu_now, cpu_ideal, throttle_inc;
 634
 635     /* We have not started throttling yet. Let's start it. */
 636     if (!cpu_throttle_active()) {
 637         cpu_throttle_set(pct_initial);
 638     } else {
 639         /* Throttling already on, just increase the rate */
 640         if (!pct_tailslow) {
 641             throttle_inc = pct_increment;
 642         } else {
 643             /* Compute the ideal CPU percentage used by Guest, which may
 644              * make the dirty rate match the dirty rate threshold. */
 645             cpu_now = 100 - throttle_now;
 646             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 647                         bytes_dirty_period);
 648             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 649         }
 650         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 651     }
 652 }
 653
 654 /**
 655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 656  *
 657  * @rs: current RAM state
 658  * @current_addr: address for the zero page
 659  *
 660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 661  * The important thing is that a stale (not-yet-0'd) page be replaced
 662  * by the new data.
 663  * As a bonus, if the page wasn't in the cache it gets added so that
 664  * when a small write is made into the 0'd page it gets XBZRLE sent.
 665  */
 666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 667 {
 668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 669         return;
 670     }
 671
 672     /* We don't care if this fails to allocate a new cache page
 673      * as long as it updated an old one */
 674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 675                  ram_counters.dirty_sync_count);
 676 }
 677
/* Byte written right after the page header of an XBZRLE encoded page */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that the page was not sent here: either it was
 *             not in the cache, or xbzrle would be longer than normal
 *
 * On return *@current_data may point into the cache instead of guest
 * memory; the caller is expected to send the page from there.
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to diff against, the page goes out unencoded */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        /* The cache is left untouched during the last stage */
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Page unchanged since it was last sent: nothing to write */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoding overflowed: account a full page and let the caller send it */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* 1 byte encoding flag + 2 bytes length + the encoded data */
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
 780
 781 /**
 782  * migration_bitmap_find_dirty: find the next dirty page from start
 783  *
 784  * Returns the page offset within memory region of the start of a dirty page
 785  *
 786  * @rs: current RAM state
 787  * @rb: RAMBlock where to search for dirty pages
 788  * @start: page where we start the search
 789  */
 790 static inline
 791 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 792                                           unsigned long start)
 793 {
 794     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 795     unsigned long *bitmap = rb->bmap;
 796     unsigned long next;
 797
 798     if (ramblock_is_ignored(rb)) {
 799         return size;
 800     }
 801
 802     /*
 803      * When the free page optimization is enabled, we need to check the bitmap
 804      * to send the non-free pages rather than all the pages in the bulk stage.
 805      */
 806     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 807         next = start + 1;
 808     } else {
 809         next = find_next_bit(bitmap, size, start);
 810     }
 811
 812     return next;
 813 }
 814
 815 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 816                                                 RAMBlock *rb,
 817                                                 unsigned long page)
 818 {
 819     bool ret;
 820
 821     qemu_mutex_lock(&rs->bitmap_mutex);
 822
 823     /*
 824      * Clear dirty bitmap if needed.  This _must_ be called before we
 825      * send any of the page in the chunk because we need to make sure
 826      * we can capture further page content changes when we sync dirty
 827      * log the next time.  So as long as we are going to send any of
 828      * the page in the chunk we clear the remote dirty bitmap for all.
 829      * Clearing it earlier won't be a problem, but too late will.
 830      */
 831     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 832         uint8_t shift = rb->clear_bmap_shift;
 833         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 834         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 835
 836         /*
 837          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 838          * can make things easier sometimes since then start address
 839          * of the small chunk will always be 64 pages aligned so the
 840          * bitmap will always be aligned to unsigned long.  We should
 841          * even be able to remove this restriction but I'm simply
 842          * keeping it.
 843          */
 844         assert(shift >= 6);
 845         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 846         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 847     }
 848
 849     ret = test_and_clear_bit(page, rb->bmap);
 850
 851     if (ret) {
 852         rs->migration_dirty_pages--;
 853     }
 854     qemu_mutex_unlock(&rs->bitmap_mutex);
 855
 856     return ret;
 857 }
 858
/*
 * Sync the dirty bitmap of @rb over [0, used_length) and add the value
 * returned by cpu_physical_memory_sync_dirty_bitmap() to
 * rs->migration_dirty_pages; the helper is also handed
 * rs->num_dirty_pages_period to update.
 *
 * Called with RCU critical section
 */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
                                              &rs->num_dirty_pages_period);
}
 866
 867 /**
 868  * ram_pagesize_summary: calculate all the pagesizes of a VM
 869  *
 870  * Returns a summary bitmap of the page sizes of all RAMBlocks
 871  *
 872  * For VMs with just normal pages this is equivalent to the host page
 873  * size. If it's got some huge pages then it's the OR of all the
 874  * different page sizes.
 875  */
 876 uint64_t ram_pagesize_summary(void)
 877 {
 878     RAMBlock *block;
 879     uint64_t summary = 0;
 880
 881     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 882         summary |= block->page_size;
 883     }
 884
 885     return summary;
 886 }
 887
 888 uint64_t ram_get_total_transferred_pages(void)
 889 {
 890     return  ram_counters.normal + ram_counters.duplicate +
 891                 compression_counters.pages + xbzrle_counters.pages;
 892 }
 893
 894 static void migration_update_rates(RAMState *rs, int64_t end_time)
 895 {
 896     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 897     double compressed_size;
 898
 899     /* calculate period counters */
 900     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 901                 / (end_time - rs->time_last_bitmap_sync);
 902
 903     if (!page_count) {
 904         return;
 905     }
 906
 907     if (migrate_use_xbzrle()) {
 908         double encoded_size, unencoded_size;
 909
 910         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 911             rs->xbzrle_cache_miss_prev) / page_count;
 912         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 913         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 914                          TARGET_PAGE_SIZE;
 915         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 916         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 917             xbzrle_counters.encoding_rate = 0;
 918         } else {
 919             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 920         }
 921         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 922         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 923     }
 924
 925     if (migrate_use_compression()) {
 926         compression_counters.busy_rate = (double)(compression_counters.busy -
 927             rs->compress_thread_busy_prev) / page_count;
 928         rs->compress_thread_busy_prev = compression_counters.busy;
 929
 930         compressed_size = compression_counters.compressed_size -
 931                           rs->compressed_size_prev;
 932         if (compressed_size) {
 933             double uncompressed_size = (compression_counters.pages -
 934                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 935
 936             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 937             compression_counters.compression_rate =
 938                                         uncompressed_size / compressed_size;
 939
 940             rs->compress_pages_prev = compression_counters.pages;
 941             rs->compressed_size_prev = compression_counters.compressed_size;
 942         }
 943     }
 944 }
 945
 946 static void migration_trigger_throttle(RAMState *rs)
 947 {
 948     MigrationState *s = migrate_get_current();
 949     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 950
 951     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 952     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 953     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 954
 955     /* During block migration the auto-converge logic incorrectly detects
 956      * that ram migration makes no progress. Avoid this by disabling the
 957      * throttling logic during the bulk phase of block migration. */
 958     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 959         /* The following detection logic can be refined later. For now:
 960            Check to see if the ratio between dirtied bytes and the approx.
 961            amount of bytes that just got transferred since the last time
 962            we were in this routine reaches the threshold. If that happens
 963            twice, start or increase throttling. */
 964
 965         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 966             (++rs->dirty_rate_high_cnt >= 2)) {
 967             trace_migration_throttle();
 968             rs->dirty_rate_high_cnt = 0;
 969             mig_throttle_guest_down(bytes_dirty_period,
 970                                     bytes_dirty_threshold);
 971         }
 972     }
 973 }
 974
 975 static void migration_bitmap_sync(RAMState *rs)
 976 {
 977     RAMBlock *block;
 978     int64_t end_time;
 979
 980     ram_counters.dirty_sync_count++;
 981
 982     if (!rs->time_last_bitmap_sync) {
 983         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 984     }
 985
 986     trace_migration_bitmap_sync_start();
 987     memory_global_dirty_log_sync();
 988
 989     qemu_mutex_lock(&rs->bitmap_mutex);
 990     WITH_RCU_READ_LOCK_GUARD() {
 991         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 992             ramblock_sync_dirty_bitmap(rs, block);
 993         }
 994         ram_counters.remaining = ram_bytes_remaining();
 995     }
 996     qemu_mutex_unlock(&rs->bitmap_mutex);
 997
 998     memory_global_after_dirty_log_sync();
 999     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1000
1001     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1002
1003     /* more than 1 second = 1000 millisecons */
1004     if (end_time > rs->time_last_bitmap_sync + 1000) {
1005         migration_trigger_throttle(rs);
1006
1007         migration_update_rates(rs, end_time);
1008
1009         rs->target_page_count_prev = rs->target_page_count;
1010
1011         /* reset period counters */
1012         rs->time_last_bitmap_sync = end_time;
1013         rs->num_dirty_pages_period = 0;
1014         rs->bytes_xfer_prev = ram_counters.transferred;
1015     }
1016     if (migrate_use_events()) {
1017         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1018     }
1019 }
1020
1021 static void migration_bitmap_sync_precopy(RAMState *rs)
1022 {
1023     Error *local_err = NULL;
1024
1025     /*
1026      * The current notifier usage is just an optimization to migration, so we
1027      * don't stop the normal migration process in the error case.
1028      */
1029     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1030         error_report_err(local_err);
1031         local_err = NULL;
1032     }
1033
1034     migration_bitmap_sync(rs);
1035
1036     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1037         error_report_err(local_err);
1038     }
1039 }
1040
1041 /**
1042  * save_zero_page_to_file: send the zero page to the file
1043  *
1044  * Returns the size of data written to the file, 0 means the page is not
1045  * a zero page
1046  *
1047  * @rs: current RAM state
1048  * @file: the file where the data is saved
1049  * @block: block that contains the page we want to send
1050  * @offset: offset inside the block for the page
1051  */
1052 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1053                                   RAMBlock *block, ram_addr_t offset)
1054 {
1055     uint8_t *p = block->host + offset;
1056     int len = 0;
1057
1058     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1059         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1060         qemu_put_byte(file, 0);
1061         len += 1;
1062     }
1063     return len;
1064 }
1065
1066 /**
1067  * save_zero_page: send the zero page to the stream
1068  *
1069  * Returns the number of pages written.
1070  *
1071  * @rs: current RAM state
1072  * @block: block that contains the page we want to send
1073  * @offset: offset inside the block for the page
1074  */
1075 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1076 {
1077     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1078
1079     if (len) {
1080         ram_counters.duplicate++;
1081         ram_counters.transferred += len;
1082         return 1;
1083     }
1084     return -1;
1085 }
1086
1087 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1088 {
1089     if (!migrate_release_ram() || !migration_in_postcopy()) {
1090         return;
1091     }
1092
1093     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1094 }
1095
1096 /*
1097  * @pages: the number of pages written by the control path,
1098  *        < 0 - error
1099  *        > 0 - number of pages written
1100  *
1101  * Return true if the pages has been saved, otherwise false is returned.
1102  */
1103 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1104                               int *pages)
1105 {
1106     uint64_t bytes_xmit = 0;
1107     int ret;
1108
1109     *pages = -1;
1110     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1111                                 &bytes_xmit);
1112     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1113         return false;
1114     }
1115
1116     if (bytes_xmit) {
1117         ram_counters.transferred += bytes_xmit;
1118         *pages = 1;
1119     }
1120
1121     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1122         return true;
1123     }
1124
1125     if (bytes_xmit > 0) {
1126         ram_counters.normal++;
1127     } else if (bytes_xmit == 0) {
1128         ram_counters.duplicate++;
1129     }
1130
1131     return true;
1132 }
1133
1134 /*
1135  * directly send the page to the stream
1136  *
1137  * Returns the number of pages written.
1138  *
1139  * @rs: current RAM state
1140  * @block: block that contains the page we want to send
1141  * @offset: offset inside the block for the page
1142  * @buf: the page to be sent
1143  * @async: send to page asyncly
1144  */
1145 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1146                             uint8_t *buf, bool async)
1147 {
1148     ram_counters.transferred += save_page_header(rs, rs->f, block,
1149                                                  offset | RAM_SAVE_FLAG_PAGE);
1150     if (async) {
1151         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1152                               migrate_release_ram() &
1153                               migration_in_postcopy());
1154     } else {
1155         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1156     }
1157     ram_counters.transferred += TARGET_PAGE_SIZE;
1158     ram_counters.normal++;
1159     return 1;
1160 }
1161
1162 /**
1163  * ram_save_page: send the given page to the stream
1164  *
1165  * Returns the number of pages written.
1166  *          < 0 - error
1167  *          >=0 - Number of pages written - this might legally be 0
1168  *                if xbzrle noticed the page was the same.
1169  *
1170  * @rs: current RAM state
1171  * @block: block that contains the page we want to send
1172  * @offset: offset inside the block for the page
1173  * @last_stage: if we are at the completion stage
1174  */
1175 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1176 {
1177     int pages = -1;
1178     uint8_t *p;
1179     bool send_async = true;
1180     RAMBlock *block = pss->block;
1181     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1182     ram_addr_t current_addr = block->offset + offset;
1183
1184     p = block->host + offset;
1185     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1186
1187     XBZRLE_cache_lock();
1188     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1189         migrate_use_xbzrle()) {
1190         pages = save_xbzrle_page(rs, &p, current_addr, block,
1191                                  offset, last_stage);
1192         if (!last_stage) {
1193             /* Can't send this cached data async, since the cache page
1194              * might get updated before it gets to the wire
1195              */
1196             send_async = false;
1197         }
1198     }
1199
1200     /* XBZRLE overflow or normal page */
1201     if (pages == -1) {
1202         pages = save_normal_page(rs, block, offset, p, send_async);
1203     }
1204
1205     XBZRLE_cache_unlock();
1206
1207     return pages;
1208 }
1209
1210 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1211                                  ram_addr_t offset)
1212 {
1213     if (multifd_queue_page(rs->f, block, offset) < 0) {
1214         return -1;
1215     }
1216     ram_counters.normal++;
1217
1218     return 1;
1219 }
1220
1221 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1222                                  ram_addr_t offset, uint8_t *source_buf)
1223 {
1224     RAMState *rs = ram_state;
1225     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1226     bool zero_page = false;
1227     int ret;
1228
1229     if (save_zero_page_to_file(rs, f, block, offset)) {
1230         zero_page = true;
1231         goto exit;
1232     }
1233
1234     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1235
1236     /*
1237      * copy it to a internal buffer to avoid it being modified by VM
1238      * so that we can catch up the error during compression and
1239      * decompression
1240      */
1241     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1242     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1243     if (ret < 0) {
1244         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1245         error_report("compressed data failed!");
1246         return false;
1247     }
1248
1249 exit:
1250     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1251     return zero_page;
1252 }
1253
1254 static void
1255 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1256 {
1257     ram_counters.transferred += bytes_xmit;
1258
1259     if (param->zero_page) {
1260         ram_counters.duplicate++;
1261         return;
1262     }
1263
1264     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1265     compression_counters.compressed_size += bytes_xmit - 8;
1266     compression_counters.pages++;
1267 }
1268
1269 static bool save_page_use_compression(RAMState *rs);
1270
1271 static void flush_compressed_data(RAMState *rs)
1272 {
1273     int idx, len, thread_count;
1274
1275     if (!save_page_use_compression(rs)) {
1276         return;
1277     }
1278     thread_count = migrate_compress_threads();
1279
1280     qemu_mutex_lock(&comp_done_lock);
1281     for (idx = 0; idx < thread_count; idx++) {
1282         while (!comp_param[idx].done) {
1283             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1284         }
1285     }
1286     qemu_mutex_unlock(&comp_done_lock);
1287
1288     for (idx = 0; idx < thread_count; idx++) {
1289         qemu_mutex_lock(&comp_param[idx].mutex);
1290         if (!comp_param[idx].quit) {
1291             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1292             /*
1293              * it's safe to fetch zero_page without holding comp_done_lock
1294              * as there is no further request submitted to the thread,
1295              * i.e, the thread should be waiting for a request at this point.
1296              */
1297             update_compress_thread_counts(&comp_param[idx], len);
1298         }
1299         qemu_mutex_unlock(&comp_param[idx].mutex);
1300     }
1301 }
1302
1303 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1304                                        ram_addr_t offset)
1305 {
1306     param->block = block;
1307     param->offset = offset;
1308 }
1309
1310 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1311                                            ram_addr_t offset)
1312 {
1313     int idx, thread_count, bytes_xmit = -1, pages = -1;
1314     bool wait = migrate_compress_wait_thread();
1315
1316     thread_count = migrate_compress_threads();
1317     qemu_mutex_lock(&comp_done_lock);
1318 retry:
1319     for (idx = 0; idx < thread_count; idx++) {
1320         if (comp_param[idx].done) {
1321             comp_param[idx].done = false;
1322             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1323             qemu_mutex_lock(&comp_param[idx].mutex);
1324             set_compress_params(&comp_param[idx], block, offset);
1325             qemu_cond_signal(&comp_param[idx].cond);
1326             qemu_mutex_unlock(&comp_param[idx].mutex);
1327             pages = 1;
1328             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1329             break;
1330         }
1331     }
1332
1333     /*
1334      * wait for the free thread if the user specifies 'compress-wait-thread',
1335      * otherwise we will post the page out in the main thread as normal page.
1336      */
1337     if (pages < 0 && wait) {
1338         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1339         goto retry;
1340     }
1341     qemu_mutex_unlock(&comp_done_lock);
1342
1343     return pages;
1344 }
1345
1346 /**
1347  * find_dirty_block: find the next dirty page and update any state
1348  * associated with the search process.
1349  *
1350  * Returns true if a page is found
1351  *
1352  * @rs: current RAM state
1353  * @pss: data about the state of the current dirty page scan
1354  * @again: set to false if the search has scanned the whole of RAM
1355  */
1356 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1357 {
1358     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1359     if (pss->complete_round && pss->block == rs->last_seen_block &&
1360         pss->page >= rs->last_page) {
1361         /*
1362          * We've been once around the RAM and haven't found anything.
1363          * Give up.
1364          */
1365         *again = false;
1366         return false;
1367     }
1368     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1369         >= pss->block->used_length) {
1370         /* Didn't find anything in this RAM Block */
1371         pss->page = 0;
1372         pss->block = QLIST_NEXT_RCU(pss->block, next);
1373         if (!pss->block) {
1374             /*
1375              * If memory migration starts over, we will meet a dirtied page
1376              * which may still exists in compression threads's ring, so we
1377              * should flush the compressed data to make sure the new page
1378              * is not overwritten by the old one in the destination.
1379              *
1380              * Also If xbzrle is on, stop using the data compression at this
1381              * point. In theory, xbzrle can do better than compression.
1382              */
1383             flush_compressed_data(rs);
1384
1385             /* Hit the end of the list */
1386             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1387             /* Flag that we've looped */
1388             pss->complete_round = true;
1389             rs->ram_bulk_stage = false;
1390         }
1391         /* Didn't find anything this time, but try again on the new block */
1392         *again = true;
1393         return false;
1394     } else {
1395         /* Can go around again, but... */
1396         *again = true;
1397         /* We've found something so probably don't need to */
1398         return true;
1399     }
1400 }
1401
1402 /**
1403  * unqueue_page: gets a page of the queue
1404  *
1405  * Helper for 'get_queued_page' - gets a page off the queue
1406  *
1407  * Returns the block of the page (or NULL if none available)
1408  *
1409  * @rs: current RAM state
1410  * @offset: used to return the offset within the RAMBlock
1411  */
1412 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1413 {
1414     RAMBlock *block = NULL;
1415
1416     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1417         return NULL;
1418     }
1419
1420     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1421     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1422         struct RAMSrcPageRequest *entry =
1423                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1424         block = entry->rb;
1425         *offset = entry->offset;
1426
1427         if (entry->len > TARGET_PAGE_SIZE) {
1428             entry->len -= TARGET_PAGE_SIZE;
1429             entry->offset += TARGET_PAGE_SIZE;
1430         } else {
1431             memory_region_unref(block->mr);
1432             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1433             g_free(entry);
1434             migration_consume_urgent_request();
1435         }
1436     }
1437
1438     return block;
1439 }
1440
1441 /**
1442  * get_queued_page: unqueue a page from the postcopy requests
1443  *
1444  * Skips pages that are already sent (!dirty)
1445  *
1446  * Returns true if a queued page is found
1447  *
1448  * @rs: current RAM state
1449  * @pss: data about the state of the current dirty page scan
1450  */
1451 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1452 {
1453     RAMBlock  *block;
1454     ram_addr_t offset;
1455     bool dirty;
1456
1457     do {
1458         block = unqueue_page(rs, &offset);
1459         /*
1460          * We're sending this page, and since it's postcopy nothing else
1461          * will dirty it, and we must make sure it doesn't get sent again
1462          * even if this queue request was received after the background
1463          * search already sent it.
1464          */
1465         if (block) {
1466             unsigned long page;
1467
1468             page = offset >> TARGET_PAGE_BITS;
1469             dirty = test_bit(page, block->bmap);
1470             if (!dirty) {
1471                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1472                                                 page);
1473             } else {
1474                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1475             }
1476         }
1477
1478     } while (block && !dirty);
1479
1480     if (block) {
1481         /*
1482          * As soon as we start servicing pages out of order, then we have
1483          * to kill the bulk stage, since the bulk stage assumes
1484          * in (migration_bitmap_find_and_reset_dirty) that every page is
1485          * dirty, that's no longer true.
1486          */
1487         rs->ram_bulk_stage = false;
1488
1489         /*
1490          * We want the background search to continue from the queued page
1491          * since the guest is likely to want other pages near to the page
1492          * it just requested.
1493          */
1494         pss->block = block;
1495         pss->page = offset >> TARGET_PAGE_BITS;
1496
1497         /*
1498          * This unqueued page would break the "one round" check, even is
1499          * really rare.
1500          */
1501         pss->complete_round = false;
1502     }
1503
1504     return !!block;
1505 }
1506
1507 /**
1508  * migration_page_queue_free: drop any remaining pages in the ram
1509  * request queue
1510  *
1511  * It should be empty at the end anyway, but in error cases there may
1512  * be some left.  in case that there is any page left, we drop it.
1513  *
1514  */
1515 static void migration_page_queue_free(RAMState *rs)
1516 {
1517     struct RAMSrcPageRequest *mspr, *next_mspr;
1518     /* This queue generally should be empty - but in the case of a failed
1519      * migration might have some droppings in.
1520      */
1521     RCU_READ_LOCK_GUARD();
1522     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1523         memory_region_unref(mspr->rb->mr);
1524         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1525         g_free(mspr);
1526     }
1527 }
1528
1529 /**
1530  * ram_save_queue_pages: queue the page for transmission
1531  *
1532  * A request from postcopy destination for example.
1533  *
1534  * Returns zero on success or negative on error
1535  *
1536  * @rbname: Name of the RAMBLock of the request. NULL means the
1537  *          same that last one.
1538  * @start: starting address from the start of the RAMBlock
1539  * @len: length (in bytes) to send
1540  */
1541 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1542 {
1543     RAMBlock *ramblock;
1544     RAMState *rs = ram_state;
1545
1546     ram_counters.postcopy_requests++;
1547     RCU_READ_LOCK_GUARD();
1548
1549     if (!rbname) {
1550         /* Reuse last RAMBlock */
1551         ramblock = rs->last_req_rb;
1552
1553         if (!ramblock) {
1554             /*
1555              * Shouldn't happen, we can't reuse the last RAMBlock if
1556              * it's the 1st request.
1557              */
1558             error_report("ram_save_queue_pages no previous block");
1559             return -1;
1560         }
1561     } else {
1562         ramblock = qemu_ram_block_by_name(rbname);
1563
1564         if (!ramblock) {
1565             /* We shouldn't be asked for a non-existent RAMBlock */
1566             error_report("ram_save_queue_pages no block '%s'", rbname);
1567             return -1;
1568         }
1569         rs->last_req_rb = ramblock;
1570     }
1571     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1572     if (start+len > ramblock->used_length) {
1573         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1574                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1575                      __func__, start, len, ramblock->used_length);
1576         return -1;
1577     }
1578
1579     struct RAMSrcPageRequest *new_entry =
1580         g_malloc0(sizeof(struct RAMSrcPageRequest));
1581     new_entry->rb = ramblock;
1582     new_entry->offset = start;
1583     new_entry->len = len;
1584
1585     memory_region_ref(ramblock->mr);
1586     qemu_mutex_lock(&rs->src_page_req_mutex);
1587     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1588     migration_make_urgent_request();
1589     qemu_mutex_unlock(&rs->src_page_req_mutex);
1590
1591     return 0;
1592 }
1593
1594 static bool save_page_use_compression(RAMState *rs)
1595 {
1596     if (!migrate_use_compression()) {
1597         return false;
1598     }
1599
1600     /*
1601      * If xbzrle is on, stop using the data compression after first
1602      * round of migration even if compression is enabled. In theory,
1603      * xbzrle can do better than compression.
1604      */
1605     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1606         return true;
1607     }
1608
1609     return false;
1610 }
1611
1612 /*
1613  * try to compress the page before posting it out, return true if the page
1614  * has been properly handled by compression, otherwise needs other
1615  * paths to handle it
1616  */
1617 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1618 {
1619     if (!save_page_use_compression(rs)) {
1620         return false;
1621     }
1622
1623     /*
1624      * When starting the process of a new block, the first page of
1625      * the block should be sent out before other pages in the same
1626      * block, and all the pages in last block should have been sent
1627      * out, keeping this order is important, because the 'cont' flag
1628      * is used to avoid resending the block name.
1629      *
1630      * We post the fist page as normal page as compression will take
1631      * much CPU resource.
1632      */
1633     if (block != rs->last_sent_block) {
1634         flush_compressed_data(rs);
1635         return false;
1636     }
1637
1638     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1639         return true;
1640     }
1641
1642     compression_counters.busy++;
1643     return false;
1644 }
1645
1646 /**
1647  * ram_save_target_page: save one target page
1648  *
1649  * Returns the number of pages written
1650  *
1651  * @rs: current RAM state
1652  * @pss: data about the page we want to send
1653  * @last_stage: if we are at the completion stage
1654  */
1655 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1656                                 bool last_stage)
1657 {
1658     RAMBlock *block = pss->block;
1659     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1660     int res;
1661
1662     if (control_save_page(rs, block, offset, &res)) {
1663         return res;
1664     }
1665
1666     if (save_compress_page(rs, block, offset)) {
1667         return 1;
1668     }
1669
1670     res = save_zero_page(rs, block, offset);
1671     if (res > 0) {
1672         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1673          * page would be stale
1674          */
1675         if (!save_page_use_compression(rs)) {
1676             XBZRLE_cache_lock();
1677             xbzrle_cache_zero_page(rs, block->offset + offset);
1678             XBZRLE_cache_unlock();
1679         }
1680         ram_release_pages(block->idstr, offset, res);
1681         return res;
1682     }
1683
1684     /*
1685      * Do not use multifd for:
1686      * 1. Compression as the first page in the new block should be posted out
1687      *    before sending the compressed page
1688      * 2. In postcopy as one whole host page should be placed
1689      */
1690     if (!save_page_use_compression(rs) && migrate_use_multifd()
1691         && !migration_in_postcopy()) {
1692         return ram_save_multifd_page(rs, block, offset);
1693     }
1694
1695     return ram_save_page(rs, pss, last_stage);
1696 }
1697
1698 /**
1699  * ram_save_host_page: save a whole host page
1700  *
1701  * Starting at *offset send pages up to the end of the current host
1702  * page. It's valid for the initial offset to point into the middle of
1703  * a host page in which case the remainder of the hostpage is sent.
1704  * Only dirty target pages are sent. Note that the host page size may
1705  * be a huge page for this block.
1706  * The saving stops at the boundary of the used_length of the block
1707  * if the RAMBlock isn't a multiple of the host page size.
1708  *
1709  * Returns the number of pages written or negative on error
1710  *
1711  * @rs: current RAM state
1712  * @ms: current migration state
1713  * @pss: data about the page we want to send
1714  * @last_stage: if we are at the completion stage
1715  */
1716 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1717                               bool last_stage)
1718 {
1719     int tmppages, pages = 0;
1720     size_t pagesize_bits =
1721         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1722
1723     if (ramblock_is_ignored(pss->block)) {
1724         error_report("block %s should not be migrated !", pss->block->idstr);
1725         return 0;
1726     }
1727
1728     do {
1729         /* Check the pages is dirty and if it is send it */
1730         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1731             pss->page++;
1732             continue;
1733         }
1734
1735         tmppages = ram_save_target_page(rs, pss, last_stage);
1736         if (tmppages < 0) {
1737             return tmppages;
1738         }
1739
1740         pages += tmppages;
1741         pss->page++;
1742         /* Allow rate limiting to happen in the middle of huge pages */
1743         migration_rate_limit();
1744     } while ((pss->page & (pagesize_bits - 1)) &&
1745              offset_in_ramblock(pss->block,
1746                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1747
1748     /* The offset we leave with is the last one we looked at */
1749     pss->page--;
1750     return pages;
1751 }
1752
1753 /**
1754  * ram_find_and_save_block: finds a dirty page and sends it to f
1755  *
1756  * Called within an RCU critical section.
1757  *
1758  * Returns the number of pages written where zero means no dirty pages,
1759  * or negative on error
1760  *
1761  * @rs: current RAM state
1762  * @last_stage: if we are at the completion stage
1763  *
1764  * On systems where host-page-size > target-page-size it will send all the
1765  * pages in a host page that are dirty.
1766  */
1767
1768 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1769 {
1770     PageSearchStatus pss;
1771     int pages = 0;
1772     bool again, found;
1773
1774     /* No dirty page as there is zero RAM */
1775     if (!ram_bytes_total()) {
1776         return pages;
1777     }
1778
1779     pss.block = rs->last_seen_block;
1780     pss.page = rs->last_page;
1781     pss.complete_round = false;
1782
1783     if (!pss.block) {
1784         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1785     }
1786
1787     do {
1788         again = true;
1789         found = get_queued_page(rs, &pss);
1790
1791         if (!found) {
1792             /* priority queue empty, so just search for something dirty */
1793             found = find_dirty_block(rs, &pss, &again);
1794         }
1795
1796         if (found) {
1797             pages = ram_save_host_page(rs, &pss, last_stage);
1798         }
1799     } while (!pages && again);
1800
1801     rs->last_seen_block = pss.block;
1802     rs->last_page = pss.page;
1803
1804     return pages;
1805 }
1806
1807 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1808 {
1809     uint64_t pages = size / TARGET_PAGE_SIZE;
1810
1811     if (zero) {
1812         ram_counters.duplicate += pages;
1813     } else {
1814         ram_counters.normal += pages;
1815         ram_counters.transferred += size;
1816         qemu_update_position(f, size);
1817     }
1818 }
1819
1820 static uint64_t ram_bytes_total_common(bool count_ignored)
1821 {
1822     RAMBlock *block;
1823     uint64_t total = 0;
1824
1825     RCU_READ_LOCK_GUARD();
1826
1827     if (count_ignored) {
1828         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1829             total += block->used_length;
1830         }
1831     } else {
1832         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1833             total += block->used_length;
1834         }
1835     }
1836     return total;
1837 }
1838
/*
 * ram_bytes_total: total used_length of the RAM blocks that are not
 * ignored (ram_bytes_total_common() with count_ignored == false).
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
1843
1844 static void xbzrle_load_setup(void)
1845 {
1846     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1847 }
1848
1849 static void xbzrle_load_cleanup(void)
1850 {
1851     g_free(XBZRLE.decoded_buf);
1852     XBZRLE.decoded_buf = NULL;
1853 }
1854
1855 static void ram_state_cleanup(RAMState **rsp)
1856 {
1857     if (*rsp) {
1858         migration_page_queue_free(*rsp);
1859         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1860         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1861         g_free(*rsp);
1862         *rsp = NULL;
1863     }
1864 }
1865
1866 static void xbzrle_cleanup(void)
1867 {
1868     XBZRLE_cache_lock();
1869     if (XBZRLE.cache) {
1870         cache_fini(XBZRLE.cache);
1871         g_free(XBZRLE.encoded_buf);
1872         g_free(XBZRLE.current_buf);
1873         g_free(XBZRLE.zero_target_page);
1874         XBZRLE.cache = NULL;
1875         XBZRLE.encoded_buf = NULL;
1876         XBZRLE.current_buf = NULL;
1877         XBZRLE.zero_target_page = NULL;
1878     }
1879     XBZRLE_cache_unlock();
1880 }
1881
1882 static void ram_save_cleanup(void *opaque)
1883 {
1884     RAMState **rsp = opaque;
1885     RAMBlock *block;
1886
1887     /* caller have hold iothread lock or is in a bh, so there is
1888      * no writing race against the migration bitmap
1889      */
1890     memory_global_dirty_log_stop();
1891
1892     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1893         g_free(block->clear_bmap);
1894         block->clear_bmap = NULL;
1895         g_free(block->bmap);
1896         block->bmap = NULL;
1897     }
1898
1899     xbzrle_cleanup();
1900     compress_threads_save_cleanup();
1901     ram_state_cleanup(rsp);
1902 }
1903
1904 static void ram_state_reset(RAMState *rs)
1905 {
1906     rs->last_seen_block = NULL;
1907     rs->last_sent_block = NULL;
1908     rs->last_page = 0;
1909     rs->last_version = ram_list.version;
1910     rs->ram_bulk_stage = true;
1911     rs->fpo_enabled = false;
1912 }
1913
/*
 * Time budget, in milliseconds, for the page-sending loop of
 * ram_save_iterate(): the loop compares the elapsed time against this
 * value and stops once it is exceeded.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
1915
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line.
 *
 * @todump: bitmap to print; a NULL pointer is tolerated and prints nothing
 * @expected: the value the bitmap is mostly expected to be full of; lines
 *            consisting only of this value are not printed
 * @pages: number of bits in @todump
 *
 * Each printed line is prefixed with the index of its first bit (in hex);
 * set bits are shown as '1' and clear bits as '.'.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { LINE_BITS = 128 };
    char linebuf[LINE_BITS + 1];
    unsigned long cur;

    /* There is no implicit fallback bitmap; refuse rather than crash */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += LINE_BITS) {
        /* The last line may be shorter than a full line */
        unsigned long linelen = pages - cur;
        unsigned long curb;
        bool found = false;

        if (linelen > LINE_BITS) {
            linelen = LINE_BITS;
        }

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }

        /* Only lines containing at least one unexpected bit are shown */
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1949
1950 /* **** functions for postcopy ***** */
1951
1952 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1953 {
1954     struct RAMBlock *block;
1955
1956     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1957         unsigned long *bitmap = block->bmap;
1958         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1959         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1960
1961         while (run_start < range) {
1962             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1963             ram_discard_range(block->idstr,
1964                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1965                               ((ram_addr_t)(run_end - run_start))
1966                                 << TARGET_PAGE_BITS);
1967             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1968         }
1969     }
1970 }
1971
1972 /**
1973  * postcopy_send_discard_bm_ram: discard a RAMBlock
1974  *
1975  * Returns zero on success
1976  *
1977  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1978  *
1979  * @ms: current migration state
1980  * @block: RAMBlock to discard
1981  */
1982 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1983 {
1984     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1985     unsigned long current;
1986     unsigned long *bitmap = block->bmap;
1987
1988     for (current = 0; current < end; ) {
1989         unsigned long one = find_next_bit(bitmap, end, current);
1990         unsigned long zero, discard_length;
1991
1992         if (one >= end) {
1993             break;
1994         }
1995
1996         zero = find_next_zero_bit(bitmap, end, one + 1);
1997
1998         if (zero >= end) {
1999             discard_length = end - one;
2000         } else {
2001             discard_length = zero - one;
2002         }
2003         postcopy_discard_send_range(ms, one, discard_length);
2004         current = one + discard_length;
2005     }
2006
2007     return 0;
2008 }
2009
2010 /**
2011  * postcopy_each_ram_send_discard: discard all RAMBlocks
2012  *
2013  * Returns 0 for success or negative for error
2014  *
2015  * Utility for the outgoing postcopy code.
2016  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2017  *   passing it bitmap indexes and name.
2018  * (qemu_ram_foreach_block ends up passing unscaled lengths
2019  *  which would mean postcopy code would have to deal with target page)
2020  *
2021  * @ms: current migration state
2022  */
2023 static int postcopy_each_ram_send_discard(MigrationState *ms)
2024 {
2025     struct RAMBlock *block;
2026     int ret;
2027
2028     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2029         postcopy_discard_send_init(ms, block->idstr);
2030
2031         /*
2032          * Postcopy sends chunks of bitmap over the wire, but it
2033          * just needs indexes at this point, avoids it having
2034          * target page specific code.
2035          */
2036         ret = postcopy_send_discard_bm_ram(ms, block);
2037         postcopy_discard_send_finish(ms);
2038         if (ret) {
2039             return ret;
2040         }
2041     }
2042
2043     return 0;
2044 }
2045
2046 /**
2047  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2048  *
2049  * Helper for postcopy_chunk_hostpages; it's called twice to
2050  * canonicalize the two bitmaps, that are similar, but one is
2051  * inverted.
2052  *
2053  * Postcopy requires that all target pages in a hostpage are dirty or
2054  * clean, not a mix.  This function canonicalizes the bitmaps.
2055  *
2056  * @ms: current migration state
2057  * @block: block that contains the page we want to canonicalize
2058  */
2059 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2060 {
2061     RAMState *rs = ram_state;
2062     unsigned long *bitmap = block->bmap;
2063     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2064     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2065     unsigned long run_start;
2066
2067     if (block->page_size == TARGET_PAGE_SIZE) {
2068         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2069         return;
2070     }
2071
2072     /* Find a dirty page */
2073     run_start = find_next_bit(bitmap, pages, 0);
2074
2075     while (run_start < pages) {
2076
2077         /*
2078          * If the start of this run of pages is in the middle of a host
2079          * page, then we need to fixup this host page.
2080          */
2081         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2082             /* Find the end of this run */
2083             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2084             /*
2085              * If the end isn't at the start of a host page, then the
2086              * run doesn't finish at the end of a host page
2087              * and we need to discard.
2088              */
2089         }
2090
2091         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2092             unsigned long page;
2093             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2094                                                              host_ratio);
2095             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2096
2097             /* Clean up the bitmap */
2098             for (page = fixup_start_addr;
2099                  page < fixup_start_addr + host_ratio; page++) {
2100                 /*
2101                  * Remark them as dirty, updating the count for any pages
2102                  * that weren't previously dirty.
2103                  */
2104                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2105             }
2106         }
2107
2108         /* Find the next dirty page for the next iteration */
2109         run_start = find_next_bit(bitmap, pages, run_start);
2110     }
2111 }
2112
2113 /**
2114  * postcopy_chunk_hostpages: discard any partially sent host page
2115  *
2116  * Utility for the outgoing postcopy code.
2117  *
2118  * Discard any partially sent host-page size chunks, mark any partially
2119  * dirty host-page size chunks as all dirty.  In this case the host-page
2120  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2121  *
2122  * Returns zero on success
2123  *
2124  * @ms: current migration state
2125  * @block: block we want to work with
2126  */
2127 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2128 {
2129     postcopy_discard_send_init(ms, block->idstr);
2130
2131     /*
2132      * Ensure that all partially dirty host pages are made fully dirty.
2133      */
2134     postcopy_chunk_hostpages_pass(ms, block);
2135
2136     postcopy_discard_send_finish(ms);
2137     return 0;
2138 }
2139
2140 /**
2141  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2142  *
2143  * Returns zero on success
2144  *
2145  * Transmit the set of pages to be discarded after precopy to the target
2146  * these are pages that:
2147  *     a) Have been previously transmitted but are now dirty again
2148  *     b) Pages that have never been transmitted, this ensures that
2149  *        any pages on the destination that have been mapped by background
2150  *        tasks get discarded (transparent huge pages is the specific concern)
2151  * Hopefully this is pretty sparse
2152  *
2153  * @ms: current migration state
2154  */
2155 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2156 {
2157     RAMState *rs = ram_state;
2158     RAMBlock *block;
2159     int ret;
2160
2161     RCU_READ_LOCK_GUARD();
2162
2163     /* This should be our last sync, the src is now paused */
2164     migration_bitmap_sync(rs);
2165
2166     /* Easiest way to make sure we don't resume in the middle of a host-page */
2167     rs->last_seen_block = NULL;
2168     rs->last_sent_block = NULL;
2169     rs->last_page = 0;
2170
2171     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2172         /* Deal with TPS != HPS and huge pages */
2173         ret = postcopy_chunk_hostpages(ms, block);
2174         if (ret) {
2175             return ret;
2176         }
2177
2178 #ifdef DEBUG_POSTCOPY
2179         ram_debug_dump_bitmap(block->bmap, true,
2180                               block->used_length >> TARGET_PAGE_BITS);
2181 #endif
2182     }
2183     trace_ram_postcopy_send_discard_bitmap();
2184
2185     return postcopy_each_ram_send_discard(ms);
2186 }
2187
2188 /**
2189  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2190  *
2191  * Returns zero on success
2192  *
2193  * @rbname: name of the RAMBlock of the request. NULL means the
2194  *          same that last one.
2195  * @start: RAMBlock starting page
2196  * @length: RAMBlock size
2197  */
2198 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2199 {
2200     trace_ram_discard_range(rbname, start, length);
2201
2202     RCU_READ_LOCK_GUARD();
2203     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2204
2205     if (!rb) {
2206         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2207         return -1;
2208     }
2209
2210     /*
2211      * On source VM, we don't need to update the received bitmap since
2212      * we don't even have one.
2213      */
2214     if (rb->receivedmap) {
2215         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2216                      length >> qemu_target_page_bits());
2217     }
2218
2219     return ram_block_discard_range(rb, start, length);
2220 }
2221
2222 /*
2223  * For every allocation, we will try not to crash the VM if the
2224  * allocation failed.
2225  */
2226 static int xbzrle_init(void)
2227 {
2228     Error *local_err = NULL;
2229
2230     if (!migrate_use_xbzrle()) {
2231         return 0;
2232     }
2233
2234     XBZRLE_cache_lock();
2235
2236     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2237     if (!XBZRLE.zero_target_page) {
2238         error_report("%s: Error allocating zero page", __func__);
2239         goto err_out;
2240     }
2241
2242     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2243                               TARGET_PAGE_SIZE, &local_err);
2244     if (!XBZRLE.cache) {
2245         error_report_err(local_err);
2246         goto free_zero_page;
2247     }
2248
2249     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2250     if (!XBZRLE.encoded_buf) {
2251         error_report("%s: Error allocating encoded_buf", __func__);
2252         goto free_cache;
2253     }
2254
2255     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2256     if (!XBZRLE.current_buf) {
2257         error_report("%s: Error allocating current_buf", __func__);
2258         goto free_encoded_buf;
2259     }
2260
2261     /* We are all good */
2262     XBZRLE_cache_unlock();
2263     return 0;
2264
2265 free_encoded_buf:
2266     g_free(XBZRLE.encoded_buf);
2267     XBZRLE.encoded_buf = NULL;
2268 free_cache:
2269     cache_fini(XBZRLE.cache);
2270     XBZRLE.cache = NULL;
2271 free_zero_page:
2272     g_free(XBZRLE.zero_target_page);
2273     XBZRLE.zero_target_page = NULL;
2274 err_out:
2275     XBZRLE_cache_unlock();
2276     return -ENOMEM;
2277 }
2278
2279 static int ram_state_init(RAMState **rsp)
2280 {
2281     *rsp = g_try_new0(RAMState, 1);
2282
2283     if (!*rsp) {
2284         error_report("%s: Init ramstate fail", __func__);
2285         return -1;
2286     }
2287
2288     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2289     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2290     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2291
2292     /*
2293      * Count the total number of pages used by ram blocks not including any
2294      * gaps due to alignment or unplugs.
2295      * This must match with the initial values of dirty bitmap.
2296      */
2297     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2298     ram_state_reset(*rsp);
2299
2300     return 0;
2301 }
2302
2303 static void ram_list_init_bitmaps(void)
2304 {
2305     MigrationState *ms = migrate_get_current();
2306     RAMBlock *block;
2307     unsigned long pages;
2308     uint8_t shift;
2309
2310     /* Skip setting bitmap if there is no RAM */
2311     if (ram_bytes_total()) {
2312         shift = ms->clear_bitmap_shift;
2313         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2314             error_report("clear_bitmap_shift (%u) too big, using "
2315                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2316             shift = CLEAR_BITMAP_SHIFT_MAX;
2317         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2318             error_report("clear_bitmap_shift (%u) too small, using "
2319                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2320             shift = CLEAR_BITMAP_SHIFT_MIN;
2321         }
2322
2323         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2324             pages = block->max_length >> TARGET_PAGE_BITS;
2325             /*
2326              * The initial dirty bitmap for migration must be set with all
2327              * ones to make sure we'll migrate every guest RAM page to
2328              * destination.
2329              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2330              * new migration after a failed migration, ram_list.
2331              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2332              * guest memory.
2333              */
2334             block->bmap = bitmap_new(pages);
2335             bitmap_set(block->bmap, 0, pages);
2336             block->clear_bmap_shift = shift;
2337             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2338         }
2339     }
2340 }
2341
/*
 * ram_init_bitmaps: create the migration bitmaps and start dirty logging.
 *
 * @rs: RAM state passed on to the initial bitmap sync
 *
 * Takes the iothread lock, then the ramlist lock, and inside an RCU read
 * section allocates the per-block bitmaps, starts global dirty logging
 * and performs a first precopy bitmap sync.  The locks are dropped in
 * the reverse order they were taken.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2356
2357 static int ram_init_all(RAMState **rsp)
2358 {
2359     if (ram_state_init(rsp)) {
2360         return -1;
2361     }
2362
2363     if (xbzrle_init()) {
2364         ram_state_cleanup(rsp);
2365         return -1;
2366     }
2367
2368     ram_init_bitmaps(*rsp);
2369
2370     return 0;
2371 }
2372
2373 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2374 {
2375     RAMBlock *block;
2376     uint64_t pages = 0;
2377
2378     /*
2379      * Postcopy is not using xbzrle/compression, so no need for that.
2380      * Also, since source are already halted, we don't need to care
2381      * about dirty page logging as well.
2382      */
2383
2384     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2385         pages += bitmap_count_one(block->bmap,
2386                                   block->used_length >> TARGET_PAGE_BITS);
2387     }
2388
2389     /* This may not be aligned with current bitmaps. Recalculate. */
2390     rs->migration_dirty_pages = pages;
2391
2392     rs->last_seen_block = NULL;
2393     rs->last_sent_block = NULL;
2394     rs->last_page = 0;
2395     rs->last_version = ram_list.version;
2396     /*
2397      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2398      * matter what we have sent.
2399      */
2400     rs->ram_bulk_stage = false;
2401
2402     /* Update RAMState cache of output QEMUFile */
2403     rs->f = out;
2404
2405     trace_ram_state_resume_prepare(pages);
2406 }
2407
2408 /*
2409  * This function clears bits of the free pages reported by the caller from the
2410  * migration dirty bitmap. @addr is the host address corresponding to the
2411  * start of the continuous guest free pages, and @len is the total bytes of
2412  * those pages.
2413  */
2414 void qemu_guest_free_page_hint(void *addr, size_t len)
2415 {
2416     RAMBlock *block;
2417     ram_addr_t offset;
2418     size_t used_len, start, npages;
2419     MigrationState *s = migrate_get_current();
2420
2421     /* This function is currently expected to be used during live migration */
2422     if (!migration_is_setup_or_active(s->state)) {
2423         return;
2424     }
2425
2426     for (; len > 0; len -= used_len, addr += used_len) {
2427         block = qemu_ram_block_from_host(addr, false, &offset);
2428         if (unlikely(!block || offset >= block->used_length)) {
2429             /*
2430              * The implementation might not support RAMBlock resize during
2431              * live migration, but it could happen in theory with future
2432              * updates. So we add a check here to capture that case.
2433              */
2434             error_report_once("%s unexpected error", __func__);
2435             return;
2436         }
2437
2438         if (len <= block->used_length - offset) {
2439             used_len = len;
2440         } else {
2441             used_len = block->used_length - offset;
2442         }
2443
2444         start = offset >> TARGET_PAGE_BITS;
2445         npages = used_len >> TARGET_PAGE_BITS;
2446
2447         qemu_mutex_lock(&ram_state->bitmap_mutex);
2448         ram_state->migration_dirty_pages -=
2449                       bitmap_count_one_with_offset(block->bmap, start, npages);
2450         bitmap_clear(block->bmap, start, npages);
2451         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2452     }
2453 }
2454
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
2460  */
2461
2462 /**
2463  * ram_save_setup: Setup RAM for migration
2464  *
2465  * Returns zero to indicate success and negative for error
2466  *
2467  * @f: QEMUFile where to send the data
2468  * @opaque: RAMState pointer
2469  */
2470 static int ram_save_setup(QEMUFile *f, void *opaque)
2471 {
2472     RAMState **rsp = opaque;
2473     RAMBlock *block;
2474
2475     if (compress_threads_save_setup()) {
2476         return -1;
2477     }
2478
2479     /* migration has already setup the bitmap, reuse it. */
2480     if (!migration_in_colo_state()) {
2481         if (ram_init_all(rsp) != 0) {
2482             compress_threads_save_cleanup();
2483             return -1;
2484         }
2485     }
2486     (*rsp)->f = f;
2487
2488     WITH_RCU_READ_LOCK_GUARD() {
2489         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2490
2491         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2492             qemu_put_byte(f, strlen(block->idstr));
2493             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2494             qemu_put_be64(f, block->used_length);
2495             if (migrate_postcopy_ram() && block->page_size !=
2496                                           qemu_host_page_size) {
2497                 qemu_put_be64(f, block->page_size);
2498             }
2499             if (migrate_ignore_shared()) {
2500                 qemu_put_be64(f, block->mr->addr);
2501             }
2502         }
2503     }
2504
2505     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2506     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2507
2508     multifd_send_sync_main(f);
2509     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2510     qemu_fflush(f);
2511
2512     return 0;
2513 }
2514
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the rate limit is hit (and no page requests
 * are queued), the file reports an error, no dirty page is left, or the
 * MAX_WAIT time budget is exceeded.
 *
 * Returns negative on error.  Otherwise returns 1 if
 * ram_find_and_save_block() reported that no pages were left to send
 * and 0 if the loop stopped for any other reason.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        /* The block list changed since last time: restart the search */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep going while the rate limit allows it; queued page
         * requests are served even when the rate limit has been reached.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            /* Record the failure on the file; it is picked up at 'out' */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check
             * once every 64 iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time in milliseconds */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    /*
     * Terminate this round with an EOS marker (8 bytes, accounted for
     * below) and pick up any error recorded on the file.
     */
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
2621
2622 /**
2623  * ram_save_complete: function called to send the remaining amount of ram
2624  *
2625  * Returns zero to indicate success or negative on error
2626  *
2627  * Called with iothread lock
2628  *
2629  * @f: QEMUFile where to send the data
2630  * @opaque: RAMState pointer
2631  */
2632 static int ram_save_complete(QEMUFile *f, void *opaque)
2633 {
2634     RAMState **temp = opaque;
2635     RAMState *rs = *temp;
2636     int ret = 0;
2637
2638     WITH_RCU_READ_LOCK_GUARD() {
2639         if (!migration_in_postcopy()) {
2640             migration_bitmap_sync_precopy(rs);
2641         }
2642
2643         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2644
2645         /* try transferring iterative blocks of memory */
2646
2647         /* flush all remaining blocks regardless of rate limiting */
2648         while (true) {
2649             int pages;
2650
2651             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2652             /* no more blocks to sent */
2653             if (pages == 0) {
2654                 break;
2655             }
2656             if (pages < 0) {
2657                 ret = pages;
2658                 break;
2659             }
2660         }
2661
2662         flush_compressed_data(rs);
2663         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2664     }
2665
2666     if (ret >= 0) {
2667         multifd_send_sync_main(rs->f);
2668         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2669         qemu_fflush(f);
2670     }
2671
2672     return ret;
2673 }
2674
2675 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2676                              uint64_t *res_precopy_only,
2677                              uint64_t *res_compatible,
2678                              uint64_t *res_postcopy_only)
2679 {
2680     RAMState **temp = opaque;
2681     RAMState *rs = *temp;
2682     uint64_t remaining_size;
2683
2684     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2685
2686     if (!migration_in_postcopy() &&
2687         remaining_size < max_size) {
2688         qemu_mutex_lock_iothread();
2689         WITH_RCU_READ_LOCK_GUARD() {
2690             migration_bitmap_sync_precopy(rs);
2691         }
2692         qemu_mutex_unlock_iothread();
2693         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2694     }
2695
2696     if (migrate_postcopy_ram()) {
2697         /* We can do postcopy, and all the data is postcopiable */
2698         *res_compatible += remaining_size;
2699     } else {
2700         *res_precopy_only += remaining_size;
2701     }
2702 }
2703
2704 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2705 {
2706     unsigned int xh_len;
2707     int xh_flags;
2708     uint8_t *loaded_data;
2709
2710     /* extract RLE header */
2711     xh_flags = qemu_get_byte(f);
2712     xh_len = qemu_get_be16(f);
2713
2714     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2715         error_report("Failed to load XBZRLE page - wrong compression!");
2716         return -1;
2717     }
2718
2719     if (xh_len > TARGET_PAGE_SIZE) {
2720         error_report("Failed to load XBZRLE page - len overflow!");
2721         return -1;
2722     }
2723     loaded_data = XBZRLE.decoded_buf;
2724     /* load data and decode */
2725     /* it can change loaded_data to point to an internal buffer */
2726     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2727
2728     /* decode RLE */
2729     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2730                              TARGET_PAGE_SIZE) == -1) {
2731         error_report("Failed to load XBZRLE page - decode error!");
2732         return -1;
2733     }
2734
2735     return 0;
2736 }
2737
2738 /**
2739  * ram_block_from_stream: read a RAMBlock id from the migration stream
2740  *
2741  * Must be called from within a rcu critical section.
2742  *
2743  * Returns a pointer from within the RCU-protected ram_list.
2744  *
2745  * @f: QEMUFile where to read the data from
2746  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2747  */
2748 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2749 {
2750     static RAMBlock *block = NULL;
2751     char id[256];
2752     uint8_t len;
2753
2754     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2755         if (!block) {
2756             error_report("Ack, bad migration stream!");
2757             return NULL;
2758         }
2759         return block;
2760     }
2761
2762     len = qemu_get_byte(f);
2763     qemu_get_buffer(f, (uint8_t *)id, len);
2764     id[len] = 0;
2765
2766     block = qemu_ram_block_by_name(id);
2767     if (!block) {
2768         error_report("Can't find block %s", id);
2769         return NULL;
2770     }
2771
2772     if (ramblock_is_ignored(block)) {
2773         error_report("block %s should not be migrated !", id);
2774         return NULL;
2775     }
2776
2777     return block;
2778 }
2779
2780 static inline void *host_from_ram_block_offset(RAMBlock *block,
2781                                                ram_addr_t offset)
2782 {
2783     if (!offset_in_ramblock(block, offset)) {
2784         return NULL;
2785     }
2786
2787     return block->host + offset;
2788 }
2789
2790 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2791                              ram_addr_t offset, bool record_bitmap)
2792 {
2793     if (!offset_in_ramblock(block, offset)) {
2794         return NULL;
2795     }
2796     if (!block->colo_cache) {
2797         error_report("%s: colo_cache is NULL in block :%s",
2798                      __func__, block->idstr);
2799         return NULL;
2800     }
2801
2802     /*
2803     * During colo checkpoint, we need bitmap of these migrated pages.
2804     * It help us to decide which pages in ram cache should be flushed
2805     * into VM's RAM later.
2806     */
2807     if (record_bitmap &&
2808         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2809         ram_state->migration_dirty_pages++;
2810     }
2811     return block->colo_cache + offset;
2812 }
2813
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when it is to be filled with zero and
 * already reads back as all zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Nothing to do, the range already has the requested content. */
        return;
    }

    memset(host, ch, size);
}
2830
2831 /* return the size after decompression, or negative value on error */
2832 static int
2833 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2834                      const uint8_t *source, size_t source_len)
2835 {
2836     int err;
2837
2838     err = inflateReset(stream);
2839     if (err != Z_OK) {
2840         return -1;
2841     }
2842
2843     stream->avail_in = source_len;
2844     stream->next_in = (uint8_t *)source;
2845     stream->avail_out = dest_len;
2846     stream->next_out = dest;
2847
2848     err = inflate(stream, Z_NO_FLUSH);
2849     if (err != Z_STREAM_END) {
2850         return -1;
2851     }
2852
2853     return stream->total_out;
2854 }
2855
2856 static void *do_data_decompress(void *opaque)
2857 {
2858     DecompressParam *param = opaque;
2859     unsigned long pagesize;
2860     uint8_t *des;
2861     int len, ret;
2862
2863     qemu_mutex_lock(&param->mutex);
2864     while (!param->quit) {
2865         if (param->des) {
2866             des = param->des;
2867             len = param->len;
2868             param->des = 0;
2869             qemu_mutex_unlock(&param->mutex);
2870
2871             pagesize = TARGET_PAGE_SIZE;
2872
2873             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2874                                        param->compbuf, len);
2875             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2876                 error_report("decompress data failed");
2877                 qemu_file_set_error(decomp_file, ret);
2878             }
2879
2880             qemu_mutex_lock(&decomp_done_lock);
2881             param->done = true;
2882             qemu_cond_signal(&decomp_done_cond);
2883             qemu_mutex_unlock(&decomp_done_lock);
2884
2885             qemu_mutex_lock(&param->mutex);
2886         } else {
2887             qemu_cond_wait(&param->cond, &param->mutex);
2888         }
2889     }
2890     qemu_mutex_unlock(&param->mutex);
2891
2892     return NULL;
2893 }
2894
2895 static int wait_for_decompress_done(void)
2896 {
2897     int idx, thread_count;
2898
2899     if (!migrate_use_compression()) {
2900         return 0;
2901     }
2902
2903     thread_count = migrate_decompress_threads();
2904     qemu_mutex_lock(&decomp_done_lock);
2905     for (idx = 0; idx < thread_count; idx++) {
2906         while (!decomp_param[idx].done) {
2907             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2908         }
2909     }
2910     qemu_mutex_unlock(&decomp_done_lock);
2911     return qemu_file_get_error(decomp_file);
2912 }
2913
2914 static void compress_threads_load_cleanup(void)
2915 {
2916     int i, thread_count;
2917
2918     if (!migrate_use_compression()) {
2919         return;
2920     }
2921     thread_count = migrate_decompress_threads();
2922     for (i = 0; i < thread_count; i++) {
2923         /*
2924          * we use it as a indicator which shows if the thread is
2925          * properly init'd or not
2926          */
2927         if (!decomp_param[i].compbuf) {
2928             break;
2929         }
2930
2931         qemu_mutex_lock(&decomp_param[i].mutex);
2932         decomp_param[i].quit = true;
2933         qemu_cond_signal(&decomp_param[i].cond);
2934         qemu_mutex_unlock(&decomp_param[i].mutex);
2935     }
2936     for (i = 0; i < thread_count; i++) {
2937         if (!decomp_param[i].compbuf) {
2938             break;
2939         }
2940
2941         qemu_thread_join(decompress_threads + i);
2942         qemu_mutex_destroy(&decomp_param[i].mutex);
2943         qemu_cond_destroy(&decomp_param[i].cond);
2944         inflateEnd(&decomp_param[i].stream);
2945         g_free(decomp_param[i].compbuf);
2946         decomp_param[i].compbuf = NULL;
2947     }
2948     g_free(decompress_threads);
2949     g_free(decomp_param);
2950     decompress_threads = NULL;
2951     decomp_param = NULL;
2952     decomp_file = NULL;
2953 }
2954
2955 static int compress_threads_load_setup(QEMUFile *f)
2956 {
2957     int i, thread_count;
2958
2959     if (!migrate_use_compression()) {
2960         return 0;
2961     }
2962
2963     thread_count = migrate_decompress_threads();
2964     decompress_threads = g_new0(QemuThread, thread_count);
2965     decomp_param = g_new0(DecompressParam, thread_count);
2966     qemu_mutex_init(&decomp_done_lock);
2967     qemu_cond_init(&decomp_done_cond);
2968     decomp_file = f;
2969     for (i = 0; i < thread_count; i++) {
2970         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2971             goto exit;
2972         }
2973
2974         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2975         qemu_mutex_init(&decomp_param[i].mutex);
2976         qemu_cond_init(&decomp_param[i].cond);
2977         decomp_param[i].done = true;
2978         decomp_param[i].quit = false;
2979         qemu_thread_create(decompress_threads + i, "decompress",
2980                            do_data_decompress, decomp_param + i,
2981                            QEMU_THREAD_JOINABLE);
2982     }
2983     return 0;
2984 exit:
2985     compress_threads_load_cleanup();
2986     return -1;
2987 }
2988
2989 static void decompress_data_with_multi_threads(QEMUFile *f,
2990                                                void *host, int len)
2991 {
2992     int idx, thread_count;
2993
2994     thread_count = migrate_decompress_threads();
2995     qemu_mutex_lock(&decomp_done_lock);
2996     while (true) {
2997         for (idx = 0; idx < thread_count; idx++) {
2998             if (decomp_param[idx].done) {
2999                 decomp_param[idx].done = false;
3000                 qemu_mutex_lock(&decomp_param[idx].mutex);
3001                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3002                 decomp_param[idx].des = host;
3003                 decomp_param[idx].len = len;
3004                 qemu_cond_signal(&decomp_param[idx].cond);
3005                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3006                 break;
3007             }
3008         }
3009         if (idx < thread_count) {
3010             break;
3011         } else {
3012             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3013         }
3014     }
3015     qemu_mutex_unlock(&decomp_done_lock);
3016 }
3017
3018 /*
3019  * colo cache: this is for secondary VM, we cache the whole
3020  * memory of the secondary VM, it is need to hold the global lock
3021  * to call this helper.
3022  */
3023 int colo_init_ram_cache(void)
3024 {
3025     RAMBlock *block;
3026
3027     WITH_RCU_READ_LOCK_GUARD() {
3028         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3029             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3030                                                     NULL,
3031                                                     false);
3032             if (!block->colo_cache) {
3033                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3034                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3035                              block->used_length);
3036                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3037                     if (block->colo_cache) {
3038                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3039                         block->colo_cache = NULL;
3040                     }
3041                 }
3042                 return -errno;
3043             }
3044         }
3045     }
3046
3047     /*
3048     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3049     * with to decide which page in cache should be flushed into SVM's RAM. Here
3050     * we use the same name 'ram_bitmap' as for migration.
3051     */
3052     if (ram_bytes_total()) {
3053         RAMBlock *block;
3054
3055         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3056             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3057             block->bmap = bitmap_new(pages);
3058         }
3059     }
3060
3061     ram_state_init(&ram_state);
3062     return 0;
3063 }
3064
3065 /* TODO: duplicated with ram_init_bitmaps */
3066 void colo_incoming_start_dirty_log(void)
3067 {
3068     RAMBlock *block = NULL;
3069     /* For memory_global_dirty_log_start below. */
3070     qemu_mutex_lock_iothread();
3071     qemu_mutex_lock_ramlist();
3072
3073     memory_global_dirty_log_sync();
3074     WITH_RCU_READ_LOCK_GUARD() {
3075         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3076             ramblock_sync_dirty_bitmap(ram_state, block);
3077             /* Discard this dirty bitmap record */
3078             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3079         }
3080         memory_global_dirty_log_start();
3081     }
3082     ram_state->migration_dirty_pages = 0;
3083     qemu_mutex_unlock_ramlist();
3084     qemu_mutex_unlock_iothread();
3085 }
3086
3087 /* It is need to hold the global lock to call this helper */
3088 void colo_release_ram_cache(void)
3089 {
3090     RAMBlock *block;
3091
3092     memory_global_dirty_log_stop();
3093     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3094         g_free(block->bmap);
3095         block->bmap = NULL;
3096     }
3097
3098     WITH_RCU_READ_LOCK_GUARD() {
3099         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3100             if (block->colo_cache) {
3101                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3102                 block->colo_cache = NULL;
3103             }
3104         }
3105     }
3106     ram_state_cleanup(&ram_state);
3107 }
3108
3109 /**
3110  * ram_load_setup: Setup RAM for migration incoming side
3111  *
3112  * Returns zero to indicate success and negative for error
3113  *
3114  * @f: QEMUFile where to receive the data
3115  * @opaque: RAMState pointer
3116  */
3117 static int ram_load_setup(QEMUFile *f, void *opaque)
3118 {
3119     if (compress_threads_load_setup(f)) {
3120         return -1;
3121     }
3122
3123     xbzrle_load_setup();
3124     ramblock_recv_map_init();
3125
3126     return 0;
3127 }
3128
3129 static int ram_load_cleanup(void *opaque)
3130 {
3131     RAMBlock *rb;
3132
3133     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3134         qemu_ram_block_writeback(rb);
3135     }
3136
3137     xbzrle_load_cleanup();
3138     compress_threads_load_cleanup();
3139
3140     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3141         g_free(rb->receivedmap);
3142         rb->receivedmap = NULL;
3143     }
3144
3145     return 0;
3146 }
3147
3148 /**
3149  * ram_postcopy_incoming_init: allocate postcopy data structures
3150  *
3151  * Returns 0 for success and negative if there was one error
3152  *
3153  * @mis: current migration incoming state
3154  *
3155  * Allocate data structures etc needed by incoming migration with
3156  * postcopy-ram. postcopy-ram's similarly names
3157  * postcopy_ram_incoming_init does the work.
3158  */
3159 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3160 {
3161     return postcopy_ram_incoming_init(mis);
3162 }
3163
3164 /**
3165  * ram_load_postcopy: load a page in postcopy case
3166  *
3167  * Returns 0 for success or -errno in case of error
3168  *
3169  * Called in postcopy mode by ram_load().
3170  * rcu_read_lock is taken prior to this being called.
3171  *
3172  * @f: QEMUFile where to send the data
3173  */
3174 static int ram_load_postcopy(QEMUFile *f)
3175 {
3176     int flags = 0, ret = 0;
3177     bool place_needed = false;
3178     bool matches_target_page_size = false;
3179     MigrationIncomingState *mis = migration_incoming_get_current();
3180     /* Temporary page that is later 'placed' */
3181     void *postcopy_host_page = mis->postcopy_tmp_page;
3182     void *this_host = NULL;
3183     bool all_zero = true;
3184     int target_pages = 0;
3185
3186     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3187         ram_addr_t addr;
3188         void *host = NULL;
3189         void *page_buffer = NULL;
3190         void *place_source = NULL;
3191         RAMBlock *block = NULL;
3192         uint8_t ch;
3193         int len;
3194
3195         addr = qemu_get_be64(f);
3196
3197         /*
3198          * If qemu file error, we should stop here, and then "addr"
3199          * may be invalid
3200          */
3201         ret = qemu_file_get_error(f);
3202         if (ret) {
3203             break;
3204         }
3205
3206         flags = addr & ~TARGET_PAGE_MASK;
3207         addr &= TARGET_PAGE_MASK;
3208
3209         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3210         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3211                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3212             block = ram_block_from_stream(f, flags);
3213
3214             host = host_from_ram_block_offset(block, addr);
3215             if (!host) {
3216                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3217                 ret = -EINVAL;
3218                 break;
3219             }
3220             target_pages++;
3221             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3222             /*
3223              * Postcopy requires that we place whole host pages atomically;
3224              * these may be huge pages for RAMBlocks that are backed by
3225              * hugetlbfs.
3226              * To make it atomic, the data is read into a temporary page
3227              * that's moved into place later.
3228              * The migration protocol uses,  possibly smaller, target-pages
3229              * however the source ensures it always sends all the components
3230              * of a host page in one chunk.
3231              */
3232             page_buffer = postcopy_host_page +
3233                           ((uintptr_t)host & (block->page_size - 1));
3234             if (target_pages == 1) {
3235                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3236                                                     block->page_size);
3237             } else {
3238                 /* not the 1st TP within the HP */
3239                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3240                     (uintptr_t)this_host) {
3241                     error_report("Non-same host page %p/%p",
3242                                   host, this_host);
3243                     ret = -EINVAL;
3244                     break;
3245                 }
3246             }
3247
3248             /*
3249              * If it's the last part of a host page then we place the host
3250              * page
3251              */
3252             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3253                 place_needed = true;
3254             }
3255             place_source = postcopy_host_page;
3256         }
3257
3258         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3259         case RAM_SAVE_FLAG_ZERO:
3260             ch = qemu_get_byte(f);
3261             /*
3262              * Can skip to set page_buffer when
3263              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3264              */
3265             if (ch || !matches_target_page_size) {
3266                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3267             }
3268             if (ch) {
3269                 all_zero = false;
3270             }
3271             break;
3272
3273         case RAM_SAVE_FLAG_PAGE:
3274             all_zero = false;
3275             if (!matches_target_page_size) {
3276                 /* For huge pages, we always use temporary buffer */
3277                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3278             } else {
3279                 /*
3280                  * For small pages that matches target page size, we
3281                  * avoid the qemu_file copy.  Instead we directly use
3282                  * the buffer of QEMUFile to place the page.  Note: we
3283                  * cannot do any QEMUFile operation before using that
3284                  * buffer to make sure the buffer is valid when
3285                  * placing the page.
3286                  */
3287                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3288                                          TARGET_PAGE_SIZE);
3289             }
3290             break;
3291         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3292             all_zero = false;
3293             len = qemu_get_be32(f);
3294             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3295                 error_report("Invalid compressed data length: %d", len);
3296                 ret = -EINVAL;
3297                 break;
3298             }
3299             decompress_data_with_multi_threads(f, page_buffer, len);
3300             break;
3301
3302         case RAM_SAVE_FLAG_EOS:
3303             /* normal exit */
3304             multifd_recv_sync_main();
3305             break;
3306         default:
3307             error_report("Unknown combination of migration flags: %#x"
3308                          " (postcopy mode)", flags);
3309             ret = -EINVAL;
3310             break;
3311         }
3312
3313         /* Got the whole host page, wait for decompress before placing. */
3314         if (place_needed) {
3315             ret |= wait_for_decompress_done();
3316         }
3317
3318         /* Detect for any possible file errors */
3319         if (!ret && qemu_file_get_error(f)) {
3320             ret = qemu_file_get_error(f);
3321         }
3322
3323         if (!ret && place_needed) {
3324             /* This gets called at the last target page in the host page */
3325             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3326                                                        block->page_size);
3327
3328             if (all_zero) {
3329                 ret = postcopy_place_page_zero(mis, place_dest,
3330                                                block);
3331             } else {
3332                 ret = postcopy_place_page(mis, place_dest,
3333                                           place_source, block);
3334             }
3335             place_needed = false;
3336             target_pages = 0;
3337             /* Assume we have a zero page until we detect something different */
3338             all_zero = true;
3339         }
3340     }
3341
3342     return ret;
3343 }
3344
3345 static bool postcopy_is_advised(void)
3346 {
3347     PostcopyState ps = postcopy_state_get();
3348     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3349 }
3350
3351 static bool postcopy_is_running(void)
3352 {
3353     PostcopyState ps = postcopy_state_get();
3354     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3355 }
3356
3357 /*
3358  * Flush content of RAM cache into SVM's memory.
3359  * Only flush the pages that be dirtied by PVM or SVM or both.
3360  */
3361 void colo_flush_ram_cache(void)
3362 {
3363     RAMBlock *block = NULL;
3364     void *dst_host;
3365     void *src_host;
3366     unsigned long offset = 0;
3367
3368     memory_global_dirty_log_sync();
3369     WITH_RCU_READ_LOCK_GUARD() {
3370         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371             ramblock_sync_dirty_bitmap(ram_state, block);
3372         }
3373     }
3374
3375     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3376     WITH_RCU_READ_LOCK_GUARD() {
3377         block = QLIST_FIRST_RCU(&ram_list.blocks);
3378
3379         while (block) {
3380             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3381
3382             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3383                 >= block->used_length) {
3384                 offset = 0;
3385                 block = QLIST_NEXT_RCU(block, next);
3386             } else {
3387                 migration_bitmap_clear_dirty(ram_state, block, offset);
3388                 dst_host = block->host
3389                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3390                 src_host = block->colo_cache
3391                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3392                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3393             }
3394         }
3395     }
3396     trace_colo_flush_ram_cache_end();
3397 }
3398
3399 /**
3400  * ram_load_precopy: load pages in precopy case
3401  *
3402  * Returns 0 for success or -errno in case of error
3403  *
3404  * Called in precopy mode by ram_load().
3405  * rcu_read_lock is taken prior to this being called.
3406  *
3407  * @f: QEMUFile where to send the data
3408  */
3409 static int ram_load_precopy(QEMUFile *f)
3410 {
3411     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3412     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3413     bool postcopy_advised = postcopy_is_advised();
3414     if (!migrate_use_compression()) {
3415         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3416     }
3417
3418     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3419         ram_addr_t addr, total_ram_bytes;
3420         void *host = NULL, *host_bak = NULL;
3421         uint8_t ch;
3422
3423         /*
3424          * Yield periodically to let main loop run, but an iteration of
3425          * the main loop is expensive, so do it each some iterations
3426          */
3427         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3428             aio_co_schedule(qemu_get_current_aio_context(),
3429                             qemu_coroutine_self());
3430             qemu_coroutine_yield();
3431         }
3432         i++;
3433
3434         addr = qemu_get_be64(f);
3435         flags = addr & ~TARGET_PAGE_MASK;
3436         addr &= TARGET_PAGE_MASK;
3437
3438         if (flags & invalid_flags) {
3439             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3440                 error_report("Received an unexpected compressed page");
3441             }
3442
3443             ret = -EINVAL;
3444             break;
3445         }
3446
3447         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3448                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3449             RAMBlock *block = ram_block_from_stream(f, flags);
3450
3451             host = host_from_ram_block_offset(block, addr);
3452             /*
3453              * After going into COLO stage, we should not load the page
3454              * into SVM's memory directly, we put them into colo_cache firstly.
3455              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3456              * Previously, we copied all these memory in preparing stage of COLO
3457              * while we need to stop VM, which is a time-consuming process.
3458              * Here we optimize it by a trick, back-up every page while in
3459              * migration process while COLO is enabled, though it affects the
3460              * speed of the migration, but it obviously reduce the downtime of
3461              * back-up all SVM'S memory in COLO preparing stage.
3462              */
3463             if (migration_incoming_colo_enabled()) {
3464                 if (migration_incoming_in_colo_state()) {
3465                     /* In COLO stage, put all pages into cache temporarily */
3466                     host = colo_cache_from_block_offset(block, addr, true);
3467                 } else {
3468                    /*
3469                     * In migration stage but before COLO stage,
3470                     * Put all pages into both cache and SVM's memory.
3471                     */
3472                     host_bak = colo_cache_from_block_offset(block, addr, false);
3473                 }
3474             }
3475             if (!host) {
3476                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3477                 ret = -EINVAL;
3478                 break;
3479             }
3480             if (!migration_incoming_in_colo_state()) {
3481                 ramblock_recv_bitmap_set(block, host);
3482             }
3483
3484             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3485         }
3486
3487         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3488         case RAM_SAVE_FLAG_MEM_SIZE:
3489             /* Synchronize RAM block list */
3490             total_ram_bytes = addr;
3491             while (!ret && total_ram_bytes) {
3492                 RAMBlock *block;
3493                 char id[256];
3494                 ram_addr_t length;
3495
3496                 len = qemu_get_byte(f);
3497                 qemu_get_buffer(f, (uint8_t *)id, len);
3498                 id[len] = 0;
3499                 length = qemu_get_be64(f);
3500
3501                 block = qemu_ram_block_by_name(id);
3502                 if (block && !qemu_ram_is_migratable(block)) {
3503                     error_report("block %s should not be migrated !", id);
3504                     ret = -EINVAL;
3505                 } else if (block) {
3506                     if (length != block->used_length) {
3507                         Error *local_err = NULL;
3508
3509                         ret = qemu_ram_resize(block, length,
3510                                               &local_err);
3511                         if (local_err) {
3512                             error_report_err(local_err);
3513                         }
3514                     }
3515                     /* For postcopy we need to check hugepage sizes match */
3516                     if (postcopy_advised &&
3517                         block->page_size != qemu_host_page_size) {
3518                         uint64_t remote_page_size = qemu_get_be64(f);
3519                         if (remote_page_size != block->page_size) {
3520                             error_report("Mismatched RAM page size %s "
3521                                          "(local) %zd != %" PRId64,
3522                                          id, block->page_size,
3523                                          remote_page_size);
3524                             ret = -EINVAL;
3525                         }
3526                     }
3527                     if (migrate_ignore_shared()) {
3528                         hwaddr addr = qemu_get_be64(f);
3529                         if (ramblock_is_ignored(block) &&
3530                             block->mr->addr != addr) {
3531                             error_report("Mismatched GPAs for block %s "
3532                                          "%" PRId64 "!= %" PRId64,
3533                                          id, (uint64_t)addr,
3534                                          (uint64_t)block->mr->addr);
3535                             ret = -EINVAL;
3536                         }
3537                     }
3538                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3539                                           block->idstr);
3540                 } else {
3541                     error_report("Unknown ramblock \"%s\", cannot "
3542                                  "accept migration", id);
3543                     ret = -EINVAL;
3544                 }
3545
3546                 total_ram_bytes -= length;
3547             }
3548             break;
3549
3550         case RAM_SAVE_FLAG_ZERO:
3551             ch = qemu_get_byte(f);
3552             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3553             break;
3554
3555         case RAM_SAVE_FLAG_PAGE:
3556             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3557             break;
3558
3559         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3560             len = qemu_get_be32(f);
3561             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3562                 error_report("Invalid compressed data length: %d", len);
3563                 ret = -EINVAL;
3564                 break;
3565             }
3566             decompress_data_with_multi_threads(f, host, len);
3567             break;
3568
3569         case RAM_SAVE_FLAG_XBZRLE:
3570             if (load_xbzrle(f, addr, host) < 0) {
3571                 error_report("Failed to decompress XBZRLE page at "
3572                              RAM_ADDR_FMT, addr);
3573                 ret = -EINVAL;
3574                 break;
3575             }
3576             break;
3577         case RAM_SAVE_FLAG_EOS:
3578             /* normal exit */
3579             multifd_recv_sync_main();
3580             break;
3581         default:
3582             if (flags & RAM_SAVE_FLAG_HOOK) {
3583                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3584             } else {
3585                 error_report("Unknown combination of migration flags: %#x",
3586                              flags);
3587                 ret = -EINVAL;
3588             }
3589         }
3590         if (!ret) {
3591             ret = qemu_file_get_error(f);
3592         }
3593         if (!ret && host_bak) {
3594             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3595         }
3596     }
3597
3598     ret |= wait_for_decompress_done();
3599     return ret;
3600 }
3601
3602 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3603 {
3604     int ret = 0;
3605     static uint64_t seq_iter;
3606     /*
3607      * If system is running in postcopy mode, page inserts to host memory must
3608      * be atomic
3609      */
3610     bool postcopy_running = postcopy_is_running();
3611
3612     seq_iter++;
3613
3614     if (version_id != 4) {
3615         return -EINVAL;
3616     }
3617
3618     /*
3619      * This RCU critical section can be very long running.
3620      * When RCU reclaims in the code start to become numerous,
3621      * it will be necessary to reduce the granularity of this
3622      * critical section.
3623      */
3624     WITH_RCU_READ_LOCK_GUARD() {
3625         if (postcopy_running) {
3626             ret = ram_load_postcopy(f);
3627         } else {
3628             ret = ram_load_precopy(f);
3629         }
3630     }
3631     trace_ram_load_complete(ret, seq_iter);
3632
3633     return ret;
3634 }
3635
3636 static bool ram_has_postcopy(void *opaque)
3637 {
3638     RAMBlock *rb;
3639     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3640         if (ramblock_is_pmem(rb)) {
3641             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3642                          "is not supported now!", rb->idstr, rb->host);
3643             return false;
3644         }
3645     }
3646
3647     return migrate_postcopy_ram();
3648 }
3649
3650 /* Sync all the dirty bitmap with destination VM.  */
3651 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3652 {
3653     RAMBlock *block;
3654     QEMUFile *file = s->to_dst_file;
3655     int ramblock_count = 0;
3656
3657     trace_ram_dirty_bitmap_sync_start();
3658
3659     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3660         qemu_savevm_send_recv_bitmap(file, block->idstr);
3661         trace_ram_dirty_bitmap_request(block->idstr);
3662         ramblock_count++;
3663     }
3664
3665     trace_ram_dirty_bitmap_sync_wait();
3666
3667     /* Wait until all the ramblocks' dirty bitmap synced */
3668     while (ramblock_count--) {
3669         qemu_sem_wait(&s->rp_state.rp_sem);
3670     }
3671
3672     trace_ram_dirty_bitmap_sync_complete();
3673
3674     return 0;
3675 }
3676
3677 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3678 {
3679     qemu_sem_post(&s->rp_state.rp_sem);
3680 }
3681
3682 /*
3683  * Read the received bitmap, revert it as the initial dirty bitmap.
3684  * This is only used when the postcopy migration is paused but wants
3685  * to resume from a middle point.
3686  */
3687 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3688 {
3689     int ret = -EINVAL;
3690     QEMUFile *file = s->rp_state.from_dst_file;
3691     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3692     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3693     uint64_t size, end_mark;
3694
3695     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3696
3697     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3698         error_report("%s: incorrect state %s", __func__,
3699                      MigrationStatus_str(s->state));
3700         return -EINVAL;
3701     }
3702
3703     /*
3704      * Note: see comments in ramblock_recv_bitmap_send() on why we
3705      * need the endianess convertion, and the paddings.
3706      */
3707     local_size = ROUND_UP(local_size, 8);
3708
3709     /* Add paddings */
3710     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3711
3712     size = qemu_get_be64(file);
3713
3714     /* The size of the bitmap should match with our ramblock */
3715     if (size != local_size) {
3716         error_report("%s: ramblock '%s' bitmap size mismatch "
3717                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3718                      block->idstr, size, local_size);
3719         ret = -EINVAL;
3720         goto out;
3721     }
3722
3723     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3724     end_mark = qemu_get_be64(file);
3725
3726     ret = qemu_file_get_error(file);
3727     if (ret || size != local_size) {
3728         error_report("%s: read bitmap failed for ramblock '%s': %d"
3729                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3730                      __func__, block->idstr, ret, local_size, size);
3731         ret = -EIO;
3732         goto out;
3733     }
3734
3735     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3736         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3737                      __func__, block->idstr, end_mark);
3738         ret = -EINVAL;
3739         goto out;
3740     }
3741
3742     /*
3743      * Endianess convertion. We are during postcopy (though paused).
3744      * The dirty bitmap won't change. We can directly modify it.
3745      */
3746     bitmap_from_le(block->bmap, le_bitmap, nbits);
3747
3748     /*
3749      * What we received is "received bitmap". Revert it as the initial
3750      * dirty bitmap for this ramblock.
3751      */
3752     bitmap_complement(block->bmap, block->bmap, nbits);
3753
3754     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3755
3756     /*
3757      * We succeeded to sync bitmap for current ramblock. If this is
3758      * the last one to sync, we need to notify the main send thread.
3759      */
3760     ram_dirty_bitmap_reload_notify(s);
3761
3762     ret = 0;
3763 out:
3764     g_free(le_bitmap);
3765     return ret;
3766 }
3767
3768 static int ram_resume_prepare(MigrationState *s, void *opaque)
3769 {
3770     RAMState *rs = *(RAMState **)opaque;
3771     int ret;
3772
3773     ret = ram_dirty_bitmap_sync_all(s, rs);
3774     if (ret) {
3775         return ret;
3776     }
3777
3778     ram_state_resume_prepare(rs, s->to_dst_file);
3779
3780     return 0;
3781 }
3782
3783 static SaveVMHandlers savevm_ram_handlers = {
3784     .save_setup = ram_save_setup,
3785     .save_live_iterate = ram_save_iterate,
3786     .save_live_complete_postcopy = ram_save_complete,
3787     .save_live_complete_precopy = ram_save_complete,
3788     .has_postcopy = ram_has_postcopy,
3789     .save_live_pending = ram_save_pending,
3790     .load_state = ram_load,
3791     .save_cleanup = ram_save_cleanup,
3792     .load_setup = ram_load_setup,
3793     .load_cleanup = ram_load_cleanup,
3794     .resume_prepare = ram_resume_prepare,
3795 };
3796
3797 void ram_mig_init(void)
3798 {
3799     qemu_mutex_init(&XBZRLE.lock);
3800     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3801 }