migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */
  69
/*
 * RAM_SAVE_FLAG_* values.  Every value is a distinct single bit, so
 * several flags can be OR-ed together into one word.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; new flags start at 0x100 */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* Global XBZRLE statistics counters. */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache, the lock that guards it and
 * the scratch buffers used by the compression.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* Taken by XBZRLE_cache_lock(), released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 static bool ramblock_is_ignored(RAMBlock *block)
 163 {
 164     return !qemu_ram_is_migratable(block) ||
 165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 166 }
 167
/*
 * Filtered RAMBlock iterators.
 *
 * Both macros wrap INTERNAL_RAMBLOCK_FOREACH() and append
 * "if (<skip condition>) {} else", so the statement or braced body the
 * caller writes after the macro becomes the else branch and only runs
 * for blocks that pass the filter.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * Remove the unfiltered iterator for the rest of this file; presumably
 * so that only the filtered variants above get used here.
 */
#undef RAMBLOCK_FOREACH
 178
 179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 180 {
 181     RAMBlock *block;
 182     int ret = 0;
 183
 184     rcu_read_lock();
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     rcu_read_unlock();
 192     return ret;
 193 }
 194
 195 static void ramblock_recv_map_init(void)
 196 {
 197     RAMBlock *rb;
 198
 199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 200         assert(!rb->receivedmap);
 201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 202     }
 203 }
 204
 205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 206 {
 207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 208                     rb->receivedmap);
 209 }
 210
 211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 212 {
 213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 217 {
 218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 219 }
 220
 221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 222                                     size_t nr)
 223 {
 224     bitmap_set_atomic(rb->receivedmap,
 225                       ramblock_recv_bitmap_offset(host_addr, rb),
 226                       nr);
 227 }
 228
 229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 230
 231 /*
 232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 233  *
 234  * Returns >0 if success with sent bytes, or <0 if error.
 235  */
 236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 237                                   const char *block_name)
 238 {
 239     RAMBlock *block = qemu_ram_block_by_name(block_name);
 240     unsigned long *le_bitmap, nbits;
 241     uint64_t size;
 242
 243     if (!block) {
 244         error_report("%s: invalid block name: %s", __func__, block_name);
 245         return -1;
 246     }
 247
 248     nbits = block->used_length >> TARGET_PAGE_BITS;
 249
 250     /*
 251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252      * machines we may need 4 more bytes for padding (see below
 253      * comment). So extend it a bit before hand.
 254      */
 255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 256
 257     /*
 258      * Always use little endian when sending the bitmap. This is
 259      * required that when source and destination VMs are not using the
 260      * same endianess. (Note: big endian won't work.)
 261      */
 262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 263
 264     /* Size of the bitmap, in bytes */
 265     size = DIV_ROUND_UP(nbits, 8);
 266
 267     /*
 268      * size is always aligned to 8 bytes for 64bit machines, but it
 269      * may not be true for 32bit machines. We need this padding to
 270      * make sure the migration can survive even between 32bit and
 271      * 64bit machines.
 272      */
 273     size = ROUND_UP(size, 8);
 274
 275     qemu_put_be64(file, size);
 276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 277     /*
 278      * Mark as an end, in case the middle part is screwed up due to
 279      * some "misterious" reason.
 280      */
 281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 282     qemu_fflush(file);
 283
 284     g_free(le_bitmap);
 285
 286     if (qemu_file_get_error(file)) {
 287         return qemu_file_get_error(file);
 288     }
 289
 290     return size + sizeof(size);
 291 }
 292
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Requests are linked through next_req into
 * RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* Block the request refers to */
    RAMBlock *rb;
    /* Start of the requested range (presumably relative to rb) */
    hwaddr    offset;
    /* Length of the requested range */
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 304
 305 /* State of RAM for migration */
 306 struct RAMState {
 307     /* QEMUFile used for this migration */
 308     QEMUFile *f;
 309     /* Last block that we have visited searching for dirty pages */
 310     RAMBlock *last_seen_block;
 311     /* Last block from where we have sent data */
 312     RAMBlock *last_sent_block;
 313     /* Last dirty target page we have sent */
 314     ram_addr_t last_page;
 315     /* last ram version we have seen */
 316     uint32_t last_version;
 317     /* We are in the first round */
 318     bool ram_bulk_stage;
 319     /* The free page optimization is enabled */
 320     bool fpo_enabled;
 321     /* How many times we have dirty too many pages */
 322     int dirty_rate_high_cnt;
 323     /* these variables are used for bitmap sync */
 324     /* last time we did a full bitmap_sync */
 325     int64_t time_last_bitmap_sync;
 326     /* bytes transferred at start_time */
 327     uint64_t bytes_xfer_prev;
 328     /* number of dirty pages since start_time */
 329     uint64_t num_dirty_pages_period;
 330     /* xbzrle misses since the beginning of the period */
 331     uint64_t xbzrle_cache_miss_prev;
 332
 333     /* compression statistics since the beginning of the period */
 334     /* amount of count that no free thread to compress data */
 335     uint64_t compress_thread_busy_prev;
 336     /* amount bytes after compression */
 337     uint64_t compressed_size_prev;
 338     /* amount of compressed pages */
 339     uint64_t compress_pages_prev;
 340
 341     /* total handled target pages at the beginning of period */
 342     uint64_t target_page_count_prev;
 343     /* total handled target pages since start */
 344     uint64_t target_page_count;
 345     /* number of dirty bits in the bitmap */
 346     uint64_t migration_dirty_pages;
 347     /* Protects modification of the bitmap and migration dirty pages */
 348     QemuMutex bitmap_mutex;
 349     /* The RAMBlock used in the last src_page_requests */
 350     RAMBlock *last_req_rb;
 351     /* Queue of outstanding page requests from the destination */
 352     QemuMutex src_page_req_mutex;
 353     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 354 };
 355 typedef struct RAMState RAMState;
 356
/*
 * The RAM migration state.  May be NULL: ram_bytes_remaining() and
 * precopy_enable_free_page_optimization() both check for that.
 */
static RAMState *ram_state;

/* Notifiers called by precopy_notify(); see precopy_infrastructure_init() */
static NotifierWithReturnList precopy_notifier_list;
 360
 361 void precopy_infrastructure_init(void)
 362 {
 363     notifier_with_return_list_init(&precopy_notifier_list);
 364 }
 365
 366 void precopy_add_notifier(NotifierWithReturn *n)
 367 {
 368     notifier_with_return_list_add(&precopy_notifier_list, n);
 369 }
 370
 371 void precopy_remove_notifier(NotifierWithReturn *n)
 372 {
 373     notifier_with_return_remove(n);
 374 }
 375
 376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 377 {
 378     PrecopyNotifyData pnd;
 379     pnd.reason = reason;
 380     pnd.errp = errp;
 381
 382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 383 }
 384
 385 void precopy_enable_free_page_optimization(void)
 386 {
 387     if (!ram_state) {
 388         return;
 389     }
 390
 391     ram_state->fpo_enabled = true;
 392 }
 393
 394 uint64_t ram_bytes_remaining(void)
 395 {
 396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 397                        0;
 398 }
 399
/*
 * Global RAM migration counters; multifd_send_pages() adds to
 * .multifd_bytes and .transferred.
 */
MigrationStats ram_counters;
 401
 402 /* used by the search for pages to send */
 403 struct PageSearchStatus {
 404     /* Current block being searched */
 405     RAMBlock    *block;
 406     /* Current page to search from */
 407     unsigned long page;
 408     /* Set once we wrap around */
 409     bool         complete_round;
 410 };
 411 typedef struct PageSearchStatus PageSearchStatus;
 412
/* Global compression statistics counters. */
CompressionStats compression_counters;
 414
 415 struct CompressParam {
 416     bool done;
 417     bool quit;
 418     bool zero_page;
 419     QEMUFile *file;
 420     QemuMutex mutex;
 421     QemuCond cond;
 422     RAMBlock *block;
 423     ram_addr_t offset;
 424
 425     /* internally used fields */
 426     z_stream stream;
 427     uint8_t *originbuf;
 428 };
 429 typedef struct CompressParam CompressParam;
 430
 431 struct DecompressParam {
 432     bool done;
 433     bool quit;
 434     QemuMutex mutex;
 435     QemuCond cond;
 436     void *des;
 437     uint8_t *compbuf;
 438     int len;
 439     z_stream stream;
 440 };
 441 typedef struct DecompressParam DecompressParam;
 442
/*
 * Compression workers: comp_param[i] is the state of the thread
 * compress_threads[i].  Both arrays are allocated by
 * compress_threads_save_setup() and freed (and reset to NULL) by
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/*
 * Decompression counterparts of the objects above; they are not used in
 * this part of the file.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/*
 * Forward declaration, needed by do_data_compress().  The bool result
 * is stored by the caller as CompressParam.zero_page.
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 462
 463 static void *do_data_compress(void *opaque)
 464 {
 465     CompressParam *param = opaque;
 466     RAMBlock *block;
 467     ram_addr_t offset;
 468     bool zero_page;
 469
 470     qemu_mutex_lock(&param->mutex);
 471     while (!param->quit) {
 472         if (param->block) {
 473             block = param->block;
 474             offset = param->offset;
 475             param->block = NULL;
 476             qemu_mutex_unlock(&param->mutex);
 477
 478             zero_page = do_compress_ram_page(param->file, &param->stream,
 479                                              block, offset, param->originbuf);
 480
 481             qemu_mutex_lock(&comp_done_lock);
 482             param->done = true;
 483             param->zero_page = zero_page;
 484             qemu_cond_signal(&comp_done_cond);
 485             qemu_mutex_unlock(&comp_done_lock);
 486
 487             qemu_mutex_lock(&param->mutex);
 488         } else {
 489             qemu_cond_wait(&param->cond, &param->mutex);
 490         }
 491     }
 492     qemu_mutex_unlock(&param->mutex);
 493
 494     return NULL;
 495 }
 496
 497 static void compress_threads_save_cleanup(void)
 498 {
 499     int i, thread_count;
 500
 501     if (!migrate_use_compression() || !comp_param) {
 502         return;
 503     }
 504
 505     thread_count = migrate_compress_threads();
 506     for (i = 0; i < thread_count; i++) {
 507         /*
 508          * we use it as a indicator which shows if the thread is
 509          * properly init'd or not
 510          */
 511         if (!comp_param[i].file) {
 512             break;
 513         }
 514
 515         qemu_mutex_lock(&comp_param[i].mutex);
 516         comp_param[i].quit = true;
 517         qemu_cond_signal(&comp_param[i].cond);
 518         qemu_mutex_unlock(&comp_param[i].mutex);
 519
 520         qemu_thread_join(compress_threads + i);
 521         qemu_mutex_destroy(&comp_param[i].mutex);
 522         qemu_cond_destroy(&comp_param[i].cond);
 523         deflateEnd(&comp_param[i].stream);
 524         g_free(comp_param[i].originbuf);
 525         qemu_fclose(comp_param[i].file);
 526         comp_param[i].file = NULL;
 527     }
 528     qemu_mutex_destroy(&comp_done_lock);
 529     qemu_cond_destroy(&comp_done_cond);
 530     g_free(compress_threads);
 531     g_free(comp_param);
 532     compress_threads = NULL;
 533     comp_param = NULL;
 534 }
 535
 536 static int compress_threads_save_setup(void)
 537 {
 538     int i, thread_count;
 539
 540     if (!migrate_use_compression()) {
 541         return 0;
 542     }
 543     thread_count = migrate_compress_threads();
 544     compress_threads = g_new0(QemuThread, thread_count);
 545     comp_param = g_new0(CompressParam, thread_count);
 546     qemu_cond_init(&comp_done_cond);
 547     qemu_mutex_init(&comp_done_lock);
 548     for (i = 0; i < thread_count; i++) {
 549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 550         if (!comp_param[i].originbuf) {
 551             goto exit;
 552         }
 553
 554         if (deflateInit(&comp_param[i].stream,
 555                         migrate_compress_level()) != Z_OK) {
 556             g_free(comp_param[i].originbuf);
 557             goto exit;
 558         }
 559
 560         /* comp_param[i].file is just used as a dummy buffer to save data,
 561          * set its ops to empty.
 562          */
 563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 564         comp_param[i].done = true;
 565         comp_param[i].quit = false;
 566         qemu_mutex_init(&comp_param[i].mutex);
 567         qemu_cond_init(&comp_param[i].cond);
 568         qemu_thread_create(compress_threads + i, "compress",
 569                            do_data_compress, comp_param + i,
 570                            QEMU_THREAD_JOINABLE);
 571     }
 572     return 0;
 573
 574 exit:
 575     compress_threads_save_cleanup();
 576     return -1;
 577 }
 578
/* Multiple fd's */

/* Magic and version carried (big endian) by every multifd header */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Bit values for the flags field of MultiFDPacket_t */
#define MULTIFD_FLAG_SYNC (1 << 0)

/*
 * Divided by the target page size to get the number of pages per packet.
 * This value needs to be a multiple of qemu_target_page_size()
 */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 588
/*
 * First message sent on every multifd channel (see
 * multifd_send_initial_packet() / multifd_recv_initial_packet()).
 * The struct is packed and written to the channel as is; magic and
 * version are big endian on the wire.
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 597
/*
 * Header of a multifd packet.  All integer fields are big endian on
 * the wire (see multifd_send_fill_packet() and
 * multifd_recv_unfill_packet()).
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of valid entries in offset[] */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global sequence number of this packet */
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* idstr of the RAMBlock the pages belong to */
    char ramblock[256];
    /* offset of each page inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 612
/*
 * A batch of pages that all belong to one RAMBlock.  offset[] and
 * iov[] are parallel arrays of 'allocated' entries each, of which the
 * first 'used' are valid (see multifd_pages_init() and
 * multifd_queue_page()).
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* block every page of the batch belongs to; NULL while empty */
    RAMBlock *block;
} MultiFDPages_t;
 626
/* State of one multifd send channel and of the thread serving it. */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
}  MultiFDSendParams;
 665
/* State of one multifd receive channel and of the thread serving it. */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /*
     * packets sent through this channel (wording shared with the send
     * side; presumably this counts received packets -- TODO confirm)
     */
    uint64_t num_packets;
    /*
     * pages sent through this channel (presumably received pages here
     * -- TODO confirm)
     */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 700
 701 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 702 {
 703     MultiFDInit_t msg;
 704     int ret;
 705
 706     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 707     msg.version = cpu_to_be32(MULTIFD_VERSION);
 708     msg.id = p->id;
 709     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 710
 711     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 712     if (ret != 0) {
 713         return -1;
 714     }
 715     return 0;
 716 }
 717
 718 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 719 {
 720     MultiFDInit_t msg;
 721     int ret;
 722
 723     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 724     if (ret != 0) {
 725         return -1;
 726     }
 727
 728     msg.magic = be32_to_cpu(msg.magic);
 729     msg.version = be32_to_cpu(msg.version);
 730
 731     if (msg.magic != MULTIFD_MAGIC) {
 732         error_setg(errp, "multifd: received packet magic %x "
 733                    "expected %x", msg.magic, MULTIFD_MAGIC);
 734         return -1;
 735     }
 736
 737     if (msg.version != MULTIFD_VERSION) {
 738         error_setg(errp, "multifd: received packet version %d "
 739                    "expected %d", msg.version, MULTIFD_VERSION);
 740         return -1;
 741     }
 742
 743     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 744         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 745         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 746
 747         error_setg(errp, "multifd: received uuid '%s' and expected "
 748                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 749         g_free(uuid);
 750         g_free(msg_uuid);
 751         return -1;
 752     }
 753
 754     if (msg.id > migrate_multifd_channels()) {
 755         error_setg(errp, "multifd: received channel version %d "
 756                    "expected %d", msg.version, MULTIFD_VERSION);
 757         return -1;
 758     }
 759
 760     return msg.id;
 761 }
 762
 763 static MultiFDPages_t *multifd_pages_init(size_t size)
 764 {
 765     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 766
 767     pages->allocated = size;
 768     pages->iov = g_new0(struct iovec, size);
 769     pages->offset = g_new0(ram_addr_t, size);
 770
 771     return pages;
 772 }
 773
 774 static void multifd_pages_clear(MultiFDPages_t *pages)
 775 {
 776     pages->used = 0;
 777     pages->allocated = 0;
 778     pages->packet_num = 0;
 779     pages->block = NULL;
 780     g_free(pages->iov);
 781     pages->iov = NULL;
 782     g_free(pages->offset);
 783     pages->offset = NULL;
 784     g_free(pages);
 785 }
 786
 787 static void multifd_send_fill_packet(MultiFDSendParams *p)
 788 {
 789     MultiFDPacket_t *packet = p->packet;
 790     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 791     int i;
 792
 793     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 794     packet->version = cpu_to_be32(MULTIFD_VERSION);
 795     packet->flags = cpu_to_be32(p->flags);
 796     packet->pages_alloc = cpu_to_be32(page_max);
 797     packet->pages_used = cpu_to_be32(p->pages->used);
 798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 799     packet->packet_num = cpu_to_be64(p->packet_num);
 800
 801     if (p->pages->block) {
 802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 803     }
 804
 805     for (i = 0; i < p->pages->used; i++) {
 806         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 807     }
 808 }
 809
 810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 811 {
 812     MultiFDPacket_t *packet = p->packet;
 813     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 814     RAMBlock *block;
 815     int i;
 816
 817     packet->magic = be32_to_cpu(packet->magic);
 818     if (packet->magic != MULTIFD_MAGIC) {
 819         error_setg(errp, "multifd: received packet "
 820                    "magic %x and expected magic %x",
 821                    packet->magic, MULTIFD_MAGIC);
 822         return -1;
 823     }
 824
 825     packet->version = be32_to_cpu(packet->version);
 826     if (packet->version != MULTIFD_VERSION) {
 827         error_setg(errp, "multifd: received packet "
 828                    "version %d and expected version %d",
 829                    packet->version, MULTIFD_VERSION);
 830         return -1;
 831     }
 832
 833     p->flags = be32_to_cpu(packet->flags);
 834
 835     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 836     /*
 837      * If we recevied a packet that is 100 times bigger than expected
 838      * just stop migration.  It is a magic number.
 839      */
 840     if (packet->pages_alloc > pages_max * 100) {
 841         error_setg(errp, "multifd: received packet "
 842                    "with size %d and expected a maximum size of %d",
 843                    packet->pages_alloc, pages_max * 100) ;
 844         return -1;
 845     }
 846     /*
 847      * We received a packet that is bigger than expected but inside
 848      * reasonable limits (see previous comment).  Just reallocate.
 849      */
 850     if (packet->pages_alloc > p->pages->allocated) {
 851         multifd_pages_clear(p->pages);
 852         p->pages = multifd_pages_init(packet->pages_alloc);
 853     }
 854
 855     p->pages->used = be32_to_cpu(packet->pages_used);
 856     if (p->pages->used > packet->pages_alloc) {
 857         error_setg(errp, "multifd: received packet "
 858                    "with %d pages and expected maximum pages are %d",
 859                    p->pages->used, packet->pages_alloc) ;
 860         return -1;
 861     }
 862
 863     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 864     p->packet_num = be64_to_cpu(packet->packet_num);
 865
 866     if (p->pages->used) {
 867         /* make sure that ramblock is 0 terminated */
 868         packet->ramblock[255] = 0;
 869         block = qemu_ram_block_by_name(packet->ramblock);
 870         if (!block) {
 871             error_setg(errp, "multifd: unknown ram block %s",
 872                        packet->ramblock);
 873             return -1;
 874         }
 875     }
 876
 877     for (i = 0; i < p->pages->used; i++) {
 878         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 879
 880         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 881             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 882                        " (max " RAM_ADDR_FMT ")",
 883                        offset, block->max_length);
 884             return -1;
 885         }
 886         p->pages->iov[i].iov_base = block->host + offset;
 887         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 888     }
 889
 890     return 0;
 891 }
 892
/* Global state of the multifd send side, shared by all send channels. */
struct {
    /* per-channel state, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 904
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create one "pages" struct for each channel, plus a main one.  Each
 * time we need to send a batch of pages we swap the main one with the
 * one of the channel that is going to send it.  There are two reasons
 * for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * must have finished with its own one, otherwise pending_job can't be
 * false.
 */
 922
 923 static void multifd_send_pages(void)
 924 {
 925     int i;
 926     static int next_channel;
 927     MultiFDSendParams *p = NULL; /* make happy gcc */
 928     MultiFDPages_t *pages = multifd_send_state->pages;
 929     uint64_t transferred;
 930
 931     qemu_sem_wait(&multifd_send_state->channels_ready);
 932     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 933         p = &multifd_send_state->params[i];
 934
 935         qemu_mutex_lock(&p->mutex);
 936         if (!p->pending_job) {
 937             p->pending_job++;
 938             next_channel = (i + 1) % migrate_multifd_channels();
 939             break;
 940         }
 941         qemu_mutex_unlock(&p->mutex);
 942     }
 943     p->pages->used = 0;
 944
 945     p->packet_num = multifd_send_state->packet_num++;
 946     p->pages->block = NULL;
 947     multifd_send_state->pages = p->pages;
 948     p->pages = pages;
 949     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 950     ram_counters.multifd_bytes += transferred;
 951     ram_counters.transferred += transferred;;
 952     qemu_mutex_unlock(&p->mutex);
 953     qemu_sem_post(&p->sem);
 954 }
 955
 956 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 957 {
 958     MultiFDPages_t *pages = multifd_send_state->pages;
 959
 960     if (!pages->block) {
 961         pages->block = block;
 962     }
 963
 964     if (pages->block == block) {
 965         pages->offset[pages->used] = offset;
 966         pages->iov[pages->used].iov_base = block->host + offset;
 967         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 968         pages->used++;
 969
 970         if (pages->used < pages->allocated) {
 971             return;
 972         }
 973     }
 974
 975     multifd_send_pages();
 976
 977     if (pages->block != block) {
 978         multifd_queue_page(block, offset);
 979     }
 980 }
 981
 982 static void multifd_send_terminate_threads(Error *err)
 983 {
 984     int i;
 985
 986     if (err) {
 987         MigrationState *s = migrate_get_current();
 988         migrate_set_error(s, err);
 989         if (s->state == MIGRATION_STATUS_SETUP ||
 990             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 991             s->state == MIGRATION_STATUS_DEVICE ||
 992             s->state == MIGRATION_STATUS_ACTIVE) {
 993             migrate_set_state(&s->state, s->state,
 994                               MIGRATION_STATUS_FAILED);
 995         }
 996     }
 997
 998     for (i = 0; i < migrate_multifd_channels(); i++) {
 999         MultiFDSendParams *p = &multifd_send_state->params[i];
1000
1001         qemu_mutex_lock(&p->mutex);
1002         p->quit = true;
1003         qemu_sem_post(&p->sem);
1004         qemu_mutex_unlock(&p->mutex);
1005     }
1006 }
1007
1008 void multifd_save_cleanup(void)
1009 {
1010     int i;
1011
1012     if (!migrate_use_multifd()) {
1013         return;
1014     }
1015     multifd_send_terminate_threads(NULL);
1016     for (i = 0; i < migrate_multifd_channels(); i++) {
1017         MultiFDSendParams *p = &multifd_send_state->params[i];
1018
1019         if (p->running) {
1020             qemu_thread_join(&p->thread);
1021         }
1022         socket_send_channel_destroy(p->c);
1023         p->c = NULL;
1024         qemu_mutex_destroy(&p->mutex);
1025         qemu_sem_destroy(&p->sem);
1026         g_free(p->name);
1027         p->name = NULL;
1028         multifd_pages_clear(p->pages);
1029         p->pages = NULL;
1030         p->packet_len = 0;
1031         g_free(p->packet);
1032         p->packet = NULL;
1033     }
1034     qemu_sem_destroy(&multifd_send_state->channels_ready);
1035     qemu_sem_destroy(&multifd_send_state->sem_sync);
1036     g_free(multifd_send_state->params);
1037     multifd_send_state->params = NULL;
1038     multifd_pages_clear(multifd_send_state->pages);
1039     multifd_send_state->pages = NULL;
1040     g_free(multifd_send_state);
1041     multifd_send_state = NULL;
1042 }
1043
1044 static void multifd_send_sync_main(void)
1045 {
1046     int i;
1047
1048     if (!migrate_use_multifd()) {
1049         return;
1050     }
1051     if (multifd_send_state->pages->used) {
1052         multifd_send_pages();
1053     }
1054     for (i = 0; i < migrate_multifd_channels(); i++) {
1055         MultiFDSendParams *p = &multifd_send_state->params[i];
1056
1057         trace_multifd_send_sync_main_signal(p->id);
1058
1059         qemu_mutex_lock(&p->mutex);
1060
1061         p->packet_num = multifd_send_state->packet_num++;
1062         p->flags |= MULTIFD_FLAG_SYNC;
1063         p->pending_job++;
1064         qemu_mutex_unlock(&p->mutex);
1065         qemu_sem_post(&p->sem);
1066     }
1067     for (i = 0; i < migrate_multifd_channels(); i++) {
1068         MultiFDSendParams *p = &multifd_send_state->params[i];
1069
1070         trace_multifd_send_sync_main_wait(p->id);
1071         qemu_sem_wait(&multifd_send_state->sem_sync);
1072     }
1073     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1074 }
1075
/*
 * multifd_send_thread: body of the thread that serves one send channel
 *
 * Sends the initial packet and then loops waiting on the channel
 * semaphore.  For each pending job it fills the packet header with the
 * channel mutex held, writes the header and (if there are any) the pages
 * without holding the mutex, and then signals completion.  The loop
 * ends when a write fails or when quit is set and no job is pending.
 *
 * @opaque: the MultiFDSendParams of the channel
 *
 * Always returns NULL.
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_send_thread_start(p->id);
    rcu_register_thread();

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            /* copy what is needed after the mutex is dropped */
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            uint32_t flags = p->flags;

            p->next_packet_size = used * qemu_target_page_size();
            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags,
                               p->next_packet_size);

            /* the packet header goes first ... */
            ret = qio_channel_write_all(p->c, (void *)p->packet,
                                        p->packet_len, &local_err);
            if (ret != 0) {
                break;
            }

            /* ... followed by the pages themselves, if any */
            if (used) {
                ret = qio_channel_writev_all(p->c, p->pages->iov,
                                             used, &local_err);
                if (ret != 0) {
                    break;
                }
            }

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            /* a sync packet also wakes up whoever waits on sem_sync */
            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&multifd_send_state->sem_sync);
            }
            qemu_sem_post(&multifd_send_state->channels_ready);
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    /* on error, record it and ask all the send threads to quit */
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
1156
1157 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1158 {
1159     MultiFDSendParams *p = opaque;
1160     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1161     Error *local_err = NULL;
1162
1163     if (qio_task_propagate_error(task, &local_err)) {
1164         migrate_set_error(migrate_get_current(), local_err);
1165         multifd_save_cleanup();
1166     } else {
1167         p->c = QIO_CHANNEL(sioc);
1168         qio_channel_set_delay(p->c, false);
1169         p->running = true;
1170         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1171                            QEMU_THREAD_JOINABLE);
1172     }
1173 }
1174
1175 int multifd_save_setup(void)
1176 {
1177     int thread_count;
1178     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1179     uint8_t i;
1180
1181     if (!migrate_use_multifd()) {
1182         return 0;
1183     }
1184     thread_count = migrate_multifd_channels();
1185     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1186     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1187     multifd_send_state->pages = multifd_pages_init(page_count);
1188     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1189     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1190
1191     for (i = 0; i < thread_count; i++) {
1192         MultiFDSendParams *p = &multifd_send_state->params[i];
1193
1194         qemu_mutex_init(&p->mutex);
1195         qemu_sem_init(&p->sem, 0);
1196         p->quit = false;
1197         p->pending_job = 0;
1198         p->id = i;
1199         p->pages = multifd_pages_init(page_count);
1200         p->packet_len = sizeof(MultiFDPacket_t)
1201                       + sizeof(ram_addr_t) * page_count;
1202         p->packet = g_malloc0(p->packet_len);
1203         p->name = g_strdup_printf("multifdsend_%d", i);
1204         socket_send_channel_create(multifd_new_send_channel_async, p);
1205     }
1206     return 0;
1207 }
1208
/*
 * Global state of the multifd receiving side.  It is allocated in
 * multifd_load_setup() and released in multifd_load_cleanup().
 */
struct {
    /* one entry per receive channel, indexed by channel id */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /*
     * syncs main thread and channels: posted by a channel thread when it
     * receives a packet carrying MULTIFD_FLAG_SYNC
     */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1218
1219 static void multifd_recv_terminate_threads(Error *err)
1220 {
1221     int i;
1222
1223     if (err) {
1224         MigrationState *s = migrate_get_current();
1225         migrate_set_error(s, err);
1226         if (s->state == MIGRATION_STATUS_SETUP ||
1227             s->state == MIGRATION_STATUS_ACTIVE) {
1228             migrate_set_state(&s->state, s->state,
1229                               MIGRATION_STATUS_FAILED);
1230         }
1231     }
1232
1233     for (i = 0; i < migrate_multifd_channels(); i++) {
1234         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1235
1236         qemu_mutex_lock(&p->mutex);
1237         /* We could arrive here for two reasons:
1238            - normal quit, i.e. everything went fine, just finished
1239            - error quit: We close the channels so the channel threads
1240              finish the qio_channel_read_all_eof() */
1241         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1242         qemu_mutex_unlock(&p->mutex);
1243     }
1244 }
1245
1246 int multifd_load_cleanup(Error **errp)
1247 {
1248     int i;
1249     int ret = 0;
1250
1251     if (!migrate_use_multifd()) {
1252         return 0;
1253     }
1254     multifd_recv_terminate_threads(NULL);
1255     for (i = 0; i < migrate_multifd_channels(); i++) {
1256         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1257
1258         if (p->running) {
1259             qemu_thread_join(&p->thread);
1260         }
1261         object_unref(OBJECT(p->c));
1262         p->c = NULL;
1263         qemu_mutex_destroy(&p->mutex);
1264         qemu_sem_destroy(&p->sem_sync);
1265         g_free(p->name);
1266         p->name = NULL;
1267         multifd_pages_clear(p->pages);
1268         p->pages = NULL;
1269         p->packet_len = 0;
1270         g_free(p->packet);
1271         p->packet = NULL;
1272     }
1273     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1274     g_free(multifd_recv_state->params);
1275     multifd_recv_state->params = NULL;
1276     g_free(multifd_recv_state);
1277     multifd_recv_state = NULL;
1278
1279     return ret;
1280 }
1281
1282 static void multifd_recv_sync_main(void)
1283 {
1284     int i;
1285
1286     if (!migrate_use_multifd()) {
1287         return;
1288     }
1289     for (i = 0; i < migrate_multifd_channels(); i++) {
1290         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1291
1292         trace_multifd_recv_sync_main_wait(p->id);
1293         qemu_sem_wait(&multifd_recv_state->sem_sync);
1294     }
1295     for (i = 0; i < migrate_multifd_channels(); i++) {
1296         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1297
1298         qemu_mutex_lock(&p->mutex);
1299         if (multifd_recv_state->packet_num < p->packet_num) {
1300             multifd_recv_state->packet_num = p->packet_num;
1301         }
1302         qemu_mutex_unlock(&p->mutex);
1303         trace_multifd_recv_sync_main_signal(p->id);
1304         qemu_sem_post(&p->sem_sync);
1305     }
1306     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1307 }
1308
1309 static void *multifd_recv_thread(void *opaque)
1310 {
1311     MultiFDRecvParams *p = opaque;
1312     Error *local_err = NULL;
1313     int ret;
1314
1315     trace_multifd_recv_thread_start(p->id);
1316     rcu_register_thread();
1317
1318     while (true) {
1319         uint32_t used;
1320         uint32_t flags;
1321
1322         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1323                                        p->packet_len, &local_err);
1324         if (ret == 0) {   /* EOF */
1325             break;
1326         }
1327         if (ret == -1) {   /* Error */
1328             break;
1329         }
1330
1331         qemu_mutex_lock(&p->mutex);
1332         ret = multifd_recv_unfill_packet(p, &local_err);
1333         if (ret) {
1334             qemu_mutex_unlock(&p->mutex);
1335             break;
1336         }
1337
1338         used = p->pages->used;
1339         flags = p->flags;
1340         trace_multifd_recv(p->id, p->packet_num, used, flags,
1341                            p->next_packet_size);
1342         p->num_packets++;
1343         p->num_pages += used;
1344         qemu_mutex_unlock(&p->mutex);
1345
1346         if (used) {
1347             ret = qio_channel_readv_all(p->c, p->pages->iov,
1348                                         used, &local_err);
1349             if (ret != 0) {
1350                 break;
1351             }
1352         }
1353
1354         if (flags & MULTIFD_FLAG_SYNC) {
1355             qemu_sem_post(&multifd_recv_state->sem_sync);
1356             qemu_sem_wait(&p->sem_sync);
1357         }
1358     }
1359
1360     if (local_err) {
1361         multifd_recv_terminate_threads(local_err);
1362     }
1363     qemu_mutex_lock(&p->mutex);
1364     p->running = false;
1365     qemu_mutex_unlock(&p->mutex);
1366
1367     rcu_unregister_thread();
1368     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1369
1370     return NULL;
1371 }
1372
1373 int multifd_load_setup(void)
1374 {
1375     int thread_count;
1376     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1377     uint8_t i;
1378
1379     if (!migrate_use_multifd()) {
1380         return 0;
1381     }
1382     thread_count = migrate_multifd_channels();
1383     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1384     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1385     atomic_set(&multifd_recv_state->count, 0);
1386     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1387
1388     for (i = 0; i < thread_count; i++) {
1389         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1390
1391         qemu_mutex_init(&p->mutex);
1392         qemu_sem_init(&p->sem_sync, 0);
1393         p->id = i;
1394         p->pages = multifd_pages_init(page_count);
1395         p->packet_len = sizeof(MultiFDPacket_t)
1396                       + sizeof(ram_addr_t) * page_count;
1397         p->packet = g_malloc0(p->packet_len);
1398         p->name = g_strdup_printf("multifdrecv_%d", i);
1399     }
1400     return 0;
1401 }
1402
1403 bool multifd_recv_all_channels_created(void)
1404 {
1405     int thread_count = migrate_multifd_channels();
1406
1407     if (!migrate_use_multifd()) {
1408         return true;
1409     }
1410
1411     return thread_count == atomic_read(&multifd_recv_state->count);
1412 }
1413
1414 /*
1415  * Try to receive all multifd channels to get ready for the migration.
1416  * - Return true and do not set @errp when correctly receving all channels;
1417  * - Return false and do not set @errp when correctly receiving the current one;
1418  * - Return false and set @errp when failing to receive the current channel.
1419  */
1420 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1421 {
1422     MultiFDRecvParams *p;
1423     Error *local_err = NULL;
1424     int id;
1425
1426     id = multifd_recv_initial_packet(ioc, &local_err);
1427     if (id < 0) {
1428         multifd_recv_terminate_threads(local_err);
1429         error_propagate_prepend(errp, local_err,
1430                                 "failed to receive packet"
1431                                 " via multifd channel %d: ",
1432                                 atomic_read(&multifd_recv_state->count));
1433         return false;
1434     }
1435
1436     p = &multifd_recv_state->params[id];
1437     if (p->c != NULL) {
1438         error_setg(&local_err, "multifd: received id '%d' already setup'",
1439                    id);
1440         multifd_recv_terminate_threads(local_err);
1441         error_propagate(errp, local_err);
1442         return false;
1443     }
1444     p->c = ioc;
1445     object_ref(OBJECT(ioc));
1446     /* initial packet */
1447     p->num_packets = 1;
1448
1449     p->running = true;
1450     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1451                        QEMU_THREAD_JOINABLE);
1452     atomic_inc(&multifd_recv_state->count);
1453     return atomic_read(&multifd_recv_state->count) ==
1454            migrate_multifd_channels();
1455 }
1456
1457 /**
1458  * save_page_header: write page header to wire
1459  *
1460  * If this is the 1st block, it also writes the block identification
1461  *
1462  * Returns the number of bytes written
1463  *
1464  * @f: QEMUFile where to send the data
1465  * @block: block that contains the page we want to send
1466  * @offset: offset inside the block for the page
1467  *          in the lower bits, it contains flags
1468  */
1469 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1470                                ram_addr_t offset)
1471 {
1472     size_t size, len;
1473
1474     if (block == rs->last_sent_block) {
1475         offset |= RAM_SAVE_FLAG_CONTINUE;
1476     }
1477     qemu_put_be64(f, offset);
1478     size = 8;
1479
1480     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1481         len = strlen(block->idstr);
1482         qemu_put_byte(f, len);
1483         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1484         size += 1 + len;
1485         rs->last_sent_block = block;
1486     }
1487     return size;
1488 }
1489
1490 /**
1491  * mig_throttle_guest_down: throotle down the guest
1492  *
1493  * Reduce amount of guest cpu execution to hopefully slow down memory
1494  * writes. If guest dirty memory rate is reduced below the rate at
1495  * which we can transfer pages to the destination then we should be
1496  * able to complete migration. Some workloads dirty memory way too
1497  * fast and will not effectively converge, even with auto-converge.
1498  */
1499 static void mig_throttle_guest_down(void)
1500 {
1501     MigrationState *s = migrate_get_current();
1502     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1503     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1504     int pct_max = s->parameters.max_cpu_throttle;
1505
1506     /* We have not started throttling yet. Let's start it. */
1507     if (!cpu_throttle_active()) {
1508         cpu_throttle_set(pct_initial);
1509     } else {
1510         /* Throttling already on, just increase the rate */
1511         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1512                          pct_max));
1513     }
1514 }
1515
1516 /**
1517  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1518  *
1519  * @rs: current RAM state
1520  * @current_addr: address for the zero page
1521  *
1522  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1523  * The important thing is that a stale (not-yet-0'd) page be replaced
1524  * by the new data.
1525  * As a bonus, if the page wasn't in the cache it gets added so that
1526  * when a small write is made into the 0'd page it gets XBZRLE sent.
1527  */
1528 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1529 {
1530     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1531         return;
1532     }
1533
1534     /* We don't care if this fails to allocate a new cache page
1535      * as long as it updated an old one */
1536     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1537                  ram_counters.dirty_sync_count);
1538 }
1539
1540 #define ENCODING_FLAG_XBZRLE 0x1
1541
1542 /**
1543  * save_xbzrle_page: compress and send current page
1544  *
1545  * Returns: 1 means that we wrote the page
1546  *          0 means that page is identical to the one already sent
1547  *          -1 means that xbzrle would be longer than normal
1548  *
1549  * @rs: current RAM state
1550  * @current_data: pointer to the address of the page contents
1551  * @current_addr: addr of the page
1552  * @block: block that contains the page we want to send
1553  * @offset: offset inside the block for the page
1554  * @last_stage: if we are at the completion stage
1555  */
1556 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1557                             ram_addr_t current_addr, RAMBlock *block,
1558                             ram_addr_t offset, bool last_stage)
1559 {
1560     int encoded_len = 0, bytes_xbzrle;
1561     uint8_t *prev_cached_page;
1562
1563     if (!cache_is_cached(XBZRLE.cache, current_addr,
1564                          ram_counters.dirty_sync_count)) {
1565         xbzrle_counters.cache_miss++;
1566         if (!last_stage) {
1567             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1568                              ram_counters.dirty_sync_count) == -1) {
1569                 return -1;
1570             } else {
1571                 /* update *current_data when the page has been
1572                    inserted into cache */
1573                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1574             }
1575         }
1576         return -1;
1577     }
1578
1579     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1580
1581     /* save current buffer into memory */
1582     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1583
1584     /* XBZRLE encoding (if there is no overflow) */
1585     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1586                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1587                                        TARGET_PAGE_SIZE);
1588
1589     /*
1590      * Update the cache contents, so that it corresponds to the data
1591      * sent, in all cases except where we skip the page.
1592      */
1593     if (!last_stage && encoded_len != 0) {
1594         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1595         /*
1596          * In the case where we couldn't compress, ensure that the caller
1597          * sends the data from the cache, since the guest might have
1598          * changed the RAM since we copied it.
1599          */
1600         *current_data = prev_cached_page;
1601     }
1602
1603     if (encoded_len == 0) {
1604         trace_save_xbzrle_page_skipping();
1605         return 0;
1606     } else if (encoded_len == -1) {
1607         trace_save_xbzrle_page_overflow();
1608         xbzrle_counters.overflow++;
1609         return -1;
1610     }
1611
1612     /* Send XBZRLE based compressed page */
1613     bytes_xbzrle = save_page_header(rs, rs->f, block,
1614                                     offset | RAM_SAVE_FLAG_XBZRLE);
1615     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1616     qemu_put_be16(rs->f, encoded_len);
1617     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1618     bytes_xbzrle += encoded_len + 1 + 2;
1619     xbzrle_counters.pages++;
1620     xbzrle_counters.bytes += bytes_xbzrle;
1621     ram_counters.transferred += bytes_xbzrle;
1622
1623     return 1;
1624 }
1625
1626 /**
1627  * migration_bitmap_find_dirty: find the next dirty page from start
1628  *
1629  * Returns the page offset within memory region of the start of a dirty page
1630  *
1631  * @rs: current RAM state
1632  * @rb: RAMBlock where to search for dirty pages
1633  * @start: page where we start the search
1634  */
1635 static inline
1636 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1637                                           unsigned long start)
1638 {
1639     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1640     unsigned long *bitmap = rb->bmap;
1641     unsigned long next;
1642
1643     if (ramblock_is_ignored(rb)) {
1644         return size;
1645     }
1646
1647     /*
1648      * When the free page optimization is enabled, we need to check the bitmap
1649      * to send the non-free pages rather than all the pages in the bulk stage.
1650      */
1651     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1652         next = start + 1;
1653     } else {
1654         next = find_next_bit(bitmap, size, start);
1655     }
1656
1657     return next;
1658 }
1659
1660 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1661                                                 RAMBlock *rb,
1662                                                 unsigned long page)
1663 {
1664     bool ret;
1665
1666     qemu_mutex_lock(&rs->bitmap_mutex);
1667
1668     /*
1669      * Clear dirty bitmap if needed.  This _must_ be called before we
1670      * send any of the page in the chunk because we need to make sure
1671      * we can capture further page content changes when we sync dirty
1672      * log the next time.  So as long as we are going to send any of
1673      * the page in the chunk we clear the remote dirty bitmap for all.
1674      * Clearing it earlier won't be a problem, but too late will.
1675      */
1676     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1677         uint8_t shift = rb->clear_bmap_shift;
1678         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1679         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1680
1681         /*
1682          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1683          * can make things easier sometimes since then start address
1684          * of the small chunk will always be 64 pages aligned so the
1685          * bitmap will always be aligned to unsigned long.  We should
1686          * even be able to remove this restriction but I'm simply
1687          * keeping it.
1688          */
1689         assert(shift >= 6);
1690         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1691         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1692     }
1693
1694     ret = test_and_clear_bit(page, rb->bmap);
1695
1696     if (ret) {
1697         rs->migration_dirty_pages--;
1698     }
1699     qemu_mutex_unlock(&rs->bitmap_mutex);
1700
1701     return ret;
1702 }
1703
1704 /* Called with RCU critical section */
1705 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1706                                         ram_addr_t length)
1707 {
1708     rs->migration_dirty_pages +=
1709         cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
1710                                               &rs->num_dirty_pages_period);
1711 }
1712
1713 /**
1714  * ram_pagesize_summary: calculate all the pagesizes of a VM
1715  *
1716  * Returns a summary bitmap of the page sizes of all RAMBlocks
1717  *
1718  * For VMs with just normal pages this is equivalent to the host page
1719  * size. If it's got some huge pages then it's the OR of all the
1720  * different page sizes.
1721  */
1722 uint64_t ram_pagesize_summary(void)
1723 {
1724     RAMBlock *block;
1725     uint64_t summary = 0;
1726
1727     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1728         summary |= block->page_size;
1729     }
1730
1731     return summary;
1732 }
1733
1734 uint64_t ram_get_total_transferred_pages(void)
1735 {
1736     return  ram_counters.normal + ram_counters.duplicate +
1737                 compression_counters.pages + xbzrle_counters.pages;
1738 }
1739
1740 static void migration_update_rates(RAMState *rs, int64_t end_time)
1741 {
1742     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1743     double compressed_size;
1744
1745     /* calculate period counters */
1746     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1747                 / (end_time - rs->time_last_bitmap_sync);
1748
1749     if (!page_count) {
1750         return;
1751     }
1752
1753     if (migrate_use_xbzrle()) {
1754         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1755             rs->xbzrle_cache_miss_prev) / page_count;
1756         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1757     }
1758
1759     if (migrate_use_compression()) {
1760         compression_counters.busy_rate = (double)(compression_counters.busy -
1761             rs->compress_thread_busy_prev) / page_count;
1762         rs->compress_thread_busy_prev = compression_counters.busy;
1763
1764         compressed_size = compression_counters.compressed_size -
1765                           rs->compressed_size_prev;
1766         if (compressed_size) {
1767             double uncompressed_size = (compression_counters.pages -
1768                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1769
1770             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1771             compression_counters.compression_rate =
1772                                         uncompressed_size / compressed_size;
1773
1774             rs->compress_pages_prev = compression_counters.pages;
1775             rs->compressed_size_prev = compression_counters.compressed_size;
1776         }
1777     }
1778 }
1779
1780 static void migration_bitmap_sync(RAMState *rs)
1781 {
1782     RAMBlock *block;
1783     int64_t end_time;
1784     uint64_t bytes_xfer_now;
1785
1786     ram_counters.dirty_sync_count++;
1787
1788     if (!rs->time_last_bitmap_sync) {
1789         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1790     }
1791
1792     trace_migration_bitmap_sync_start();
1793     memory_global_dirty_log_sync();
1794
1795     qemu_mutex_lock(&rs->bitmap_mutex);
1796     rcu_read_lock();
1797     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1798         migration_bitmap_sync_range(rs, block, block->used_length);
1799     }
1800     ram_counters.remaining = ram_bytes_remaining();
1801     rcu_read_unlock();
1802     qemu_mutex_unlock(&rs->bitmap_mutex);
1803
1804     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1805
1806     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1807
1808     /* more than 1 second = 1000 millisecons */
1809     if (end_time > rs->time_last_bitmap_sync + 1000) {
1810         bytes_xfer_now = ram_counters.transferred;
1811
1812         /* During block migration the auto-converge logic incorrectly detects
1813          * that ram migration makes no progress. Avoid this by disabling the
1814          * throttling logic during the bulk phase of block migration. */
1815         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1816             /* The following detection logic can be refined later. For now:
1817                Check to see if the dirtied bytes is 50% more than the approx.
1818                amount of bytes that just got transferred since the last time we
1819                were in this routine. If that happens twice, start or increase
1820                throttling */
1821
1822             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1823                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1824                 (++rs->dirty_rate_high_cnt >= 2)) {
1825                     trace_migration_throttle();
1826                     rs->dirty_rate_high_cnt = 0;
1827                     mig_throttle_guest_down();
1828             }
1829         }
1830
1831         migration_update_rates(rs, end_time);
1832
1833         rs->target_page_count_prev = rs->target_page_count;
1834
1835         /* reset period counters */
1836         rs->time_last_bitmap_sync = end_time;
1837         rs->num_dirty_pages_period = 0;
1838         rs->bytes_xfer_prev = bytes_xfer_now;
1839     }
1840     if (migrate_use_events()) {
1841         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1842     }
1843 }
1844
1845 static void migration_bitmap_sync_precopy(RAMState *rs)
1846 {
1847     Error *local_err = NULL;
1848
1849     /*
1850      * The current notifier usage is just an optimization to migration, so we
1851      * don't stop the normal migration process in the error case.
1852      */
1853     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1854         error_report_err(local_err);
1855     }
1856
1857     migration_bitmap_sync(rs);
1858
1859     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1860         error_report_err(local_err);
1861     }
1862 }
1863
1864 /**
1865  * save_zero_page_to_file: send the zero page to the file
1866  *
1867  * Returns the size of data written to the file, 0 means the page is not
1868  * a zero page
1869  *
1870  * @rs: current RAM state
1871  * @file: the file where the data is saved
1872  * @block: block that contains the page we want to send
1873  * @offset: offset inside the block for the page
1874  */
1875 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1876                                   RAMBlock *block, ram_addr_t offset)
1877 {
1878     uint8_t *p = block->host + offset;
1879     int len = 0;
1880
1881     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1882         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1883         qemu_put_byte(file, 0);
1884         len += 1;
1885     }
1886     return len;
1887 }
1888
1889 /**
1890  * save_zero_page: send the zero page to the stream
1891  *
1892  * Returns the number of pages written.
1893  *
1894  * @rs: current RAM state
1895  * @block: block that contains the page we want to send
1896  * @offset: offset inside the block for the page
1897  */
1898 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1899 {
1900     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1901
1902     if (len) {
1903         ram_counters.duplicate++;
1904         ram_counters.transferred += len;
1905         return 1;
1906     }
1907     return -1;
1908 }
1909
1910 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1911 {
1912     if (!migrate_release_ram() || !migration_in_postcopy()) {
1913         return;
1914     }
1915
1916     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1917 }
1918
1919 /*
1920  * @pages: the number of pages written by the control path,
1921  *        < 0 - error
1922  *        > 0 - number of pages written
1923  *
1924  * Return true if the pages has been saved, otherwise false is returned.
1925  */
1926 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1927                               int *pages)
1928 {
1929     uint64_t bytes_xmit = 0;
1930     int ret;
1931
1932     *pages = -1;
1933     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1934                                 &bytes_xmit);
1935     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1936         return false;
1937     }
1938
1939     if (bytes_xmit) {
1940         ram_counters.transferred += bytes_xmit;
1941         *pages = 1;
1942     }
1943
1944     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1945         return true;
1946     }
1947
1948     if (bytes_xmit > 0) {
1949         ram_counters.normal++;
1950     } else if (bytes_xmit == 0) {
1951         ram_counters.duplicate++;
1952     }
1953
1954     return true;
1955 }
1956
1957 /*
1958  * directly send the page to the stream
1959  *
1960  * Returns the number of pages written.
1961  *
1962  * @rs: current RAM state
1963  * @block: block that contains the page we want to send
1964  * @offset: offset inside the block for the page
1965  * @buf: the page to be sent
1966  * @async: send to page asyncly
1967  */
1968 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1969                             uint8_t *buf, bool async)
1970 {
1971     ram_counters.transferred += save_page_header(rs, rs->f, block,
1972                                                  offset | RAM_SAVE_FLAG_PAGE);
1973     if (async) {
1974         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1975                               migrate_release_ram() &
1976                               migration_in_postcopy());
1977     } else {
1978         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1979     }
1980     ram_counters.transferred += TARGET_PAGE_SIZE;
1981     ram_counters.normal++;
1982     return 1;
1983 }
1984
1985 /**
1986  * ram_save_page: send the given page to the stream
1987  *
1988  * Returns the number of pages written.
1989  *          < 0 - error
1990  *          >=0 - Number of pages written - this might legally be 0
1991  *                if xbzrle noticed the page was the same.
1992  *
1993  * @rs: current RAM state
1994  * @block: block that contains the page we want to send
1995  * @offset: offset inside the block for the page
1996  * @last_stage: if we are at the completion stage
1997  */
1998 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1999 {
2000     int pages = -1;
2001     uint8_t *p;
2002     bool send_async = true;
2003     RAMBlock *block = pss->block;
2004     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2005     ram_addr_t current_addr = block->offset + offset;
2006
2007     p = block->host + offset;
2008     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2009
2010     XBZRLE_cache_lock();
2011     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2012         migrate_use_xbzrle()) {
2013         pages = save_xbzrle_page(rs, &p, current_addr, block,
2014                                  offset, last_stage);
2015         if (!last_stage) {
2016             /* Can't send this cached data async, since the cache page
2017              * might get updated before it gets to the wire
2018              */
2019             send_async = false;
2020         }
2021     }
2022
2023     /* XBZRLE overflow or normal page */
2024     if (pages == -1) {
2025         pages = save_normal_page(rs, block, offset, p, send_async);
2026     }
2027
2028     XBZRLE_cache_unlock();
2029
2030     return pages;
2031 }
2032
2033 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2034                                  ram_addr_t offset)
2035 {
2036     multifd_queue_page(block, offset);
2037     ram_counters.normal++;
2038
2039     return 1;
2040 }
2041
2042 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2043                                  ram_addr_t offset, uint8_t *source_buf)
2044 {
2045     RAMState *rs = ram_state;
2046     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2047     bool zero_page = false;
2048     int ret;
2049
2050     if (save_zero_page_to_file(rs, f, block, offset)) {
2051         zero_page = true;
2052         goto exit;
2053     }
2054
2055     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2056
2057     /*
2058      * copy it to a internal buffer to avoid it being modified by VM
2059      * so that we can catch up the error during compression and
2060      * decompression
2061      */
2062     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2063     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2064     if (ret < 0) {
2065         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2066         error_report("compressed data failed!");
2067         return false;
2068     }
2069
2070 exit:
2071     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2072     return zero_page;
2073 }
2074
2075 static void
2076 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2077 {
2078     ram_counters.transferred += bytes_xmit;
2079
2080     if (param->zero_page) {
2081         ram_counters.duplicate++;
2082         return;
2083     }
2084
2085     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2086     compression_counters.compressed_size += bytes_xmit - 8;
2087     compression_counters.pages++;
2088 }
2089
2090 static bool save_page_use_compression(RAMState *rs);
2091
2092 static void flush_compressed_data(RAMState *rs)
2093 {
2094     int idx, len, thread_count;
2095
2096     if (!save_page_use_compression(rs)) {
2097         return;
2098     }
2099     thread_count = migrate_compress_threads();
2100
2101     qemu_mutex_lock(&comp_done_lock);
2102     for (idx = 0; idx < thread_count; idx++) {
2103         while (!comp_param[idx].done) {
2104             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2105         }
2106     }
2107     qemu_mutex_unlock(&comp_done_lock);
2108
2109     for (idx = 0; idx < thread_count; idx++) {
2110         qemu_mutex_lock(&comp_param[idx].mutex);
2111         if (!comp_param[idx].quit) {
2112             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2113             /*
2114              * it's safe to fetch zero_page without holding comp_done_lock
2115              * as there is no further request submitted to the thread,
2116              * i.e, the thread should be waiting for a request at this point.
2117              */
2118             update_compress_thread_counts(&comp_param[idx], len);
2119         }
2120         qemu_mutex_unlock(&comp_param[idx].mutex);
2121     }
2122 }
2123
2124 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2125                                        ram_addr_t offset)
2126 {
2127     param->block = block;
2128     param->offset = offset;
2129 }
2130
2131 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2132                                            ram_addr_t offset)
2133 {
2134     int idx, thread_count, bytes_xmit = -1, pages = -1;
2135     bool wait = migrate_compress_wait_thread();
2136
2137     thread_count = migrate_compress_threads();
2138     qemu_mutex_lock(&comp_done_lock);
2139 retry:
2140     for (idx = 0; idx < thread_count; idx++) {
2141         if (comp_param[idx].done) {
2142             comp_param[idx].done = false;
2143             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2144             qemu_mutex_lock(&comp_param[idx].mutex);
2145             set_compress_params(&comp_param[idx], block, offset);
2146             qemu_cond_signal(&comp_param[idx].cond);
2147             qemu_mutex_unlock(&comp_param[idx].mutex);
2148             pages = 1;
2149             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2150             break;
2151         }
2152     }
2153
2154     /*
2155      * wait for the free thread if the user specifies 'compress-wait-thread',
2156      * otherwise we will post the page out in the main thread as normal page.
2157      */
2158     if (pages < 0 && wait) {
2159         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2160         goto retry;
2161     }
2162     qemu_mutex_unlock(&comp_done_lock);
2163
2164     return pages;
2165 }
2166
2167 /**
2168  * find_dirty_block: find the next dirty page and update any state
2169  * associated with the search process.
2170  *
2171  * Returns true if a page is found
2172  *
2173  * @rs: current RAM state
2174  * @pss: data about the state of the current dirty page scan
2175  * @again: set to false if the search has scanned the whole of RAM
2176  */
2177 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2178 {
2179     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2180     if (pss->complete_round && pss->block == rs->last_seen_block &&
2181         pss->page >= rs->last_page) {
2182         /*
2183          * We've been once around the RAM and haven't found anything.
2184          * Give up.
2185          */
2186         *again = false;
2187         return false;
2188     }
2189     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2190         /* Didn't find anything in this RAM Block */
2191         pss->page = 0;
2192         pss->block = QLIST_NEXT_RCU(pss->block, next);
2193         if (!pss->block) {
2194             /*
2195              * If memory migration starts over, we will meet a dirtied page
2196              * which may still exists in compression threads's ring, so we
2197              * should flush the compressed data to make sure the new page
2198              * is not overwritten by the old one in the destination.
2199              *
2200              * Also If xbzrle is on, stop using the data compression at this
2201              * point. In theory, xbzrle can do better than compression.
2202              */
2203             flush_compressed_data(rs);
2204
2205             /* Hit the end of the list */
2206             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2207             /* Flag that we've looped */
2208             pss->complete_round = true;
2209             rs->ram_bulk_stage = false;
2210         }
2211         /* Didn't find anything this time, but try again on the new block */
2212         *again = true;
2213         return false;
2214     } else {
2215         /* Can go around again, but... */
2216         *again = true;
2217         /* We've found something so probably don't need to */
2218         return true;
2219     }
2220 }
2221
2222 /**
2223  * unqueue_page: gets a page of the queue
2224  *
2225  * Helper for 'get_queued_page' - gets a page off the queue
2226  *
2227  * Returns the block of the page (or NULL if none available)
2228  *
2229  * @rs: current RAM state
2230  * @offset: used to return the offset within the RAMBlock
2231  */
2232 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2233 {
2234     RAMBlock *block = NULL;
2235
2236     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2237         return NULL;
2238     }
2239
2240     qemu_mutex_lock(&rs->src_page_req_mutex);
2241     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2242         struct RAMSrcPageRequest *entry =
2243                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2244         block = entry->rb;
2245         *offset = entry->offset;
2246
2247         if (entry->len > TARGET_PAGE_SIZE) {
2248             entry->len -= TARGET_PAGE_SIZE;
2249             entry->offset += TARGET_PAGE_SIZE;
2250         } else {
2251             memory_region_unref(block->mr);
2252             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2253             g_free(entry);
2254             migration_consume_urgent_request();
2255         }
2256     }
2257     qemu_mutex_unlock(&rs->src_page_req_mutex);
2258
2259     return block;
2260 }
2261
2262 /**
2263  * get_queued_page: unqueue a page from the postcopy requests
2264  *
2265  * Skips pages that are already sent (!dirty)
2266  *
2267  * Returns true if a queued page is found
2268  *
2269  * @rs: current RAM state
2270  * @pss: data about the state of the current dirty page scan
2271  */
2272 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2273 {
2274     RAMBlock  *block;
2275     ram_addr_t offset;
2276     bool dirty;
2277
2278     do {
2279         block = unqueue_page(rs, &offset);
2280         /*
2281          * We're sending this page, and since it's postcopy nothing else
2282          * will dirty it, and we must make sure it doesn't get sent again
2283          * even if this queue request was received after the background
2284          * search already sent it.
2285          */
2286         if (block) {
2287             unsigned long page;
2288
2289             page = offset >> TARGET_PAGE_BITS;
2290             dirty = test_bit(page, block->bmap);
2291             if (!dirty) {
2292                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2293                        page, test_bit(page, block->unsentmap));
2294             } else {
2295                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2296             }
2297         }
2298
2299     } while (block && !dirty);
2300
2301     if (block) {
2302         /*
2303          * As soon as we start servicing pages out of order, then we have
2304          * to kill the bulk stage, since the bulk stage assumes
2305          * in (migration_bitmap_find_and_reset_dirty) that every page is
2306          * dirty, that's no longer true.
2307          */
2308         rs->ram_bulk_stage = false;
2309
2310         /*
2311          * We want the background search to continue from the queued page
2312          * since the guest is likely to want other pages near to the page
2313          * it just requested.
2314          */
2315         pss->block = block;
2316         pss->page = offset >> TARGET_PAGE_BITS;
2317
2318         /*
2319          * This unqueued page would break the "one round" check, even is
2320          * really rare.
2321          */
2322         pss->complete_round = false;
2323     }
2324
2325     return !!block;
2326 }
2327
2328 /**
2329  * migration_page_queue_free: drop any remaining pages in the ram
2330  * request queue
2331  *
2332  * It should be empty at the end anyway, but in error cases there may
2333  * be some left.  in case that there is any page left, we drop it.
2334  *
2335  */
2336 static void migration_page_queue_free(RAMState *rs)
2337 {
2338     struct RAMSrcPageRequest *mspr, *next_mspr;
2339     /* This queue generally should be empty - but in the case of a failed
2340      * migration might have some droppings in.
2341      */
2342     rcu_read_lock();
2343     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2344         memory_region_unref(mspr->rb->mr);
2345         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2346         g_free(mspr);
2347     }
2348     rcu_read_unlock();
2349 }
2350
2351 /**
2352  * ram_save_queue_pages: queue the page for transmission
2353  *
2354  * A request from postcopy destination for example.
2355  *
2356  * Returns zero on success or negative on error
2357  *
2358  * @rbname: Name of the RAMBLock of the request. NULL means the
2359  *          same that last one.
2360  * @start: starting address from the start of the RAMBlock
2361  * @len: length (in bytes) to send
2362  */
2363 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2364 {
2365     RAMBlock *ramblock;
2366     RAMState *rs = ram_state;
2367
2368     ram_counters.postcopy_requests++;
2369     rcu_read_lock();
2370     if (!rbname) {
2371         /* Reuse last RAMBlock */
2372         ramblock = rs->last_req_rb;
2373
2374         if (!ramblock) {
2375             /*
2376              * Shouldn't happen, we can't reuse the last RAMBlock if
2377              * it's the 1st request.
2378              */
2379             error_report("ram_save_queue_pages no previous block");
2380             goto err;
2381         }
2382     } else {
2383         ramblock = qemu_ram_block_by_name(rbname);
2384
2385         if (!ramblock) {
2386             /* We shouldn't be asked for a non-existent RAMBlock */
2387             error_report("ram_save_queue_pages no block '%s'", rbname);
2388             goto err;
2389         }
2390         rs->last_req_rb = ramblock;
2391     }
2392     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2393     if (start+len > ramblock->used_length) {
2394         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2395                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2396                      __func__, start, len, ramblock->used_length);
2397         goto err;
2398     }
2399
2400     struct RAMSrcPageRequest *new_entry =
2401         g_malloc0(sizeof(struct RAMSrcPageRequest));
2402     new_entry->rb = ramblock;
2403     new_entry->offset = start;
2404     new_entry->len = len;
2405
2406     memory_region_ref(ramblock->mr);
2407     qemu_mutex_lock(&rs->src_page_req_mutex);
2408     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2409     migration_make_urgent_request();
2410     qemu_mutex_unlock(&rs->src_page_req_mutex);
2411     rcu_read_unlock();
2412
2413     return 0;
2414
2415 err:
2416     rcu_read_unlock();
2417     return -1;
2418 }
2419
2420 static bool save_page_use_compression(RAMState *rs)
2421 {
2422     if (!migrate_use_compression()) {
2423         return false;
2424     }
2425
2426     /*
2427      * If xbzrle is on, stop using the data compression after first
2428      * round of migration even if compression is enabled. In theory,
2429      * xbzrle can do better than compression.
2430      */
2431     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2432         return true;
2433     }
2434
2435     return false;
2436 }
2437
2438 /*
2439  * try to compress the page before posting it out, return true if the page
2440  * has been properly handled by compression, otherwise needs other
2441  * paths to handle it
2442  */
2443 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2444 {
2445     if (!save_page_use_compression(rs)) {
2446         return false;
2447     }
2448
2449     /*
2450      * When starting the process of a new block, the first page of
2451      * the block should be sent out before other pages in the same
2452      * block, and all the pages in last block should have been sent
2453      * out, keeping this order is important, because the 'cont' flag
2454      * is used to avoid resending the block name.
2455      *
2456      * We post the fist page as normal page as compression will take
2457      * much CPU resource.
2458      */
2459     if (block != rs->last_sent_block) {
2460         flush_compressed_data(rs);
2461         return false;
2462     }
2463
2464     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2465         return true;
2466     }
2467
2468     compression_counters.busy++;
2469     return false;
2470 }
2471
2472 /**
2473  * ram_save_target_page: save one target page
2474  *
2475  * Returns the number of pages written
2476  *
2477  * @rs: current RAM state
2478  * @pss: data about the page we want to send
2479  * @last_stage: if we are at the completion stage
2480  */
2481 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2482                                 bool last_stage)
2483 {
2484     RAMBlock *block = pss->block;
2485     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2486     int res;
2487
2488     if (control_save_page(rs, block, offset, &res)) {
2489         return res;
2490     }
2491
2492     if (save_compress_page(rs, block, offset)) {
2493         return 1;
2494     }
2495
2496     res = save_zero_page(rs, block, offset);
2497     if (res > 0) {
2498         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2499          * page would be stale
2500          */
2501         if (!save_page_use_compression(rs)) {
2502             XBZRLE_cache_lock();
2503             xbzrle_cache_zero_page(rs, block->offset + offset);
2504             XBZRLE_cache_unlock();
2505         }
2506         ram_release_pages(block->idstr, offset, res);
2507         return res;
2508     }
2509
2510     /*
2511      * do not use multifd for compression as the first page in the new
2512      * block should be posted out before sending the compressed page
2513      */
2514     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2515         return ram_save_multifd_page(rs, block, offset);
2516     }
2517
2518     return ram_save_page(rs, pss, last_stage);
2519 }
2520
2521 /**
2522  * ram_save_host_page: save a whole host page
2523  *
2524  * Starting at *offset send pages up to the end of the current host
2525  * page. It's valid for the initial offset to point into the middle of
2526  * a host page in which case the remainder of the hostpage is sent.
2527  * Only dirty target pages are sent. Note that the host page size may
2528  * be a huge page for this block.
2529  * The saving stops at the boundary of the used_length of the block
2530  * if the RAMBlock isn't a multiple of the host page size.
2531  *
2532  * Returns the number of pages written or negative on error
2533  *
2534  * @rs: current RAM state
2535  * @ms: current migration state
2536  * @pss: data about the page we want to send
2537  * @last_stage: if we are at the completion stage
2538  */
2539 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2540                               bool last_stage)
2541 {
2542     int tmppages, pages = 0;
2543     size_t pagesize_bits =
2544         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2545
2546     if (ramblock_is_ignored(pss->block)) {
2547         error_report("block %s should not be migrated !", pss->block->idstr);
2548         return 0;
2549     }
2550
2551     do {
2552         /* Check the pages is dirty and if it is send it */
2553         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2554             pss->page++;
2555             continue;
2556         }
2557
2558         tmppages = ram_save_target_page(rs, pss, last_stage);
2559         if (tmppages < 0) {
2560             return tmppages;
2561         }
2562
2563         pages += tmppages;
2564         if (pss->block->unsentmap) {
2565             clear_bit(pss->page, pss->block->unsentmap);
2566         }
2567
2568         pss->page++;
2569     } while ((pss->page & (pagesize_bits - 1)) &&
2570              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2571
2572     /* The offset we leave with is the last one we looked at */
2573     pss->page--;
2574     return pages;
2575 }
2576
2577 /**
2578  * ram_find_and_save_block: finds a dirty page and sends it to f
2579  *
2580  * Called within an RCU critical section.
2581  *
2582  * Returns the number of pages written where zero means no dirty pages,
2583  * or negative on error
2584  *
2585  * @rs: current RAM state
2586  * @last_stage: if we are at the completion stage
2587  *
2588  * On systems where host-page-size > target-page-size it will send all the
2589  * pages in a host page that are dirty.
2590  */
2591
2592 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2593 {
2594     PageSearchStatus pss;
2595     int pages = 0;
2596     bool again, found;
2597
2598     /* No dirty page as there is zero RAM */
2599     if (!ram_bytes_total()) {
2600         return pages;
2601     }
2602
2603     pss.block = rs->last_seen_block;
2604     pss.page = rs->last_page;
2605     pss.complete_round = false;
2606
2607     if (!pss.block) {
2608         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2609     }
2610
2611     do {
2612         again = true;
2613         found = get_queued_page(rs, &pss);
2614
2615         if (!found) {
2616             /* priority queue empty, so just search for something dirty */
2617             found = find_dirty_block(rs, &pss, &again);
2618         }
2619
2620         if (found) {
2621             pages = ram_save_host_page(rs, &pss, last_stage);
2622         }
2623     } while (!pages && again);
2624
2625     rs->last_seen_block = pss.block;
2626     rs->last_page = pss.page;
2627
2628     return pages;
2629 }
2630
2631 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2632 {
2633     uint64_t pages = size / TARGET_PAGE_SIZE;
2634
2635     if (zero) {
2636         ram_counters.duplicate += pages;
2637     } else {
2638         ram_counters.normal += pages;
2639         ram_counters.transferred += size;
2640         qemu_update_position(f, size);
2641     }
2642 }
2643
2644 static uint64_t ram_bytes_total_common(bool count_ignored)
2645 {
2646     RAMBlock *block;
2647     uint64_t total = 0;
2648
2649     rcu_read_lock();
2650     if (count_ignored) {
2651         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2652             total += block->used_length;
2653         }
2654     } else {
2655         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2656             total += block->used_length;
2657         }
2658     }
2659     rcu_read_unlock();
2660     return total;
2661 }
2662
2663 uint64_t ram_bytes_total(void)
2664 {
2665     return ram_bytes_total_common(false);
2666 }
2667
2668 static void xbzrle_load_setup(void)
2669 {
2670     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2671 }
2672
2673 static void xbzrle_load_cleanup(void)
2674 {
2675     g_free(XBZRLE.decoded_buf);
2676     XBZRLE.decoded_buf = NULL;
2677 }
2678
2679 static void ram_state_cleanup(RAMState **rsp)
2680 {
2681     if (*rsp) {
2682         migration_page_queue_free(*rsp);
2683         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2684         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2685         g_free(*rsp);
2686         *rsp = NULL;
2687     }
2688 }
2689
2690 static void xbzrle_cleanup(void)
2691 {
2692     XBZRLE_cache_lock();
2693     if (XBZRLE.cache) {
2694         cache_fini(XBZRLE.cache);
2695         g_free(XBZRLE.encoded_buf);
2696         g_free(XBZRLE.current_buf);
2697         g_free(XBZRLE.zero_target_page);
2698         XBZRLE.cache = NULL;
2699         XBZRLE.encoded_buf = NULL;
2700         XBZRLE.current_buf = NULL;
2701         XBZRLE.zero_target_page = NULL;
2702     }
2703     XBZRLE_cache_unlock();
2704 }
2705
2706 static void ram_save_cleanup(void *opaque)
2707 {
2708     RAMState **rsp = opaque;
2709     RAMBlock *block;
2710
2711     /* caller have hold iothread lock or is in a bh, so there is
2712      * no writing race against the migration bitmap
2713      */
2714     memory_global_dirty_log_stop();
2715
2716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717         g_free(block->clear_bmap);
2718         block->clear_bmap = NULL;
2719         g_free(block->bmap);
2720         block->bmap = NULL;
2721         g_free(block->unsentmap);
2722         block->unsentmap = NULL;
2723     }
2724
2725     xbzrle_cleanup();
2726     compress_threads_save_cleanup();
2727     ram_state_cleanup(rsp);
2728 }
2729
2730 static void ram_state_reset(RAMState *rs)
2731 {
2732     rs->last_seen_block = NULL;
2733     rs->last_sent_block = NULL;
2734     rs->last_page = 0;
2735     rs->last_version = ram_list.version;
2736     rs->ram_bulk_stage = true;
2737     rs->fpo_enabled = false;
2738 }
2739
2740 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2741
/*
 * Print a bitmap to stderr, up to 128 bits per line, using '1' for a set
 * bit and '.' for a clear one.  Every printed line is prefixed with the
 * index of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'todump' is the bitmap to dump and 'pages' the number of bits in it.
 * Nothing is printed when 'todump' is NULL.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* There is no default bitmap to fall back to; don't dereference NULL */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen here, so this terminates the line */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2775
2776 /* **** functions for postcopy ***** */
2777
2778 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2779 {
2780     struct RAMBlock *block;
2781
2782     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2783         unsigned long *bitmap = block->bmap;
2784         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2785         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2786
2787         while (run_start < range) {
2788             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2789             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2790                               (run_end - run_start) << TARGET_PAGE_BITS);
2791             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2792         }
2793     }
2794 }
2795
2796 /**
2797  * postcopy_send_discard_bm_ram: discard a RAMBlock
2798  *
2799  * Returns zero on success
2800  *
2801  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2802  * Note: At this point the 'unsentmap' is the processed bitmap combined
2803  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2804  *
2805  * @ms: current migration state
2806  * @pds: state for postcopy
2807  * @block: RAMBlock to discard
2808  */
2809 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2810                                         PostcopyDiscardState *pds,
2811                                         RAMBlock *block)
2812 {
2813     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2814     unsigned long current;
2815     unsigned long *unsentmap = block->unsentmap;
2816
2817     for (current = 0; current < end; ) {
2818         unsigned long one = find_next_bit(unsentmap, end, current);
2819
2820         if (one <= end) {
2821             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2822             unsigned long discard_length;
2823
2824             if (zero >= end) {
2825                 discard_length = end - one;
2826             } else {
2827                 discard_length = zero - one;
2828             }
2829             if (discard_length) {
2830                 postcopy_discard_send_range(ms, pds, one, discard_length);
2831             }
2832             current = one + discard_length;
2833         } else {
2834             current = one;
2835         }
2836     }
2837
2838     return 0;
2839 }
2840
2841 /**
2842  * postcopy_each_ram_send_discard: discard all RAMBlocks
2843  *
2844  * Returns 0 for success or negative for error
2845  *
2846  * Utility for the outgoing postcopy code.
2847  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2848  *   passing it bitmap indexes and name.
2849  * (qemu_ram_foreach_block ends up passing unscaled lengths
2850  *  which would mean postcopy code would have to deal with target page)
2851  *
2852  * @ms: current migration state
2853  */
2854 static int postcopy_each_ram_send_discard(MigrationState *ms)
2855 {
2856     struct RAMBlock *block;
2857     int ret;
2858
2859     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2860         PostcopyDiscardState *pds =
2861             postcopy_discard_send_init(ms, block->idstr);
2862
2863         /*
2864          * Postcopy sends chunks of bitmap over the wire, but it
2865          * just needs indexes at this point, avoids it having
2866          * target page specific code.
2867          */
2868         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2869         postcopy_discard_send_finish(ms, pds);
2870         if (ret) {
2871             return ret;
2872         }
2873     }
2874
2875     return 0;
2876 }
2877
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * The pass walks runs of "sent" pages (unsent_pass) or "dirty" pages
 * (!unsent_pass).  Whenever a run starts or ends in the middle of a host
 * page, that whole host page is marked unsent and dirty, and (where
 * needed) a discard for it is sent to the destination.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    /* Number of target pages in the block */
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* Only meaningful when do_fixup is true */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* run_start already points past everything handled above */
        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
3004
3005 /**
3006  * postcopy_chunk_hostpages: discard any partially sent host page
3007  *
3008  * Utility for the outgoing postcopy code.
3009  *
3010  * Discard any partially sent host-page size chunks, mark any partially
3011  * dirty host-page size chunks as all dirty.  In this case the host-page
3012  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3013  *
3014  * Returns zero on success
3015  *
3016  * @ms: current migration state
3017  * @block: block we want to work with
3018  */
3019 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3020 {
3021     PostcopyDiscardState *pds =
3022         postcopy_discard_send_init(ms, block->idstr);
3023
3024     /* First pass: Discard all partially sent host pages */
3025     postcopy_chunk_hostpages_pass(ms, true, block, pds);
3026     /*
3027      * Second pass: Ensure that all partially dirty host pages are made
3028      * fully dirty.
3029      */
3030     postcopy_chunk_hostpages_pass(ms, false, block, pds);
3031
3032     postcopy_discard_send_finish(ms, pds);
3033     return 0;
3034 }
3035
3036 /**
3037  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3038  *
3039  * Returns zero on success
3040  *
3041  * Transmit the set of pages to be discarded after precopy to the target
3042  * these are pages that:
3043  *     a) Have been previously transmitted but are now dirty again
3044  *     b) Pages that have never been transmitted, this ensures that
3045  *        any pages on the destination that have been mapped by background
3046  *        tasks get discarded (transparent huge pages is the specific concern)
3047  * Hopefully this is pretty sparse
3048  *
3049  * @ms: current migration state
3050  */
3051 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3052 {
3053     RAMState *rs = ram_state;
3054     RAMBlock *block;
3055     int ret;
3056
3057     rcu_read_lock();
3058
3059     /* This should be our last sync, the src is now paused */
3060     migration_bitmap_sync(rs);
3061
3062     /* Easiest way to make sure we don't resume in the middle of a host-page */
3063     rs->last_seen_block = NULL;
3064     rs->last_sent_block = NULL;
3065     rs->last_page = 0;
3066
3067     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3068         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3069         unsigned long *bitmap = block->bmap;
3070         unsigned long *unsentmap = block->unsentmap;
3071
3072         if (!unsentmap) {
3073             /* We don't have a safe way to resize the sentmap, so
3074              * if the bitmap was resized it will be NULL at this
3075              * point.
3076              */
3077             error_report("migration ram resized during precopy phase");
3078             rcu_read_unlock();
3079             return -EINVAL;
3080         }
3081         /* Deal with TPS != HPS and huge pages */
3082         ret = postcopy_chunk_hostpages(ms, block);
3083         if (ret) {
3084             rcu_read_unlock();
3085             return ret;
3086         }
3087
3088         /*
3089          * Update the unsentmap to be unsentmap = unsentmap | dirty
3090          */
3091         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3092 #ifdef DEBUG_POSTCOPY
3093         ram_debug_dump_bitmap(unsentmap, true, pages);
3094 #endif
3095     }
3096     trace_ram_postcopy_send_discard_bitmap();
3097
3098     ret = postcopy_each_ram_send_discard(ms);
3099     rcu_read_unlock();
3100
3101     return ret;
3102 }
3103
3104 /**
3105  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3106  *
3107  * Returns zero on success
3108  *
3109  * @rbname: name of the RAMBlock of the request. NULL means the
3110  *          same that last one.
3111  * @start: RAMBlock starting page
3112  * @length: RAMBlock size
3113  */
3114 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3115 {
3116     int ret = -1;
3117
3118     trace_ram_discard_range(rbname, start, length);
3119
3120     rcu_read_lock();
3121     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3122
3123     if (!rb) {
3124         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3125         goto err;
3126     }
3127
3128     /*
3129      * On source VM, we don't need to update the received bitmap since
3130      * we don't even have one.
3131      */
3132     if (rb->receivedmap) {
3133         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3134                      length >> qemu_target_page_bits());
3135     }
3136
3137     ret = ram_block_discard_range(rb, start, length);
3138
3139 err:
3140     rcu_read_unlock();
3141
3142     return ret;
3143 }
3144
3145 /*
3146  * For every allocation, we will try not to crash the VM if the
3147  * allocation failed.
3148  */
3149 static int xbzrle_init(void)
3150 {
3151     Error *local_err = NULL;
3152
3153     if (!migrate_use_xbzrle()) {
3154         return 0;
3155     }
3156
3157     XBZRLE_cache_lock();
3158
3159     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3160     if (!XBZRLE.zero_target_page) {
3161         error_report("%s: Error allocating zero page", __func__);
3162         goto err_out;
3163     }
3164
3165     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3166                               TARGET_PAGE_SIZE, &local_err);
3167     if (!XBZRLE.cache) {
3168         error_report_err(local_err);
3169         goto free_zero_page;
3170     }
3171
3172     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3173     if (!XBZRLE.encoded_buf) {
3174         error_report("%s: Error allocating encoded_buf", __func__);
3175         goto free_cache;
3176     }
3177
3178     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3179     if (!XBZRLE.current_buf) {
3180         error_report("%s: Error allocating current_buf", __func__);
3181         goto free_encoded_buf;
3182     }
3183
3184     /* We are all good */
3185     XBZRLE_cache_unlock();
3186     return 0;
3187
3188 free_encoded_buf:
3189     g_free(XBZRLE.encoded_buf);
3190     XBZRLE.encoded_buf = NULL;
3191 free_cache:
3192     cache_fini(XBZRLE.cache);
3193     XBZRLE.cache = NULL;
3194 free_zero_page:
3195     g_free(XBZRLE.zero_target_page);
3196     XBZRLE.zero_target_page = NULL;
3197 err_out:
3198     XBZRLE_cache_unlock();
3199     return -ENOMEM;
3200 }
3201
3202 static int ram_state_init(RAMState **rsp)
3203 {
3204     *rsp = g_try_new0(RAMState, 1);
3205
3206     if (!*rsp) {
3207         error_report("%s: Init ramstate fail", __func__);
3208         return -1;
3209     }
3210
3211     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3212     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3213     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3214
3215     /*
3216      * Count the total number of pages used by ram blocks not including any
3217      * gaps due to alignment or unplugs.
3218      * This must match with the initial values of dirty bitmap.
3219      */
3220     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3221     ram_state_reset(*rsp);
3222
3223     return 0;
3224 }
3225
3226 static void ram_list_init_bitmaps(void)
3227 {
3228     MigrationState *ms = migrate_get_current();
3229     RAMBlock *block;
3230     unsigned long pages;
3231     uint8_t shift;
3232
3233     /* Skip setting bitmap if there is no RAM */
3234     if (ram_bytes_total()) {
3235         shift = ms->clear_bitmap_shift;
3236         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3237             error_report("clear_bitmap_shift (%u) too big, using "
3238                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3239             shift = CLEAR_BITMAP_SHIFT_MAX;
3240         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3241             error_report("clear_bitmap_shift (%u) too small, using "
3242                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3243             shift = CLEAR_BITMAP_SHIFT_MIN;
3244         }
3245
3246         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3247             pages = block->max_length >> TARGET_PAGE_BITS;
3248             /*
3249              * The initial dirty bitmap for migration must be set with all
3250              * ones to make sure we'll migrate every guest RAM page to
3251              * destination.
3252              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3253              * new migration after a failed migration, ram_list.
3254              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3255              * guest memory.
3256              */
3257             block->bmap = bitmap_new(pages);
3258             bitmap_set(block->bmap, 0, pages);
3259             block->clear_bmap_shift = shift;
3260             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3261             if (migrate_postcopy_ram()) {
3262                 block->unsentmap = bitmap_new(pages);
3263                 bitmap_set(block->unsentmap, 0, pages);
3264             }
3265         }
3266     }
3267 }
3268
/*
 * ram_init_bitmaps: create the migration bitmaps and start dirty logging
 *
 * Takes the iothread lock, the ramlist lock and an RCU read-side critical
 * section (in that order), then allocates the per-RAMBlock bitmaps,
 * starts global dirty logging and does a first precopy bitmap sync.
 * The locks are dropped again in the reverse order.
 *
 * @rs: RAM migration state handed to the bitmap sync
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3284
3285 static int ram_init_all(RAMState **rsp)
3286 {
3287     if (ram_state_init(rsp)) {
3288         return -1;
3289     }
3290
3291     if (xbzrle_init()) {
3292         ram_state_cleanup(rsp);
3293         return -1;
3294     }
3295
3296     ram_init_bitmaps(*rsp);
3297
3298     return 0;
3299 }
3300
3301 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3302 {
3303     RAMBlock *block;
3304     uint64_t pages = 0;
3305
3306     /*
3307      * Postcopy is not using xbzrle/compression, so no need for that.
3308      * Also, since source are already halted, we don't need to care
3309      * about dirty page logging as well.
3310      */
3311
3312     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3313         pages += bitmap_count_one(block->bmap,
3314                                   block->used_length >> TARGET_PAGE_BITS);
3315     }
3316
3317     /* This may not be aligned with current bitmaps. Recalculate. */
3318     rs->migration_dirty_pages = pages;
3319
3320     rs->last_seen_block = NULL;
3321     rs->last_sent_block = NULL;
3322     rs->last_page = 0;
3323     rs->last_version = ram_list.version;
3324     /*
3325      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3326      * matter what we have sent.
3327      */
3328     rs->ram_bulk_stage = false;
3329
3330     /* Update RAMState cache of output QEMUFile */
3331     rs->f = out;
3332
3333     trace_ram_state_resume_prepare(pages);
3334 }
3335
3336 /*
3337  * This function clears bits of the free pages reported by the caller from the
3338  * migration dirty bitmap. @addr is the host address corresponding to the
3339  * start of the continuous guest free pages, and @len is the total bytes of
3340  * those pages.
3341  */
3342 void qemu_guest_free_page_hint(void *addr, size_t len)
3343 {
3344     RAMBlock *block;
3345     ram_addr_t offset;
3346     size_t used_len, start, npages;
3347     MigrationState *s = migrate_get_current();
3348
3349     /* This function is currently expected to be used during live migration */
3350     if (!migration_is_setup_or_active(s->state)) {
3351         return;
3352     }
3353
3354     for (; len > 0; len -= used_len, addr += used_len) {
3355         block = qemu_ram_block_from_host(addr, false, &offset);
3356         if (unlikely(!block || offset >= block->used_length)) {
3357             /*
3358              * The implementation might not support RAMBlock resize during
3359              * live migration, but it could happen in theory with future
3360              * updates. So we add a check here to capture that case.
3361              */
3362             error_report_once("%s unexpected error", __func__);
3363             return;
3364         }
3365
3366         if (len <= block->used_length - offset) {
3367             used_len = len;
3368         } else {
3369             used_len = block->used_length - offset;
3370         }
3371
3372         start = offset >> TARGET_PAGE_BITS;
3373         npages = used_len >> TARGET_PAGE_BITS;
3374
3375         qemu_mutex_lock(&ram_state->bitmap_mutex);
3376         ram_state->migration_dirty_pages -=
3377                       bitmap_count_one_with_offset(block->bmap, start, npages);
3378         bitmap_clear(block->bmap, start, npages);
3379         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3380     }
3381 }
3382
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * become numerous, it will be necessary to reduce the granularity of
 * these critical sections.
 */
3389
3390 /**
3391  * ram_save_setup: Setup RAM for migration
3392  *
3393  * Returns zero to indicate success and negative for error
3394  *
3395  * @f: QEMUFile where to send the data
3396  * @opaque: RAMState pointer
3397  */
3398 static int ram_save_setup(QEMUFile *f, void *opaque)
3399 {
3400     RAMState **rsp = opaque;
3401     RAMBlock *block;
3402
3403     if (compress_threads_save_setup()) {
3404         return -1;
3405     }
3406
3407     /* migration has already setup the bitmap, reuse it. */
3408     if (!migration_in_colo_state()) {
3409         if (ram_init_all(rsp) != 0) {
3410             compress_threads_save_cleanup();
3411             return -1;
3412         }
3413     }
3414     (*rsp)->f = f;
3415
3416     rcu_read_lock();
3417
3418     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3419
3420     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3421         qemu_put_byte(f, strlen(block->idstr));
3422         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3423         qemu_put_be64(f, block->used_length);
3424         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3425             qemu_put_be64(f, block->page_size);
3426         }
3427         if (migrate_ignore_shared()) {
3428             qemu_put_be64(f, block->mr->addr);
3429         }
3430     }
3431
3432     rcu_read_unlock();
3433
3434     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3435     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3436
3437     multifd_send_sync_main();
3438     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3439     qemu_fflush(f);
3440
3441     return 0;
3442 }
3443
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends pages until the rate limit is hit (and no page requests are
 * queued), the file reports an error, no pages are left, or more than
 * MAX_WAIT milliseconds have been spent in this call.  An EOS marker is
 * always written at the end.
 *
 * Returns a negative value if the QEMUFile is in error; otherwise 1 if
 * ram_find_and_save_block() reported that there were no more pages to
 * send, and 0 if the loop stopped for any other reason.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    /* Becomes 1 once there is nothing left to send; returned on success */
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    /* The RAM list changed since the last call: restart the scan */
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    /* Queued page requests are served even when the rate limit is hit */
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }

        if (pages < 0) {
            /* Record the failure in the file; it is picked up below */
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            /* Elapsed time since the loop started, in milliseconds */
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    multifd_send_sync_main();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8 bytes of the EOS marker */
    ram_counters.transferred += 8;

    /* The rate limit result held in ret is no longer needed here */
    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3537
3538 /**
3539  * ram_save_complete: function called to send the remaining amount of ram
3540  *
3541  * Returns zero to indicate success or negative on error
3542  *
3543  * Called with iothread lock
3544  *
3545  * @f: QEMUFile where to send the data
3546  * @opaque: RAMState pointer
3547  */
3548 static int ram_save_complete(QEMUFile *f, void *opaque)
3549 {
3550     RAMState **temp = opaque;
3551     RAMState *rs = *temp;
3552     int ret = 0;
3553
3554     rcu_read_lock();
3555
3556     if (!migration_in_postcopy()) {
3557         migration_bitmap_sync_precopy(rs);
3558     }
3559
3560     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3561
3562     /* try transferring iterative blocks of memory */
3563
3564     /* flush all remaining blocks regardless of rate limiting */
3565     while (true) {
3566         int pages;
3567
3568         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3569         /* no more blocks to sent */
3570         if (pages == 0) {
3571             break;
3572         }
3573         if (pages < 0) {
3574             ret = pages;
3575             break;
3576         }
3577     }
3578
3579     flush_compressed_data(rs);
3580     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3581
3582     rcu_read_unlock();
3583
3584     multifd_send_sync_main();
3585     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3586     qemu_fflush(f);
3587
3588     return ret;
3589 }
3590
3591 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3592                              uint64_t *res_precopy_only,
3593                              uint64_t *res_compatible,
3594                              uint64_t *res_postcopy_only)
3595 {
3596     RAMState **temp = opaque;
3597     RAMState *rs = *temp;
3598     uint64_t remaining_size;
3599
3600     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3601
3602     if (!migration_in_postcopy() &&
3603         remaining_size < max_size) {
3604         qemu_mutex_lock_iothread();
3605         rcu_read_lock();
3606         migration_bitmap_sync_precopy(rs);
3607         rcu_read_unlock();
3608         qemu_mutex_unlock_iothread();
3609         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3610     }
3611
3612     if (migrate_postcopy_ram()) {
3613         /* We can do postcopy, and all the data is postcopiable */
3614         *res_compatible += remaining_size;
3615     } else {
3616         *res_precopy_only += remaining_size;
3617     }
3618 }
3619
3620 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3621 {
3622     unsigned int xh_len;
3623     int xh_flags;
3624     uint8_t *loaded_data;
3625
3626     /* extract RLE header */
3627     xh_flags = qemu_get_byte(f);
3628     xh_len = qemu_get_be16(f);
3629
3630     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3631         error_report("Failed to load XBZRLE page - wrong compression!");
3632         return -1;
3633     }
3634
3635     if (xh_len > TARGET_PAGE_SIZE) {
3636         error_report("Failed to load XBZRLE page - len overflow!");
3637         return -1;
3638     }
3639     loaded_data = XBZRLE.decoded_buf;
3640     /* load data and decode */
3641     /* it can change loaded_data to point to an internal buffer */
3642     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3643
3644     /* decode RLE */
3645     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3646                              TARGET_PAGE_SIZE) == -1) {
3647         error_report("Failed to load XBZRLE page - decode error!");
3648         return -1;
3649     }
3650
3651     return 0;
3652 }
3653
3654 /**
3655  * ram_block_from_stream: read a RAMBlock id from the migration stream
3656  *
3657  * Must be called from within a rcu critical section.
3658  *
3659  * Returns a pointer from within the RCU-protected ram_list.
3660  *
3661  * @f: QEMUFile where to read the data from
3662  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3663  */
3664 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3665 {
3666     static RAMBlock *block = NULL;
3667     char id[256];
3668     uint8_t len;
3669
3670     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3671         if (!block) {
3672             error_report("Ack, bad migration stream!");
3673             return NULL;
3674         }
3675         return block;
3676     }
3677
3678     len = qemu_get_byte(f);
3679     qemu_get_buffer(f, (uint8_t *)id, len);
3680     id[len] = 0;
3681
3682     block = qemu_ram_block_by_name(id);
3683     if (!block) {
3684         error_report("Can't find block %s", id);
3685         return NULL;
3686     }
3687
3688     if (ramblock_is_ignored(block)) {
3689         error_report("block %s should not be migrated !", id);
3690         return NULL;
3691     }
3692
3693     return block;
3694 }
3695
3696 static inline void *host_from_ram_block_offset(RAMBlock *block,
3697                                                ram_addr_t offset)
3698 {
3699     if (!offset_in_ramblock(block, offset)) {
3700         return NULL;
3701     }
3702
3703     return block->host + offset;
3704 }
3705
3706 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3707                                                  ram_addr_t offset)
3708 {
3709     if (!offset_in_ramblock(block, offset)) {
3710         return NULL;
3711     }
3712     if (!block->colo_cache) {
3713         error_report("%s: colo_cache is NULL in block :%s",
3714                      __func__, block->idstr);
3715         return NULL;
3716     }
3717
3718     /*
3719     * During colo checkpoint, we need bitmap of these migrated pages.
3720     * It help us to decide which pages in ram cache should be flushed
3721     * into VM's RAM later.
3722     */
3723     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3724         ram_state->migration_dirty_pages++;
3725     }
3726     return block->colo_cache + offset;
3727 }
3728
/**
 * ram_handle_compressed: fill a page (or RDMA chunk) with a single byte
 *
 * When the fill byte is zero and the destination already reads as zero,
 * the memory is left untouched; otherwise it is overwritten.
 *
 * @host: host address of the range to fill
 * @ch: byte value the range is filled with
 * @size: length of the range in bytes
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if a zero fill targets already-zero memory */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3745
3746 /* return the size after decompression, or negative value on error */
3747 static int
3748 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3749                      const uint8_t *source, size_t source_len)
3750 {
3751     int err;
3752
3753     err = inflateReset(stream);
3754     if (err != Z_OK) {
3755         return -1;
3756     }
3757
3758     stream->avail_in = source_len;
3759     stream->next_in = (uint8_t *)source;
3760     stream->avail_out = dest_len;
3761     stream->next_out = dest;
3762
3763     err = inflate(stream, Z_NO_FLUSH);
3764     if (err != Z_STREAM_END) {
3765         return -1;
3766     }
3767
3768     return stream->total_out;
3769 }
3770
/*
 * Thread body of one decompression worker (started by
 * compress_threads_load_setup()).
 *
 * @opaque: the DecompressParam slot owned by this worker
 *
 * The worker sleeps on param->cond until either param->des is set (a page
 * is ready in param->compbuf) or param->quit is set. Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the request out of the slot while holding the mutex */
            des = param->des;
            len = param->len;
            param->des = 0;
            /* Drop the per-thread mutex for the duration of the inflate */
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* Failures are only propagated when error checking is enabled */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Publish completion under the shared done lock */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take the per-thread mutex before re-checking the slot */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3809
3810 static int wait_for_decompress_done(void)
3811 {
3812     int idx, thread_count;
3813
3814     if (!migrate_use_compression()) {
3815         return 0;
3816     }
3817
3818     thread_count = migrate_decompress_threads();
3819     qemu_mutex_lock(&decomp_done_lock);
3820     for (idx = 0; idx < thread_count; idx++) {
3821         while (!decomp_param[idx].done) {
3822             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3823         }
3824     }
3825     qemu_mutex_unlock(&decomp_done_lock);
3826     return qemu_file_get_error(decomp_file);
3827 }
3828
3829 static void compress_threads_load_cleanup(void)
3830 {
3831     int i, thread_count;
3832
3833     if (!migrate_use_compression()) {
3834         return;
3835     }
3836     thread_count = migrate_decompress_threads();
3837     for (i = 0; i < thread_count; i++) {
3838         /*
3839          * we use it as a indicator which shows if the thread is
3840          * properly init'd or not
3841          */
3842         if (!decomp_param[i].compbuf) {
3843             break;
3844         }
3845
3846         qemu_mutex_lock(&decomp_param[i].mutex);
3847         decomp_param[i].quit = true;
3848         qemu_cond_signal(&decomp_param[i].cond);
3849         qemu_mutex_unlock(&decomp_param[i].mutex);
3850     }
3851     for (i = 0; i < thread_count; i++) {
3852         if (!decomp_param[i].compbuf) {
3853             break;
3854         }
3855
3856         qemu_thread_join(decompress_threads + i);
3857         qemu_mutex_destroy(&decomp_param[i].mutex);
3858         qemu_cond_destroy(&decomp_param[i].cond);
3859         inflateEnd(&decomp_param[i].stream);
3860         g_free(decomp_param[i].compbuf);
3861         decomp_param[i].compbuf = NULL;
3862     }
3863     g_free(decompress_threads);
3864     g_free(decomp_param);
3865     decompress_threads = NULL;
3866     decomp_param = NULL;
3867     decomp_file = NULL;
3868 }
3869
3870 static int compress_threads_load_setup(QEMUFile *f)
3871 {
3872     int i, thread_count;
3873
3874     if (!migrate_use_compression()) {
3875         return 0;
3876     }
3877
3878     thread_count = migrate_decompress_threads();
3879     decompress_threads = g_new0(QemuThread, thread_count);
3880     decomp_param = g_new0(DecompressParam, thread_count);
3881     qemu_mutex_init(&decomp_done_lock);
3882     qemu_cond_init(&decomp_done_cond);
3883     decomp_file = f;
3884     for (i = 0; i < thread_count; i++) {
3885         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3886             goto exit;
3887         }
3888
3889         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3890         qemu_mutex_init(&decomp_param[i].mutex);
3891         qemu_cond_init(&decomp_param[i].cond);
3892         decomp_param[i].done = true;
3893         decomp_param[i].quit = false;
3894         qemu_thread_create(decompress_threads + i, "decompress",
3895                            do_data_decompress, decomp_param + i,
3896                            QEMU_THREAD_JOINABLE);
3897     }
3898     return 0;
3899 exit:
3900     compress_threads_load_cleanup();
3901     return -1;
3902 }
3903
/*
 * Hand one compressed page to an idle decompression worker.
 *
 * @f: file the compressed bytes are read from
 * @host: destination the worker will decompress into
 * @len: number of compressed bytes to read from @f
 *
 * Scans the worker slots for one marked done; if none is idle, waits on
 * decomp_done_cond and scans again. The compressed data is read from @f
 * into the chosen slot's buffer before the worker is signalled.
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                /* Claim the slot, then fill it under the per-thread mutex */
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        /* idx < thread_count means the inner loop found an idle worker */
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
3932
3933 /*
3934  * colo cache: this is for secondary VM, we cache the whole
3935  * memory of the secondary VM, it is need to hold the global lock
3936  * to call this helper.
3937  */
3938 int colo_init_ram_cache(void)
3939 {
3940     RAMBlock *block;
3941
3942     rcu_read_lock();
3943     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3945                                                 NULL,
3946                                                 false);
3947         if (!block->colo_cache) {
3948             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3949                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3950                          block->used_length);
3951             goto out_locked;
3952         }
3953         memcpy(block->colo_cache, block->host, block->used_length);
3954     }
3955     rcu_read_unlock();
3956     /*
3957     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3958     * with to decide which page in cache should be flushed into SVM's RAM. Here
3959     * we use the same name 'ram_bitmap' as for migration.
3960     */
3961     if (ram_bytes_total()) {
3962         RAMBlock *block;
3963
3964         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3965             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3966
3967             block->bmap = bitmap_new(pages);
3968             bitmap_set(block->bmap, 0, pages);
3969         }
3970     }
3971     ram_state = g_new0(RAMState, 1);
3972     ram_state->migration_dirty_pages = 0;
3973     qemu_mutex_init(&ram_state->bitmap_mutex);
3974     memory_global_dirty_log_start();
3975
3976     return 0;
3977
3978 out_locked:
3979
3980     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3981         if (block->colo_cache) {
3982             qemu_anon_ram_free(block->colo_cache, block->used_length);
3983             block->colo_cache = NULL;
3984         }
3985     }
3986
3987     rcu_read_unlock();
3988     return -errno;
3989 }
3990
3991 /* It is need to hold the global lock to call this helper */
3992 void colo_release_ram_cache(void)
3993 {
3994     RAMBlock *block;
3995
3996     memory_global_dirty_log_stop();
3997     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3998         g_free(block->bmap);
3999         block->bmap = NULL;
4000     }
4001
4002     rcu_read_lock();
4003
4004     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4005         if (block->colo_cache) {
4006             qemu_anon_ram_free(block->colo_cache, block->used_length);
4007             block->colo_cache = NULL;
4008         }
4009     }
4010
4011     rcu_read_unlock();
4012     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4013     g_free(ram_state);
4014     ram_state = NULL;
4015 }
4016
4017 /**
4018  * ram_load_setup: Setup RAM for migration incoming side
4019  *
4020  * Returns zero to indicate success and negative for error
4021  *
4022  * @f: QEMUFile where to receive the data
4023  * @opaque: RAMState pointer
4024  */
4025 static int ram_load_setup(QEMUFile *f, void *opaque)
4026 {
4027     if (compress_threads_load_setup(f)) {
4028         return -1;
4029     }
4030
4031     xbzrle_load_setup();
4032     ramblock_recv_map_init();
4033
4034     return 0;
4035 }
4036
4037 static int ram_load_cleanup(void *opaque)
4038 {
4039     RAMBlock *rb;
4040
4041     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4042         if (ramblock_is_pmem(rb)) {
4043             pmem_persist(rb->host, rb->used_length);
4044         }
4045     }
4046
4047     xbzrle_load_cleanup();
4048     compress_threads_load_cleanup();
4049
4050     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4051         g_free(rb->receivedmap);
4052         rb->receivedmap = NULL;
4053     }
4054
4055     return 0;
4056 }
4057
4058 /**
4059  * ram_postcopy_incoming_init: allocate postcopy data structures
4060  *
4061  * Returns 0 for success and negative if there was one error
4062  *
4063  * @mis: current migration incoming state
4064  *
4065  * Allocate data structures etc needed by incoming migration with
4066  * postcopy-ram. postcopy-ram's similarly names
4067  * postcopy_ram_incoming_init does the work.
4068  */
4069 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4070 {
4071     return postcopy_ram_incoming_init(mis);
4072 }
4073
/**
 * ram_load_postcopy: load pages from the stream in postcopy mode
 *
 * Returns 0 for success or a negative error value in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Target pages are accumulated in a temporary host page and placed into
 * guest memory once the last target page of a host page has been read.
 * The loop ends on RAM_SAVE_FLAG_EOS or on the first error.
 *
 * @f: QEMUFile where to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    /* Host address of the previous target page, for the ordering check */
    void *last_host = NULL;
    /* True while every target page of the current host page was zero */
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        /* Page header: offset in the upper bits, flags in the low bits */
        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                /* First target page of a host page: restart the tracking */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            /* Fill byte follows; a non-zero byte defeats the zero-place */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            /* Step back from the last target page to the host page start */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
4224
4225 static bool postcopy_is_advised(void)
4226 {
4227     PostcopyState ps = postcopy_state_get();
4228     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4229 }
4230
4231 static bool postcopy_is_running(void)
4232 {
4233     PostcopyState ps = postcopy_state_get();
4234     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4235 }
4236
4237 /*
4238  * Flush content of RAM cache into SVM's memory.
4239  * Only flush the pages that be dirtied by PVM or SVM or both.
4240  */
4241 static void colo_flush_ram_cache(void)
4242 {
4243     RAMBlock *block = NULL;
4244     void *dst_host;
4245     void *src_host;
4246     unsigned long offset = 0;
4247
4248     memory_global_dirty_log_sync();
4249     rcu_read_lock();
4250     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4251         migration_bitmap_sync_range(ram_state, block, block->used_length);
4252     }
4253     rcu_read_unlock();
4254
4255     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4256     rcu_read_lock();
4257     block = QLIST_FIRST_RCU(&ram_list.blocks);
4258
4259     while (block) {
4260         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4261
4262         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4263             offset = 0;
4264             block = QLIST_NEXT_RCU(block, next);
4265         } else {
4266             migration_bitmap_clear_dirty(ram_state, block, offset);
4267             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4268             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4269             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4270         }
4271     }
4272
4273     rcu_read_unlock();
4274     trace_colo_flush_ram_cache_end();
4275 }
4276
4277 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4278 {
4279     int flags = 0, ret = 0, invalid_flags = 0;
4280     static uint64_t seq_iter;
4281     int len = 0;
4282     /*
4283      * If system is running in postcopy mode, page inserts to host memory must
4284      * be atomic
4285      */
4286     bool postcopy_running = postcopy_is_running();
4287     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4288     bool postcopy_advised = postcopy_is_advised();
4289
4290     seq_iter++;
4291
4292     if (version_id != 4) {
4293         ret = -EINVAL;
4294     }
4295
4296     if (!migrate_use_compression()) {
4297         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4298     }
4299     /* This RCU critical section can be very long running.
4300      * When RCU reclaims in the code start to become numerous,
4301      * it will be necessary to reduce the granularity of this
4302      * critical section.
4303      */
4304     rcu_read_lock();
4305
4306     if (postcopy_running) {
4307         ret = ram_load_postcopy(f);
4308     }
4309
4310     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4311         ram_addr_t addr, total_ram_bytes;
4312         void *host = NULL;
4313         uint8_t ch;
4314
4315         addr = qemu_get_be64(f);
4316         flags = addr & ~TARGET_PAGE_MASK;
4317         addr &= TARGET_PAGE_MASK;
4318
4319         if (flags & invalid_flags) {
4320             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4321                 error_report("Received an unexpected compressed page");
4322             }
4323
4324             ret = -EINVAL;
4325             break;
4326         }
4327
4328         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4329                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4330             RAMBlock *block = ram_block_from_stream(f, flags);
4331
4332             /*
4333              * After going into COLO, we should load the Page into colo_cache.
4334              */
4335             if (migration_incoming_in_colo_state()) {
4336                 host = colo_cache_from_block_offset(block, addr);
4337             } else {
4338                 host = host_from_ram_block_offset(block, addr);
4339             }
4340             if (!host) {
4341                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4342                 ret = -EINVAL;
4343                 break;
4344             }
4345
4346             if (!migration_incoming_in_colo_state()) {
4347                 ramblock_recv_bitmap_set(block, host);
4348             }
4349
4350             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4351         }
4352
4353         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4354         case RAM_SAVE_FLAG_MEM_SIZE:
4355             /* Synchronize RAM block list */
4356             total_ram_bytes = addr;
4357             while (!ret && total_ram_bytes) {
4358                 RAMBlock *block;
4359                 char id[256];
4360                 ram_addr_t length;
4361
4362                 len = qemu_get_byte(f);
4363                 qemu_get_buffer(f, (uint8_t *)id, len);
4364                 id[len] = 0;
4365                 length = qemu_get_be64(f);
4366
4367                 block = qemu_ram_block_by_name(id);
4368                 if (block && !qemu_ram_is_migratable(block)) {
4369                     error_report("block %s should not be migrated !", id);
4370                     ret = -EINVAL;
4371                 } else if (block) {
4372                     if (length != block->used_length) {
4373                         Error *local_err = NULL;
4374
4375                         ret = qemu_ram_resize(block, length,
4376                                               &local_err);
4377                         if (local_err) {
4378                             error_report_err(local_err);
4379                         }
4380                     }
4381                     /* For postcopy we need to check hugepage sizes match */
4382                     if (postcopy_advised &&
4383                         block->page_size != qemu_host_page_size) {
4384                         uint64_t remote_page_size = qemu_get_be64(f);
4385                         if (remote_page_size != block->page_size) {
4386                             error_report("Mismatched RAM page size %s "
4387                                          "(local) %zd != %" PRId64,
4388                                          id, block->page_size,
4389                                          remote_page_size);
4390                             ret = -EINVAL;
4391                         }
4392                     }
4393                     if (migrate_ignore_shared()) {
4394                         hwaddr addr = qemu_get_be64(f);
4395                         if (ramblock_is_ignored(block) &&
4396                             block->mr->addr != addr) {
4397                             error_report("Mismatched GPAs for block %s "
4398                                          "%" PRId64 "!= %" PRId64,
4399                                          id, (uint64_t)addr,
4400                                          (uint64_t)block->mr->addr);
4401                             ret = -EINVAL;
4402                         }
4403                     }
4404                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4405                                           block->idstr);
4406                 } else {
4407                     error_report("Unknown ramblock \"%s\", cannot "
4408                                  "accept migration", id);
4409                     ret = -EINVAL;
4410                 }
4411
4412                 total_ram_bytes -= length;
4413             }
4414             break;
4415
4416         case RAM_SAVE_FLAG_ZERO:
4417             ch = qemu_get_byte(f);
4418             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4419             break;
4420
4421         case RAM_SAVE_FLAG_PAGE:
4422             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4423             break;
4424
4425         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4426             len = qemu_get_be32(f);
4427             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4428                 error_report("Invalid compressed data length: %d", len);
4429                 ret = -EINVAL;
4430                 break;
4431             }
4432             decompress_data_with_multi_threads(f, host, len);
4433             break;
4434
4435         case RAM_SAVE_FLAG_XBZRLE:
4436             if (load_xbzrle(f, addr, host) < 0) {
4437                 error_report("Failed to decompress XBZRLE page at "
4438                              RAM_ADDR_FMT, addr);
4439                 ret = -EINVAL;
4440                 break;
4441             }
4442             break;
4443         case RAM_SAVE_FLAG_EOS:
4444             /* normal exit */
4445             multifd_recv_sync_main();
4446             break;
4447         default:
4448             if (flags & RAM_SAVE_FLAG_HOOK) {
4449                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4450             } else {
4451                 error_report("Unknown combination of migration flags: %#x",
4452                              flags);
4453                 ret = -EINVAL;
4454             }
4455         }
4456         if (!ret) {
4457             ret = qemu_file_get_error(f);
4458         }
4459     }
4460
4461     ret |= wait_for_decompress_done();
4462     rcu_read_unlock();
4463     trace_ram_load_complete(ret, seq_iter);
4464
4465     if (!ret  && migration_incoming_in_colo_state()) {
4466         colo_flush_ram_cache();
4467     }
4468     return ret;
4469 }
4470
4471 static bool ram_has_postcopy(void *opaque)
4472 {
4473     RAMBlock *rb;
4474     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4475         if (ramblock_is_pmem(rb)) {
4476             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4477                          "is not supported now!", rb->idstr, rb->host);
4478             return false;
4479         }
4480     }
4481
4482     return migrate_postcopy_ram();
4483 }
4484
4485 /* Sync all the dirty bitmap with destination VM.  */
4486 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4487 {
4488     RAMBlock *block;
4489     QEMUFile *file = s->to_dst_file;
4490     int ramblock_count = 0;
4491
4492     trace_ram_dirty_bitmap_sync_start();
4493
4494     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4495         qemu_savevm_send_recv_bitmap(file, block->idstr);
4496         trace_ram_dirty_bitmap_request(block->idstr);
4497         ramblock_count++;
4498     }
4499
4500     trace_ram_dirty_bitmap_sync_wait();
4501
4502     /* Wait until all the ramblocks' dirty bitmap synced */
4503     while (ramblock_count--) {
4504         qemu_sem_wait(&s->rp_state.rp_sem);
4505     }
4506
4507     trace_ram_dirty_bitmap_sync_complete();
4508
4509     return 0;
4510 }
4511
/*
 * Post the return-path semaphore that ram_dirty_bitmap_sync_all() waits
 * on, signalling that one ramblock's bitmap has been handled.
 *
 * @s: migration state owning the semaphore
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
4516
4517 /*
4518  * Read the received bitmap, revert it as the initial dirty bitmap.
4519  * This is only used when the postcopy migration is paused but wants
4520  * to resume from a middle point.
4521  */
4522 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4523 {
4524     int ret = -EINVAL;
4525     QEMUFile *file = s->rp_state.from_dst_file;
4526     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4527     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4528     uint64_t size, end_mark;
4529
4530     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4531
4532     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4533         error_report("%s: incorrect state %s", __func__,
4534                      MigrationStatus_str(s->state));
4535         return -EINVAL;
4536     }
4537
4538     /*
4539      * Note: see comments in ramblock_recv_bitmap_send() on why we
4540      * need the endianess convertion, and the paddings.
4541      */
4542     local_size = ROUND_UP(local_size, 8);
4543
4544     /* Add paddings */
4545     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4546
4547     size = qemu_get_be64(file);
4548
4549     /* The size of the bitmap should match with our ramblock */
4550     if (size != local_size) {
4551         error_report("%s: ramblock '%s' bitmap size mismatch "
4552                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4553                      block->idstr, size, local_size);
4554         ret = -EINVAL;
4555         goto out;
4556     }
4557
4558     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4559     end_mark = qemu_get_be64(file);
4560
4561     ret = qemu_file_get_error(file);
4562     if (ret || size != local_size) {
4563         error_report("%s: read bitmap failed for ramblock '%s': %d"
4564                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4565                      __func__, block->idstr, ret, local_size, size);
4566         ret = -EIO;
4567         goto out;
4568     }
4569
4570     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4571         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4572                      __func__, block->idstr, end_mark);
4573         ret = -EINVAL;
4574         goto out;
4575     }
4576
4577     /*
4578      * Endianess convertion. We are during postcopy (though paused).
4579      * The dirty bitmap won't change. We can directly modify it.
4580      */
4581     bitmap_from_le(block->bmap, le_bitmap, nbits);
4582
4583     /*
4584      * What we received is "received bitmap". Revert it as the initial
4585      * dirty bitmap for this ramblock.
4586      */
4587     bitmap_complement(block->bmap, block->bmap, nbits);
4588
4589     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4590
4591     /*
4592      * We succeeded to sync bitmap for current ramblock. If this is
4593      * the last one to sync, we need to notify the main send thread.
4594      */
4595     ram_dirty_bitmap_reload_notify(s);
4596
4597     ret = 0;
4598 out:
4599     g_free(le_bitmap);
4600     return ret;
4601 }
4602
4603 static int ram_resume_prepare(MigrationState *s, void *opaque)
4604 {
4605     RAMState *rs = *(RAMState **)opaque;
4606     int ret;
4607
4608     ret = ram_dirty_bitmap_sync_all(s, rs);
4609     if (ret) {
4610         return ret;
4611     }
4612
4613     ram_state_resume_prepare(rs, s->to_dst_file);
4614
4615     return 0;
4616 }
4617
4618 static SaveVMHandlers savevm_ram_handlers = {
4619     .save_setup = ram_save_setup,
4620     .save_live_iterate = ram_save_iterate,
4621     .save_live_complete_postcopy = ram_save_complete,
4622     .save_live_complete_precopy = ram_save_complete,
4623     .has_postcopy = ram_has_postcopy,
4624     .save_live_pending = ram_save_pending,
4625     .load_state = ram_load,
4626     .save_cleanup = ram_save_cleanup,
4627     .load_setup = ram_load_setup,
4628     .load_cleanup = ram_load_cleanup,
4629     .resume_prepare = ram_resume_prepare,
4630 };
4631
/*
 * Initialise the XBZRLE lock and register the RAM handlers as the live
 * "ram" section (instance 0, version 4) with &ram_state as opaque data.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}