migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
  69
  70 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  71 #define RAM_SAVE_FLAG_ZERO     0x02
  72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  73 #define RAM_SAVE_FLAG_PAGE     0x08
  74 #define RAM_SAVE_FLAG_EOS      0x10
  75 #define RAM_SAVE_FLAG_CONTINUE 0x20
  76 #define RAM_SAVE_FLAG_XBZRLE   0x40
  77 /* 0x80 is reserved in migration.h start with 0x100 next */
  78 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
  85 XBZRLECacheStats xbzrle_counters;
  86
/*
 * XBZRLE state: the XBZRLE page cache plus the buffers used while
 * encoding and decoding, and a static page used by the compression.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* Taken and released via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 static bool ramblock_is_ignored(RAMBlock *block)
 163 {
 164     return !qemu_ram_is_migratable(block) ||
 165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 166 }
 167
/*
 * Iterate over the RAMBlocks for which ramblock_is_ignored() is false.
 * The trailing "if (...) {} else" lets the caller's statement or braced
 * body follow the macro as if it were a plain loop body.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Same construction, but only skips blocks that are not migratable. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably undefined so that code below has to use one
 * of the filtered iterators above - confirm.
 */
#undef RAMBLOCK_FOREACH
 178
 179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 180 {
 181     RAMBlock *block;
 182     int ret = 0;
 183
 184     rcu_read_lock();
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     rcu_read_unlock();
 192     return ret;
 193 }
 194
 195 static void ramblock_recv_map_init(void)
 196 {
 197     RAMBlock *rb;
 198
 199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 200         assert(!rb->receivedmap);
 201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 202     }
 203 }
 204
 205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 206 {
 207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 208                     rb->receivedmap);
 209 }
 210
 211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 212 {
 213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 217 {
 218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 219 }
 220
 221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 222                                     size_t nr)
 223 {
 224     bitmap_set_atomic(rb->receivedmap,
 225                       ramblock_recv_bitmap_offset(host_addr, rb),
 226                       nr);
 227 }
 228
 229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 230
 231 /*
 232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 233  *
 234  * Returns >0 if success with sent bytes, or <0 if error.
 235  */
 236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 237                                   const char *block_name)
 238 {
 239     RAMBlock *block = qemu_ram_block_by_name(block_name);
 240     unsigned long *le_bitmap, nbits;
 241     uint64_t size;
 242
 243     if (!block) {
 244         error_report("%s: invalid block name: %s", __func__, block_name);
 245         return -1;
 246     }
 247
 248     nbits = block->used_length >> TARGET_PAGE_BITS;
 249
 250     /*
 251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252      * machines we may need 4 more bytes for padding (see below
 253      * comment). So extend it a bit before hand.
 254      */
 255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 256
 257     /*
 258      * Always use little endian when sending the bitmap. This is
 259      * required that when source and destination VMs are not using the
 260      * same endianess. (Note: big endian won't work.)
 261      */
 262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 263
 264     /* Size of the bitmap, in bytes */
 265     size = DIV_ROUND_UP(nbits, 8);
 266
 267     /*
 268      * size is always aligned to 8 bytes for 64bit machines, but it
 269      * may not be true for 32bit machines. We need this padding to
 270      * make sure the migration can survive even between 32bit and
 271      * 64bit machines.
 272      */
 273     size = ROUND_UP(size, 8);
 274
 275     qemu_put_be64(file, size);
 276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 277     /*
 278      * Mark as an end, in case the middle part is screwed up due to
 279      * some "misterious" reason.
 280      */
 281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 282     qemu_fflush(file);
 283
 284     g_free(le_bitmap);
 285
 286     if (qemu_file_get_error(file)) {
 287         return qemu_file_get_error(file);
 288     }
 289
 290     return size + sizeof(size);
 291 }
 292
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start of the request within rb - confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the request in bytes - confirm */
    hwaddr    len;

    /* Link used by the RAMState src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 304
 305 /* State of RAM for migration */
 306 struct RAMState {
 307     /* QEMUFile used for this migration */
 308     QEMUFile *f;
 309     /* Last block that we have visited searching for dirty pages */
 310     RAMBlock *last_seen_block;
 311     /* Last block from where we have sent data */
 312     RAMBlock *last_sent_block;
 313     /* Last dirty target page we have sent */
 314     ram_addr_t last_page;
 315     /* last ram version we have seen */
 316     uint32_t last_version;
 317     /* We are in the first round */
 318     bool ram_bulk_stage;
 319     /* The free page optimization is enabled */
 320     bool fpo_enabled;
 321     /* How many times we have dirty too many pages */
 322     int dirty_rate_high_cnt;
 323     /* these variables are used for bitmap sync */
 324     /* last time we did a full bitmap_sync */
 325     int64_t time_last_bitmap_sync;
 326     /* bytes transferred at start_time */
 327     uint64_t bytes_xfer_prev;
 328     /* number of dirty pages since start_time */
 329     uint64_t num_dirty_pages_period;
 330     /* xbzrle misses since the beginning of the period */
 331     uint64_t xbzrle_cache_miss_prev;
 332
 333     /* compression statistics since the beginning of the period */
 334     /* amount of count that no free thread to compress data */
 335     uint64_t compress_thread_busy_prev;
 336     /* amount bytes after compression */
 337     uint64_t compressed_size_prev;
 338     /* amount of compressed pages */
 339     uint64_t compress_pages_prev;
 340
 341     /* total handled target pages at the beginning of period */
 342     uint64_t target_page_count_prev;
 343     /* total handled target pages since start */
 344     uint64_t target_page_count;
 345     /* number of dirty bits in the bitmap */
 346     uint64_t migration_dirty_pages;
 347     /* Protects modification of the bitmap and migration dirty pages */
 348     QemuMutex bitmap_mutex;
 349     /* The RAMBlock used in the last src_page_requests */
 350     RAMBlock *last_req_rb;
 351     /* Queue of outstanding page requests from the destination */
 352     QemuMutex src_page_req_mutex;
 353     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 354 };
 355 typedef struct RAMState RAMState;
 356
 357 static RAMState *ram_state;
 358
 359 static NotifierWithReturnList precopy_notifier_list;
 360
 361 void precopy_infrastructure_init(void)
 362 {
 363     notifier_with_return_list_init(&precopy_notifier_list);
 364 }
 365
 366 void precopy_add_notifier(NotifierWithReturn *n)
 367 {
 368     notifier_with_return_list_add(&precopy_notifier_list, n);
 369 }
 370
 371 void precopy_remove_notifier(NotifierWithReturn *n)
 372 {
 373     notifier_with_return_remove(n);
 374 }
 375
 376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 377 {
 378     PrecopyNotifyData pnd;
 379     pnd.reason = reason;
 380     pnd.errp = errp;
 381
 382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 383 }
 384
 385 void precopy_enable_free_page_optimization(void)
 386 {
 387     if (!ram_state) {
 388         return;
 389     }
 390
 391     ram_state->fpo_enabled = true;
 392 }
 393
 394 uint64_t ram_bytes_remaining(void)
 395 {
 396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 397                        0;
 398 }
 399
 400 MigrationStats ram_counters;
 401
 402 /* used by the search for pages to send */
 403 struct PageSearchStatus {
 404     /* Current block being searched */
 405     RAMBlock    *block;
 406     /* Current page to search from */
 407     unsigned long page;
 408     /* Set once we wrap around */
 409     bool         complete_round;
 410 };
 411 typedef struct PageSearchStatus PageSearchStatus;
 412
 413 CompressionStats compression_counters;
 414
 415 struct CompressParam {
 416     bool done;
 417     bool quit;
 418     bool zero_page;
 419     QEMUFile *file;
 420     QemuMutex mutex;
 421     QemuCond cond;
 422     RAMBlock *block;
 423     ram_addr_t offset;
 424
 425     /* internally used fields */
 426     z_stream stream;
 427     uint8_t *originbuf;
 428 };
 429 typedef struct CompressParam CompressParam;
 430
 431 struct DecompressParam {
 432     bool done;
 433     bool quit;
 434     QemuMutex mutex;
 435     QemuCond cond;
 436     void *des;
 437     uint8_t *compbuf;
 438     int len;
 439     z_stream stream;
 440 };
 441 typedef struct DecompressParam DecompressParam;
 442
/* Arrays with one entry per compression thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is the mutex that goes with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* NOTE(review): decompression-side counterparts of the above - confirm */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
 459
 460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 461                                  ram_addr_t offset, uint8_t *source_buf);
 462
 463 static void *do_data_compress(void *opaque)
 464 {
 465     CompressParam *param = opaque;
 466     RAMBlock *block;
 467     ram_addr_t offset;
 468     bool zero_page;
 469
 470     qemu_mutex_lock(&param->mutex);
 471     while (!param->quit) {
 472         if (param->block) {
 473             block = param->block;
 474             offset = param->offset;
 475             param->block = NULL;
 476             qemu_mutex_unlock(&param->mutex);
 477
 478             zero_page = do_compress_ram_page(param->file, &param->stream,
 479                                              block, offset, param->originbuf);
 480
 481             qemu_mutex_lock(&comp_done_lock);
 482             param->done = true;
 483             param->zero_page = zero_page;
 484             qemu_cond_signal(&comp_done_cond);
 485             qemu_mutex_unlock(&comp_done_lock);
 486
 487             qemu_mutex_lock(&param->mutex);
 488         } else {
 489             qemu_cond_wait(&param->cond, &param->mutex);
 490         }
 491     }
 492     qemu_mutex_unlock(&param->mutex);
 493
 494     return NULL;
 495 }
 496
 497 static void compress_threads_save_cleanup(void)
 498 {
 499     int i, thread_count;
 500
 501     if (!migrate_use_compression() || !comp_param) {
 502         return;
 503     }
 504
 505     thread_count = migrate_compress_threads();
 506     for (i = 0; i < thread_count; i++) {
 507         /*
 508          * we use it as a indicator which shows if the thread is
 509          * properly init'd or not
 510          */
 511         if (!comp_param[i].file) {
 512             break;
 513         }
 514
 515         qemu_mutex_lock(&comp_param[i].mutex);
 516         comp_param[i].quit = true;
 517         qemu_cond_signal(&comp_param[i].cond);
 518         qemu_mutex_unlock(&comp_param[i].mutex);
 519
 520         qemu_thread_join(compress_threads + i);
 521         qemu_mutex_destroy(&comp_param[i].mutex);
 522         qemu_cond_destroy(&comp_param[i].cond);
 523         deflateEnd(&comp_param[i].stream);
 524         g_free(comp_param[i].originbuf);
 525         qemu_fclose(comp_param[i].file);
 526         comp_param[i].file = NULL;
 527     }
 528     qemu_mutex_destroy(&comp_done_lock);
 529     qemu_cond_destroy(&comp_done_cond);
 530     g_free(compress_threads);
 531     g_free(comp_param);
 532     compress_threads = NULL;
 533     comp_param = NULL;
 534 }
 535
 536 static int compress_threads_save_setup(void)
 537 {
 538     int i, thread_count;
 539
 540     if (!migrate_use_compression()) {
 541         return 0;
 542     }
 543     thread_count = migrate_compress_threads();
 544     compress_threads = g_new0(QemuThread, thread_count);
 545     comp_param = g_new0(CompressParam, thread_count);
 546     qemu_cond_init(&comp_done_cond);
 547     qemu_mutex_init(&comp_done_lock);
 548     for (i = 0; i < thread_count; i++) {
 549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 550         if (!comp_param[i].originbuf) {
 551             goto exit;
 552         }
 553
 554         if (deflateInit(&comp_param[i].stream,
 555                         migrate_compress_level()) != Z_OK) {
 556             g_free(comp_param[i].originbuf);
 557             goto exit;
 558         }
 559
 560         /* comp_param[i].file is just used as a dummy buffer to save data,
 561          * set its ops to empty.
 562          */
 563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 564         comp_param[i].done = true;
 565         comp_param[i].quit = false;
 566         qemu_mutex_init(&comp_param[i].mutex);
 567         qemu_cond_init(&comp_param[i].cond);
 568         qemu_thread_create(compress_threads + i, "compress",
 569                            do_data_compress, comp_param + i,
 570                            QEMU_THREAD_JOINABLE);
 571     }
 572     return 0;
 573
 574 exit:
 575     compress_threads_save_cleanup();
 576     return -1;
 577 }
 578
 579 /* Multiple fd's */
 580
 581 #define MULTIFD_MAGIC 0x11223344U
 582 #define MULTIFD_VERSION 1
 583
 584 #define MULTIFD_FLAG_SYNC (1 << 0)
 585
 586 /* This value needs to be a multiple of qemu_target_page_size() */
 587 #define MULTIFD_PACKET_SIZE (512 * 1024)
 588
/*
 * First message exchanged on a multifd channel; written by
 * multifd_send_initial_packet() and checked by
 * multifd_recv_initial_packet().  magic and version travel in big
 * endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number */
    uint8_t id;
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 597
/*
 * Header describing one batch of pages.  multifd_send_fill_packet()
 * stores the integer fields and the offsets in big endian, and
 * multifd_recv_unfill_packet() converts them back.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of entries of offset[] that are in use */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* idstr of the RAMBlock the pages belong to */
    char ramblock[256];
    /* offset of each page inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 612
/*
 * A batch of pages that all belong to the same RAMBlock; created by
 * multifd_pages_init() and released by multifd_pages_clear().
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 626
/* State of one multifd send channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 667
/* State of one multifd receive channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 702
 703 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 704 {
 705     MultiFDInit_t msg;
 706     int ret;
 707
 708     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 709     msg.version = cpu_to_be32(MULTIFD_VERSION);
 710     msg.id = p->id;
 711     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 712
 713     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 714     if (ret != 0) {
 715         return -1;
 716     }
 717     return 0;
 718 }
 719
 720 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 721 {
 722     MultiFDInit_t msg;
 723     int ret;
 724
 725     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 726     if (ret != 0) {
 727         return -1;
 728     }
 729
 730     msg.magic = be32_to_cpu(msg.magic);
 731     msg.version = be32_to_cpu(msg.version);
 732
 733     if (msg.magic != MULTIFD_MAGIC) {
 734         error_setg(errp, "multifd: received packet magic %x "
 735                    "expected %x", msg.magic, MULTIFD_MAGIC);
 736         return -1;
 737     }
 738
 739     if (msg.version != MULTIFD_VERSION) {
 740         error_setg(errp, "multifd: received packet version %d "
 741                    "expected %d", msg.version, MULTIFD_VERSION);
 742         return -1;
 743     }
 744
 745     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 746         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 747         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 748
 749         error_setg(errp, "multifd: received uuid '%s' and expected "
 750                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 751         g_free(uuid);
 752         g_free(msg_uuid);
 753         return -1;
 754     }
 755
 756     if (msg.id > migrate_multifd_channels()) {
 757         error_setg(errp, "multifd: received channel version %d "
 758                    "expected %d", msg.version, MULTIFD_VERSION);
 759         return -1;
 760     }
 761
 762     return msg.id;
 763 }
 764
 765 static MultiFDPages_t *multifd_pages_init(size_t size)
 766 {
 767     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 768
 769     pages->allocated = size;
 770     pages->iov = g_new0(struct iovec, size);
 771     pages->offset = g_new0(ram_addr_t, size);
 772
 773     return pages;
 774 }
 775
 776 static void multifd_pages_clear(MultiFDPages_t *pages)
 777 {
 778     pages->used = 0;
 779     pages->allocated = 0;
 780     pages->packet_num = 0;
 781     pages->block = NULL;
 782     g_free(pages->iov);
 783     pages->iov = NULL;
 784     g_free(pages->offset);
 785     pages->offset = NULL;
 786     g_free(pages);
 787 }
 788
 789 static void multifd_send_fill_packet(MultiFDSendParams *p)
 790 {
 791     MultiFDPacket_t *packet = p->packet;
 792     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 793     int i;
 794
 795     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 796     packet->version = cpu_to_be32(MULTIFD_VERSION);
 797     packet->flags = cpu_to_be32(p->flags);
 798     packet->pages_alloc = cpu_to_be32(page_max);
 799     packet->pages_used = cpu_to_be32(p->pages->used);
 800     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 801     packet->packet_num = cpu_to_be64(p->packet_num);
 802
 803     if (p->pages->block) {
 804         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 805     }
 806
 807     for (i = 0; i < p->pages->used; i++) {
 808         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 809     }
 810 }
 811
 812 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 813 {
 814     MultiFDPacket_t *packet = p->packet;
 815     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 816     RAMBlock *block;
 817     int i;
 818
 819     packet->magic = be32_to_cpu(packet->magic);
 820     if (packet->magic != MULTIFD_MAGIC) {
 821         error_setg(errp, "multifd: received packet "
 822                    "magic %x and expected magic %x",
 823                    packet->magic, MULTIFD_MAGIC);
 824         return -1;
 825     }
 826
 827     packet->version = be32_to_cpu(packet->version);
 828     if (packet->version != MULTIFD_VERSION) {
 829         error_setg(errp, "multifd: received packet "
 830                    "version %d and expected version %d",
 831                    packet->version, MULTIFD_VERSION);
 832         return -1;
 833     }
 834
 835     p->flags = be32_to_cpu(packet->flags);
 836
 837     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 838     /*
 839      * If we recevied a packet that is 100 times bigger than expected
 840      * just stop migration.  It is a magic number.
 841      */
 842     if (packet->pages_alloc > pages_max * 100) {
 843         error_setg(errp, "multifd: received packet "
 844                    "with size %d and expected a maximum size of %d",
 845                    packet->pages_alloc, pages_max * 100) ;
 846         return -1;
 847     }
 848     /*
 849      * We received a packet that is bigger than expected but inside
 850      * reasonable limits (see previous comment).  Just reallocate.
 851      */
 852     if (packet->pages_alloc > p->pages->allocated) {
 853         multifd_pages_clear(p->pages);
 854         p->pages = multifd_pages_init(packet->pages_alloc);
 855     }
 856
 857     p->pages->used = be32_to_cpu(packet->pages_used);
 858     if (p->pages->used > packet->pages_alloc) {
 859         error_setg(errp, "multifd: received packet "
 860                    "with %d pages and expected maximum pages are %d",
 861                    p->pages->used, packet->pages_alloc) ;
 862         return -1;
 863     }
 864
 865     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 866     p->packet_num = be64_to_cpu(packet->packet_num);
 867
 868     if (p->pages->used) {
 869         /* make sure that ramblock is 0 terminated */
 870         packet->ramblock[255] = 0;
 871         block = qemu_ram_block_by_name(packet->ramblock);
 872         if (!block) {
 873             error_setg(errp, "multifd: unknown ram block %s",
 874                        packet->ramblock);
 875             return -1;
 876         }
 877     }
 878
 879     for (i = 0; i < p->pages->used; i++) {
 880         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 881
 882         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 883             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 884                        " (max " RAM_ADDR_FMT ")",
 885                        offset, block->max_length);
 886             return -1;
 887         }
 888         p->pages->iov[i].iov_base = block->host + offset;
 889         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 890     }
 891
 892     return 0;
 893 }
 894
/* Global state shared by the migration thread and the send channels */
struct {
    /* one MultiFDSendParams per channel */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 908
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create one "pages" struct for each channel, plus a main one.  Each
 * time we need to send a batch of pages we swap the one owned by
 * multifd_send_state with the one owned by the channel that will send
 * it.  There are two reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * must have finished with its own, otherwise pending_job can't be
 * false.
 */
 926
 927 static void multifd_send_pages(void)
 928 {
 929     int i;
 930     static int next_channel;
 931     MultiFDSendParams *p = NULL; /* make happy gcc */
 932     MultiFDPages_t *pages = multifd_send_state->pages;
 933     uint64_t transferred;
 934
 935     qemu_sem_wait(&multifd_send_state->channels_ready);
 936     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 937         p = &multifd_send_state->params[i];
 938
 939         qemu_mutex_lock(&p->mutex);
 940         if (!p->pending_job) {
 941             p->pending_job++;
 942             next_channel = (i + 1) % migrate_multifd_channels();
 943             break;
 944         }
 945         qemu_mutex_unlock(&p->mutex);
 946     }
 947     p->pages->used = 0;
 948
 949     p->packet_num = multifd_send_state->packet_num++;
 950     p->pages->block = NULL;
 951     multifd_send_state->pages = p->pages;
 952     p->pages = pages;
 953     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 954     ram_counters.multifd_bytes += transferred;
 955     ram_counters.transferred += transferred;;
 956     qemu_mutex_unlock(&p->mutex);
 957     qemu_sem_post(&p->sem);
 958 }
 959
 960 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 961 {
 962     MultiFDPages_t *pages = multifd_send_state->pages;
 963
 964     if (!pages->block) {
 965         pages->block = block;
 966     }
 967
 968     if (pages->block == block) {
 969         pages->offset[pages->used] = offset;
 970         pages->iov[pages->used].iov_base = block->host + offset;
 971         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 972         pages->used++;
 973
 974         if (pages->used < pages->allocated) {
 975             return;
 976         }
 977     }
 978
 979     multifd_send_pages();
 980
 981     if (pages->block != block) {
 982         multifd_queue_page(block, offset);
 983     }
 984 }
 985
 986 static void multifd_send_terminate_threads(Error *err)
 987 {
 988     int i;
 989
 990     if (err) {
 991         MigrationState *s = migrate_get_current();
 992         migrate_set_error(s, err);
 993         if (s->state == MIGRATION_STATUS_SETUP ||
 994             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 995             s->state == MIGRATION_STATUS_DEVICE ||
 996             s->state == MIGRATION_STATUS_ACTIVE) {
 997             migrate_set_state(&s->state, s->state,
 998                               MIGRATION_STATUS_FAILED);
 999         }
1000     }
1001
1002     for (i = 0; i < migrate_multifd_channels(); i++) {
1003         MultiFDSendParams *p = &multifd_send_state->params[i];
1004
1005         qemu_mutex_lock(&p->mutex);
1006         p->quit = true;
1007         qemu_sem_post(&p->sem);
1008         qemu_mutex_unlock(&p->mutex);
1009     }
1010 }
1011
1012 void multifd_save_cleanup(void)
1013 {
1014     int i;
1015
1016     if (!migrate_use_multifd()) {
1017         return;
1018     }
1019     multifd_send_terminate_threads(NULL);
1020     for (i = 0; i < migrate_multifd_channels(); i++) {
1021         MultiFDSendParams *p = &multifd_send_state->params[i];
1022
1023         if (p->running) {
1024             qemu_thread_join(&p->thread);
1025         }
1026         socket_send_channel_destroy(p->c);
1027         p->c = NULL;
1028         qemu_mutex_destroy(&p->mutex);
1029         qemu_sem_destroy(&p->sem);
1030         qemu_sem_destroy(&p->sem_sync);
1031         g_free(p->name);
1032         p->name = NULL;
1033         multifd_pages_clear(p->pages);
1034         p->pages = NULL;
1035         p->packet_len = 0;
1036         g_free(p->packet);
1037         p->packet = NULL;
1038     }
1039     qemu_sem_destroy(&multifd_send_state->channels_ready);
1040     qemu_sem_destroy(&multifd_send_state->sem_sync);
1041     g_free(multifd_send_state->params);
1042     multifd_send_state->params = NULL;
1043     multifd_pages_clear(multifd_send_state->pages);
1044     multifd_send_state->pages = NULL;
1045     g_free(multifd_send_state);
1046     multifd_send_state = NULL;
1047 }
1048
1049 static void multifd_send_sync_main(void)
1050 {
1051     int i;
1052
1053     if (!migrate_use_multifd()) {
1054         return;
1055     }
1056     if (multifd_send_state->pages->used) {
1057         multifd_send_pages();
1058     }
1059     for (i = 0; i < migrate_multifd_channels(); i++) {
1060         MultiFDSendParams *p = &multifd_send_state->params[i];
1061
1062         trace_multifd_send_sync_main_signal(p->id);
1063
1064         qemu_mutex_lock(&p->mutex);
1065
1066         p->packet_num = multifd_send_state->packet_num++;
1067         p->flags |= MULTIFD_FLAG_SYNC;
1068         p->pending_job++;
1069         qemu_mutex_unlock(&p->mutex);
1070         qemu_sem_post(&p->sem);
1071     }
1072     for (i = 0; i < migrate_multifd_channels(); i++) {
1073         MultiFDSendParams *p = &multifd_send_state->params[i];
1074
1075         trace_multifd_send_sync_main_wait(p->id);
1076         qemu_sem_wait(&multifd_send_state->sem_sync);
1077     }
1078     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1079 }
1080
1081 static void *multifd_send_thread(void *opaque)
1082 {
1083     MultiFDSendParams *p = opaque;
1084     Error *local_err = NULL;
1085     int ret;
1086
1087     trace_multifd_send_thread_start(p->id);
1088     rcu_register_thread();
1089
1090     if (multifd_send_initial_packet(p, &local_err) < 0) {
1091         goto out;
1092     }
1093     /* initial packet */
1094     p->num_packets = 1;
1095
1096     while (true) {
1097         qemu_sem_wait(&p->sem);
1098         qemu_mutex_lock(&p->mutex);
1099
1100         if (p->pending_job) {
1101             uint32_t used = p->pages->used;
1102             uint64_t packet_num = p->packet_num;
1103             uint32_t flags = p->flags;
1104
1105             p->next_packet_size = used * qemu_target_page_size();
1106             multifd_send_fill_packet(p);
1107             p->flags = 0;
1108             p->num_packets++;
1109             p->num_pages += used;
1110             p->pages->used = 0;
1111             qemu_mutex_unlock(&p->mutex);
1112
1113             trace_multifd_send(p->id, packet_num, used, flags,
1114                                p->next_packet_size);
1115
1116             ret = qio_channel_write_all(p->c, (void *)p->packet,
1117                                         p->packet_len, &local_err);
1118             if (ret != 0) {
1119                 break;
1120             }
1121
1122             if (used) {
1123                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1124                                              used, &local_err);
1125                 if (ret != 0) {
1126                     break;
1127                 }
1128             }
1129
1130             qemu_mutex_lock(&p->mutex);
1131             p->pending_job--;
1132             qemu_mutex_unlock(&p->mutex);
1133
1134             if (flags & MULTIFD_FLAG_SYNC) {
1135                 qemu_sem_post(&multifd_send_state->sem_sync);
1136             }
1137             qemu_sem_post(&multifd_send_state->channels_ready);
1138         } else if (p->quit) {
1139             qemu_mutex_unlock(&p->mutex);
1140             break;
1141         } else {
1142             qemu_mutex_unlock(&p->mutex);
1143             /* sometimes there are spurious wakeups */
1144         }
1145     }
1146
1147 out:
1148     if (local_err) {
1149         multifd_send_terminate_threads(local_err);
1150     }
1151
1152     qemu_mutex_lock(&p->mutex);
1153     p->running = false;
1154     qemu_mutex_unlock(&p->mutex);
1155
1156     rcu_unregister_thread();
1157     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1158
1159     return NULL;
1160 }
1161
1162 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1163 {
1164     MultiFDSendParams *p = opaque;
1165     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1166     Error *local_err = NULL;
1167
1168     if (qio_task_propagate_error(task, &local_err)) {
1169         migrate_set_error(migrate_get_current(), local_err);
1170         multifd_save_cleanup();
1171     } else {
1172         p->c = QIO_CHANNEL(sioc);
1173         qio_channel_set_delay(p->c, false);
1174         p->running = true;
1175         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1176                            QEMU_THREAD_JOINABLE);
1177
1178         atomic_inc(&multifd_send_state->count);
1179     }
1180 }
1181
1182 int multifd_save_setup(void)
1183 {
1184     int thread_count;
1185     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1186     uint8_t i;
1187
1188     if (!migrate_use_multifd()) {
1189         return 0;
1190     }
1191     thread_count = migrate_multifd_channels();
1192     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1193     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1194     atomic_set(&multifd_send_state->count, 0);
1195     multifd_send_state->pages = multifd_pages_init(page_count);
1196     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1197     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1198
1199     for (i = 0; i < thread_count; i++) {
1200         MultiFDSendParams *p = &multifd_send_state->params[i];
1201
1202         qemu_mutex_init(&p->mutex);
1203         qemu_sem_init(&p->sem, 0);
1204         qemu_sem_init(&p->sem_sync, 0);
1205         p->quit = false;
1206         p->pending_job = 0;
1207         p->id = i;
1208         p->pages = multifd_pages_init(page_count);
1209         p->packet_len = sizeof(MultiFDPacket_t)
1210                       + sizeof(ram_addr_t) * page_count;
1211         p->packet = g_malloc0(p->packet_len);
1212         p->name = g_strdup_printf("multifdsend_%d", i);
1213         socket_send_channel_create(multifd_new_send_channel_async, p);
1214     }
1215     return 0;
1216 }
1217
/*
 * State of the multifd receive side.  It is allocated in
 * multifd_load_setup() and released in multifd_load_cleanup().
 */
struct {
    /* per-channel parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1227
1228 static void multifd_recv_terminate_threads(Error *err)
1229 {
1230     int i;
1231
1232     if (err) {
1233         MigrationState *s = migrate_get_current();
1234         migrate_set_error(s, err);
1235         if (s->state == MIGRATION_STATUS_SETUP ||
1236             s->state == MIGRATION_STATUS_ACTIVE) {
1237             migrate_set_state(&s->state, s->state,
1238                               MIGRATION_STATUS_FAILED);
1239         }
1240     }
1241
1242     for (i = 0; i < migrate_multifd_channels(); i++) {
1243         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1244
1245         qemu_mutex_lock(&p->mutex);
1246         /* We could arrive here for two reasons:
1247            - normal quit, i.e. everything went fine, just finished
1248            - error quit: We close the channels so the channel threads
1249              finish the qio_channel_read_all_eof() */
1250         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1251         qemu_mutex_unlock(&p->mutex);
1252     }
1253 }
1254
1255 int multifd_load_cleanup(Error **errp)
1256 {
1257     int i;
1258     int ret = 0;
1259
1260     if (!migrate_use_multifd()) {
1261         return 0;
1262     }
1263     multifd_recv_terminate_threads(NULL);
1264     for (i = 0; i < migrate_multifd_channels(); i++) {
1265         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1266
1267         if (p->running) {
1268             qemu_thread_join(&p->thread);
1269         }
1270         object_unref(OBJECT(p->c));
1271         p->c = NULL;
1272         qemu_mutex_destroy(&p->mutex);
1273         qemu_sem_destroy(&p->sem_sync);
1274         g_free(p->name);
1275         p->name = NULL;
1276         multifd_pages_clear(p->pages);
1277         p->pages = NULL;
1278         p->packet_len = 0;
1279         g_free(p->packet);
1280         p->packet = NULL;
1281     }
1282     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1283     g_free(multifd_recv_state->params);
1284     multifd_recv_state->params = NULL;
1285     g_free(multifd_recv_state);
1286     multifd_recv_state = NULL;
1287
1288     return ret;
1289 }
1290
1291 static void multifd_recv_sync_main(void)
1292 {
1293     int i;
1294
1295     if (!migrate_use_multifd()) {
1296         return;
1297     }
1298     for (i = 0; i < migrate_multifd_channels(); i++) {
1299         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1300
1301         trace_multifd_recv_sync_main_wait(p->id);
1302         qemu_sem_wait(&multifd_recv_state->sem_sync);
1303         qemu_mutex_lock(&p->mutex);
1304         if (multifd_recv_state->packet_num < p->packet_num) {
1305             multifd_recv_state->packet_num = p->packet_num;
1306         }
1307         qemu_mutex_unlock(&p->mutex);
1308     }
1309     for (i = 0; i < migrate_multifd_channels(); i++) {
1310         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1311
1312         trace_multifd_recv_sync_main_signal(p->id);
1313         qemu_sem_post(&p->sem_sync);
1314     }
1315     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1316 }
1317
1318 static void *multifd_recv_thread(void *opaque)
1319 {
1320     MultiFDRecvParams *p = opaque;
1321     Error *local_err = NULL;
1322     int ret;
1323
1324     trace_multifd_recv_thread_start(p->id);
1325     rcu_register_thread();
1326
1327     while (true) {
1328         uint32_t used;
1329         uint32_t flags;
1330
1331         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1332                                        p->packet_len, &local_err);
1333         if (ret == 0) {   /* EOF */
1334             break;
1335         }
1336         if (ret == -1) {   /* Error */
1337             break;
1338         }
1339
1340         qemu_mutex_lock(&p->mutex);
1341         ret = multifd_recv_unfill_packet(p, &local_err);
1342         if (ret) {
1343             qemu_mutex_unlock(&p->mutex);
1344             break;
1345         }
1346
1347         used = p->pages->used;
1348         flags = p->flags;
1349         trace_multifd_recv(p->id, p->packet_num, used, flags,
1350                            p->next_packet_size);
1351         p->num_packets++;
1352         p->num_pages += used;
1353         qemu_mutex_unlock(&p->mutex);
1354
1355         if (used) {
1356             ret = qio_channel_readv_all(p->c, p->pages->iov,
1357                                         used, &local_err);
1358             if (ret != 0) {
1359                 break;
1360             }
1361         }
1362
1363         if (flags & MULTIFD_FLAG_SYNC) {
1364             qemu_sem_post(&multifd_recv_state->sem_sync);
1365             qemu_sem_wait(&p->sem_sync);
1366         }
1367     }
1368
1369     if (local_err) {
1370         multifd_recv_terminate_threads(local_err);
1371     }
1372     qemu_mutex_lock(&p->mutex);
1373     p->running = false;
1374     qemu_mutex_unlock(&p->mutex);
1375
1376     rcu_unregister_thread();
1377     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1378
1379     return NULL;
1380 }
1381
1382 int multifd_load_setup(void)
1383 {
1384     int thread_count;
1385     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1386     uint8_t i;
1387
1388     if (!migrate_use_multifd()) {
1389         return 0;
1390     }
1391     thread_count = migrate_multifd_channels();
1392     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1393     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1394     atomic_set(&multifd_recv_state->count, 0);
1395     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1396
1397     for (i = 0; i < thread_count; i++) {
1398         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1399
1400         qemu_mutex_init(&p->mutex);
1401         qemu_sem_init(&p->sem_sync, 0);
1402         p->id = i;
1403         p->pages = multifd_pages_init(page_count);
1404         p->packet_len = sizeof(MultiFDPacket_t)
1405                       + sizeof(ram_addr_t) * page_count;
1406         p->packet = g_malloc0(p->packet_len);
1407         p->name = g_strdup_printf("multifdrecv_%d", i);
1408     }
1409     return 0;
1410 }
1411
1412 bool multifd_recv_all_channels_created(void)
1413 {
1414     int thread_count = migrate_multifd_channels();
1415
1416     if (!migrate_use_multifd()) {
1417         return true;
1418     }
1419
1420     return thread_count == atomic_read(&multifd_recv_state->count);
1421 }
1422
1423 /*
1424  * Try to receive all multifd channels to get ready for the migration.
1425  * - Return true and do not set @errp when correctly receving all channels;
1426  * - Return false and do not set @errp when correctly receiving the current one;
1427  * - Return false and set @errp when failing to receive the current channel.
1428  */
1429 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1430 {
1431     MultiFDRecvParams *p;
1432     Error *local_err = NULL;
1433     int id;
1434
1435     id = multifd_recv_initial_packet(ioc, &local_err);
1436     if (id < 0) {
1437         multifd_recv_terminate_threads(local_err);
1438         error_propagate_prepend(errp, local_err,
1439                                 "failed to receive packet"
1440                                 " via multifd channel %d: ",
1441                                 atomic_read(&multifd_recv_state->count));
1442         return false;
1443     }
1444
1445     p = &multifd_recv_state->params[id];
1446     if (p->c != NULL) {
1447         error_setg(&local_err, "multifd: received id '%d' already setup'",
1448                    id);
1449         multifd_recv_terminate_threads(local_err);
1450         error_propagate(errp, local_err);
1451         return false;
1452     }
1453     p->c = ioc;
1454     object_ref(OBJECT(ioc));
1455     /* initial packet */
1456     p->num_packets = 1;
1457
1458     p->running = true;
1459     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1460                        QEMU_THREAD_JOINABLE);
1461     atomic_inc(&multifd_recv_state->count);
1462     return atomic_read(&multifd_recv_state->count) ==
1463            migrate_multifd_channels();
1464 }
1465
1466 /**
1467  * save_page_header: write page header to wire
1468  *
1469  * If this is the 1st block, it also writes the block identification
1470  *
1471  * Returns the number of bytes written
1472  *
1473  * @f: QEMUFile where to send the data
1474  * @block: block that contains the page we want to send
1475  * @offset: offset inside the block for the page
1476  *          in the lower bits, it contains flags
1477  */
1478 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1479                                ram_addr_t offset)
1480 {
1481     size_t size, len;
1482
1483     if (block == rs->last_sent_block) {
1484         offset |= RAM_SAVE_FLAG_CONTINUE;
1485     }
1486     qemu_put_be64(f, offset);
1487     size = 8;
1488
1489     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1490         len = strlen(block->idstr);
1491         qemu_put_byte(f, len);
1492         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1493         size += 1 + len;
1494         rs->last_sent_block = block;
1495     }
1496     return size;
1497 }
1498
1499 /**
1500  * mig_throttle_guest_down: throotle down the guest
1501  *
1502  * Reduce amount of guest cpu execution to hopefully slow down memory
1503  * writes. If guest dirty memory rate is reduced below the rate at
1504  * which we can transfer pages to the destination then we should be
1505  * able to complete migration. Some workloads dirty memory way too
1506  * fast and will not effectively converge, even with auto-converge.
1507  */
1508 static void mig_throttle_guest_down(void)
1509 {
1510     MigrationState *s = migrate_get_current();
1511     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1512     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1513     int pct_max = s->parameters.max_cpu_throttle;
1514
1515     /* We have not started throttling yet. Let's start it. */
1516     if (!cpu_throttle_active()) {
1517         cpu_throttle_set(pct_initial);
1518     } else {
1519         /* Throttling already on, just increase the rate */
1520         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1521                          pct_max));
1522     }
1523 }
1524
1525 /**
1526  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1527  *
1528  * @rs: current RAM state
1529  * @current_addr: address for the zero page
1530  *
1531  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1532  * The important thing is that a stale (not-yet-0'd) page be replaced
1533  * by the new data.
1534  * As a bonus, if the page wasn't in the cache it gets added so that
1535  * when a small write is made into the 0'd page it gets XBZRLE sent.
1536  */
1537 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1538 {
1539     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1540         return;
1541     }
1542
1543     /* We don't care if this fails to allocate a new cache page
1544      * as long as it updated an old one */
1545     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1546                  ram_counters.dirty_sync_count);
1547 }
1548
1549 #define ENCODING_FLAG_XBZRLE 0x1
1550
1551 /**
1552  * save_xbzrle_page: compress and send current page
1553  *
1554  * Returns: 1 means that we wrote the page
1555  *          0 means that page is identical to the one already sent
1556  *          -1 means that xbzrle would be longer than normal
1557  *
1558  * @rs: current RAM state
1559  * @current_data: pointer to the address of the page contents
1560  * @current_addr: addr of the page
1561  * @block: block that contains the page we want to send
1562  * @offset: offset inside the block for the page
1563  * @last_stage: if we are at the completion stage
1564  */
1565 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1566                             ram_addr_t current_addr, RAMBlock *block,
1567                             ram_addr_t offset, bool last_stage)
1568 {
1569     int encoded_len = 0, bytes_xbzrle;
1570     uint8_t *prev_cached_page;
1571
1572     if (!cache_is_cached(XBZRLE.cache, current_addr,
1573                          ram_counters.dirty_sync_count)) {
1574         xbzrle_counters.cache_miss++;
1575         if (!last_stage) {
1576             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1577                              ram_counters.dirty_sync_count) == -1) {
1578                 return -1;
1579             } else {
1580                 /* update *current_data when the page has been
1581                    inserted into cache */
1582                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1583             }
1584         }
1585         return -1;
1586     }
1587
1588     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1589
1590     /* save current buffer into memory */
1591     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1592
1593     /* XBZRLE encoding (if there is no overflow) */
1594     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1595                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1596                                        TARGET_PAGE_SIZE);
1597     if (encoded_len == 0) {
1598         trace_save_xbzrle_page_skipping();
1599         return 0;
1600     } else if (encoded_len == -1) {
1601         trace_save_xbzrle_page_overflow();
1602         xbzrle_counters.overflow++;
1603         /* update data in the cache */
1604         if (!last_stage) {
1605             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1606             *current_data = prev_cached_page;
1607         }
1608         return -1;
1609     }
1610
1611     /* we need to update the data in the cache, in order to get the same data */
1612     if (!last_stage) {
1613         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1614     }
1615
1616     /* Send XBZRLE based compressed page */
1617     bytes_xbzrle = save_page_header(rs, rs->f, block,
1618                                     offset | RAM_SAVE_FLAG_XBZRLE);
1619     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1620     qemu_put_be16(rs->f, encoded_len);
1621     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1622     bytes_xbzrle += encoded_len + 1 + 2;
1623     xbzrle_counters.pages++;
1624     xbzrle_counters.bytes += bytes_xbzrle;
1625     ram_counters.transferred += bytes_xbzrle;
1626
1627     return 1;
1628 }
1629
1630 /**
1631  * migration_bitmap_find_dirty: find the next dirty page from start
1632  *
1633  * Returns the page offset within memory region of the start of a dirty page
1634  *
1635  * @rs: current RAM state
1636  * @rb: RAMBlock where to search for dirty pages
1637  * @start: page where we start the search
1638  */
1639 static inline
1640 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1641                                           unsigned long start)
1642 {
1643     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1644     unsigned long *bitmap = rb->bmap;
1645     unsigned long next;
1646
1647     if (ramblock_is_ignored(rb)) {
1648         return size;
1649     }
1650
1651     /*
1652      * When the free page optimization is enabled, we need to check the bitmap
1653      * to send the non-free pages rather than all the pages in the bulk stage.
1654      */
1655     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1656         next = start + 1;
1657     } else {
1658         next = find_next_bit(bitmap, size, start);
1659     }
1660
1661     return next;
1662 }
1663
1664 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1665                                                 RAMBlock *rb,
1666                                                 unsigned long page)
1667 {
1668     bool ret;
1669
1670     qemu_mutex_lock(&rs->bitmap_mutex);
1671     ret = test_and_clear_bit(page, rb->bmap);
1672
1673     if (ret) {
1674         rs->migration_dirty_pages--;
1675     }
1676     qemu_mutex_unlock(&rs->bitmap_mutex);
1677
1678     return ret;
1679 }
1680
1681 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1682                                         ram_addr_t length)
1683 {
1684     rs->migration_dirty_pages +=
1685         cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
1686                                               &rs->num_dirty_pages_period);
1687 }
1688
1689 /**
1690  * ram_pagesize_summary: calculate all the pagesizes of a VM
1691  *
1692  * Returns a summary bitmap of the page sizes of all RAMBlocks
1693  *
1694  * For VMs with just normal pages this is equivalent to the host page
1695  * size. If it's got some huge pages then it's the OR of all the
1696  * different page sizes.
1697  */
1698 uint64_t ram_pagesize_summary(void)
1699 {
1700     RAMBlock *block;
1701     uint64_t summary = 0;
1702
1703     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1704         summary |= block->page_size;
1705     }
1706
1707     return summary;
1708 }
1709
1710 uint64_t ram_get_total_transferred_pages(void)
1711 {
1712     return  ram_counters.normal + ram_counters.duplicate +
1713                 compression_counters.pages + xbzrle_counters.pages;
1714 }
1715
1716 static void migration_update_rates(RAMState *rs, int64_t end_time)
1717 {
1718     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1719     double compressed_size;
1720
1721     /* calculate period counters */
1722     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1723                 / (end_time - rs->time_last_bitmap_sync);
1724
1725     if (!page_count) {
1726         return;
1727     }
1728
1729     if (migrate_use_xbzrle()) {
1730         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1731             rs->xbzrle_cache_miss_prev) / page_count;
1732         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1733     }
1734
1735     if (migrate_use_compression()) {
1736         compression_counters.busy_rate = (double)(compression_counters.busy -
1737             rs->compress_thread_busy_prev) / page_count;
1738         rs->compress_thread_busy_prev = compression_counters.busy;
1739
1740         compressed_size = compression_counters.compressed_size -
1741                           rs->compressed_size_prev;
1742         if (compressed_size) {
1743             double uncompressed_size = (compression_counters.pages -
1744                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1745
1746             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1747             compression_counters.compression_rate =
1748                                         uncompressed_size / compressed_size;
1749
1750             rs->compress_pages_prev = compression_counters.pages;
1751             rs->compressed_size_prev = compression_counters.compressed_size;
1752         }
1753     }
1754 }
1755
1756 static void migration_bitmap_sync(RAMState *rs)
1757 {
1758     RAMBlock *block;
1759     int64_t end_time;
1760     uint64_t bytes_xfer_now;
1761
1762     ram_counters.dirty_sync_count++;
1763
1764     if (!rs->time_last_bitmap_sync) {
1765         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1766     }
1767
1768     trace_migration_bitmap_sync_start();
1769     memory_global_dirty_log_sync();
1770
1771     qemu_mutex_lock(&rs->bitmap_mutex);
1772     rcu_read_lock();
1773     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1774         migration_bitmap_sync_range(rs, block, block->used_length);
1775     }
1776     ram_counters.remaining = ram_bytes_remaining();
1777     rcu_read_unlock();
1778     qemu_mutex_unlock(&rs->bitmap_mutex);
1779
1780     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1781
1782     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1783
1784     /* more than 1 second = 1000 millisecons */
1785     if (end_time > rs->time_last_bitmap_sync + 1000) {
1786         bytes_xfer_now = ram_counters.transferred;
1787
1788         /* During block migration the auto-converge logic incorrectly detects
1789          * that ram migration makes no progress. Avoid this by disabling the
1790          * throttling logic during the bulk phase of block migration. */
1791         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1792             /* The following detection logic can be refined later. For now:
1793                Check to see if the dirtied bytes is 50% more than the approx.
1794                amount of bytes that just got transferred since the last time we
1795                were in this routine. If that happens twice, start or increase
1796                throttling */
1797
1798             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1799                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1800                 (++rs->dirty_rate_high_cnt >= 2)) {
1801                     trace_migration_throttle();
1802                     rs->dirty_rate_high_cnt = 0;
1803                     mig_throttle_guest_down();
1804             }
1805         }
1806
1807         migration_update_rates(rs, end_time);
1808
1809         rs->target_page_count_prev = rs->target_page_count;
1810
1811         /* reset period counters */
1812         rs->time_last_bitmap_sync = end_time;
1813         rs->num_dirty_pages_period = 0;
1814         rs->bytes_xfer_prev = bytes_xfer_now;
1815     }
1816     if (migrate_use_events()) {
1817         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1818     }
1819 }
1820
1821 static void migration_bitmap_sync_precopy(RAMState *rs)
1822 {
1823     Error *local_err = NULL;
1824
1825     /*
1826      * The current notifier usage is just an optimization to migration, so we
1827      * don't stop the normal migration process in the error case.
1828      */
1829     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1830         error_report_err(local_err);
1831     }
1832
1833     migration_bitmap_sync(rs);
1834
1835     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1836         error_report_err(local_err);
1837     }
1838 }
1839
1840 /**
1841  * save_zero_page_to_file: send the zero page to the file
1842  *
1843  * Returns the size of data written to the file, 0 means the page is not
1844  * a zero page
1845  *
1846  * @rs: current RAM state
1847  * @file: the file where the data is saved
1848  * @block: block that contains the page we want to send
1849  * @offset: offset inside the block for the page
1850  */
1851 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1852                                   RAMBlock *block, ram_addr_t offset)
1853 {
1854     uint8_t *p = block->host + offset;
1855     int len = 0;
1856
1857     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1858         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1859         qemu_put_byte(file, 0);
1860         len += 1;
1861     }
1862     return len;
1863 }
1864
1865 /**
1866  * save_zero_page: send the zero page to the stream
1867  *
1868  * Returns the number of pages written.
1869  *
1870  * @rs: current RAM state
1871  * @block: block that contains the page we want to send
1872  * @offset: offset inside the block for the page
1873  */
1874 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1875 {
1876     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1877
1878     if (len) {
1879         ram_counters.duplicate++;
1880         ram_counters.transferred += len;
1881         return 1;
1882     }
1883     return -1;
1884 }
1885
1886 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1887 {
1888     if (!migrate_release_ram() || !migration_in_postcopy()) {
1889         return;
1890     }
1891
1892     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1893 }
1894
1895 /*
1896  * @pages: the number of pages written by the control path,
1897  *        < 0 - error
1898  *        > 0 - number of pages written
1899  *
1900  * Return true if the pages has been saved, otherwise false is returned.
1901  */
1902 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1903                               int *pages)
1904 {
1905     uint64_t bytes_xmit = 0;
1906     int ret;
1907
1908     *pages = -1;
1909     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1910                                 &bytes_xmit);
1911     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1912         return false;
1913     }
1914
1915     if (bytes_xmit) {
1916         ram_counters.transferred += bytes_xmit;
1917         *pages = 1;
1918     }
1919
1920     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1921         return true;
1922     }
1923
1924     if (bytes_xmit > 0) {
1925         ram_counters.normal++;
1926     } else if (bytes_xmit == 0) {
1927         ram_counters.duplicate++;
1928     }
1929
1930     return true;
1931 }
1932
1933 /*
1934  * directly send the page to the stream
1935  *
1936  * Returns the number of pages written.
1937  *
1938  * @rs: current RAM state
1939  * @block: block that contains the page we want to send
1940  * @offset: offset inside the block for the page
1941  * @buf: the page to be sent
1942  * @async: send to page asyncly
1943  */
1944 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1945                             uint8_t *buf, bool async)
1946 {
1947     ram_counters.transferred += save_page_header(rs, rs->f, block,
1948                                                  offset | RAM_SAVE_FLAG_PAGE);
1949     if (async) {
1950         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1951                               migrate_release_ram() &
1952                               migration_in_postcopy());
1953     } else {
1954         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1955     }
1956     ram_counters.transferred += TARGET_PAGE_SIZE;
1957     ram_counters.normal++;
1958     return 1;
1959 }
1960
1961 /**
1962  * ram_save_page: send the given page to the stream
1963  *
1964  * Returns the number of pages written.
1965  *          < 0 - error
1966  *          >=0 - Number of pages written - this might legally be 0
1967  *                if xbzrle noticed the page was the same.
1968  *
1969  * @rs: current RAM state
1970  * @block: block that contains the page we want to send
1971  * @offset: offset inside the block for the page
1972  * @last_stage: if we are at the completion stage
1973  */
1974 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1975 {
1976     int pages = -1;
1977     uint8_t *p;
1978     bool send_async = true;
1979     RAMBlock *block = pss->block;
1980     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1981     ram_addr_t current_addr = block->offset + offset;
1982
1983     p = block->host + offset;
1984     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1985
1986     XBZRLE_cache_lock();
1987     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1988         migrate_use_xbzrle()) {
1989         pages = save_xbzrle_page(rs, &p, current_addr, block,
1990                                  offset, last_stage);
1991         if (!last_stage) {
1992             /* Can't send this cached data async, since the cache page
1993              * might get updated before it gets to the wire
1994              */
1995             send_async = false;
1996         }
1997     }
1998
1999     /* XBZRLE overflow or normal page */
2000     if (pages == -1) {
2001         pages = save_normal_page(rs, block, offset, p, send_async);
2002     }
2003
2004     XBZRLE_cache_unlock();
2005
2006     return pages;
2007 }
2008
2009 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2010                                  ram_addr_t offset)
2011 {
2012     multifd_queue_page(block, offset);
2013     ram_counters.normal++;
2014
2015     return 1;
2016 }
2017
2018 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2019                                  ram_addr_t offset, uint8_t *source_buf)
2020 {
2021     RAMState *rs = ram_state;
2022     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2023     bool zero_page = false;
2024     int ret;
2025
2026     if (save_zero_page_to_file(rs, f, block, offset)) {
2027         zero_page = true;
2028         goto exit;
2029     }
2030
2031     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2032
2033     /*
2034      * copy it to a internal buffer to avoid it being modified by VM
2035      * so that we can catch up the error during compression and
2036      * decompression
2037      */
2038     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2039     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2040     if (ret < 0) {
2041         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2042         error_report("compressed data failed!");
2043         return false;
2044     }
2045
2046 exit:
2047     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2048     return zero_page;
2049 }
2050
2051 static void
2052 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2053 {
2054     ram_counters.transferred += bytes_xmit;
2055
2056     if (param->zero_page) {
2057         ram_counters.duplicate++;
2058         return;
2059     }
2060
2061     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2062     compression_counters.compressed_size += bytes_xmit - 8;
2063     compression_counters.pages++;
2064 }
2065
2066 static bool save_page_use_compression(RAMState *rs);
2067
2068 static void flush_compressed_data(RAMState *rs)
2069 {
2070     int idx, len, thread_count;
2071
2072     if (!save_page_use_compression(rs)) {
2073         return;
2074     }
2075     thread_count = migrate_compress_threads();
2076
2077     qemu_mutex_lock(&comp_done_lock);
2078     for (idx = 0; idx < thread_count; idx++) {
2079         while (!comp_param[idx].done) {
2080             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2081         }
2082     }
2083     qemu_mutex_unlock(&comp_done_lock);
2084
2085     for (idx = 0; idx < thread_count; idx++) {
2086         qemu_mutex_lock(&comp_param[idx].mutex);
2087         if (!comp_param[idx].quit) {
2088             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2089             /*
2090              * it's safe to fetch zero_page without holding comp_done_lock
2091              * as there is no further request submitted to the thread,
2092              * i.e, the thread should be waiting for a request at this point.
2093              */
2094             update_compress_thread_counts(&comp_param[idx], len);
2095         }
2096         qemu_mutex_unlock(&comp_param[idx].mutex);
2097     }
2098 }
2099
2100 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2101                                        ram_addr_t offset)
2102 {
2103     param->block = block;
2104     param->offset = offset;
2105 }
2106
2107 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2108                                            ram_addr_t offset)
2109 {
2110     int idx, thread_count, bytes_xmit = -1, pages = -1;
2111     bool wait = migrate_compress_wait_thread();
2112
2113     thread_count = migrate_compress_threads();
2114     qemu_mutex_lock(&comp_done_lock);
2115 retry:
2116     for (idx = 0; idx < thread_count; idx++) {
2117         if (comp_param[idx].done) {
2118             comp_param[idx].done = false;
2119             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2120             qemu_mutex_lock(&comp_param[idx].mutex);
2121             set_compress_params(&comp_param[idx], block, offset);
2122             qemu_cond_signal(&comp_param[idx].cond);
2123             qemu_mutex_unlock(&comp_param[idx].mutex);
2124             pages = 1;
2125             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2126             break;
2127         }
2128     }
2129
2130     /*
2131      * wait for the free thread if the user specifies 'compress-wait-thread',
2132      * otherwise we will post the page out in the main thread as normal page.
2133      */
2134     if (pages < 0 && wait) {
2135         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2136         goto retry;
2137     }
2138     qemu_mutex_unlock(&comp_done_lock);
2139
2140     return pages;
2141 }
2142
2143 /**
2144  * find_dirty_block: find the next dirty page and update any state
2145  * associated with the search process.
2146  *
2147  * Returns true if a page is found
2148  *
2149  * @rs: current RAM state
2150  * @pss: data about the state of the current dirty page scan
2151  * @again: set to false if the search has scanned the whole of RAM
2152  */
2153 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2154 {
2155     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2156     if (pss->complete_round && pss->block == rs->last_seen_block &&
2157         pss->page >= rs->last_page) {
2158         /*
2159          * We've been once around the RAM and haven't found anything.
2160          * Give up.
2161          */
2162         *again = false;
2163         return false;
2164     }
2165     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2166         /* Didn't find anything in this RAM Block */
2167         pss->page = 0;
2168         pss->block = QLIST_NEXT_RCU(pss->block, next);
2169         if (!pss->block) {
2170             /*
2171              * If memory migration starts over, we will meet a dirtied page
2172              * which may still exists in compression threads's ring, so we
2173              * should flush the compressed data to make sure the new page
2174              * is not overwritten by the old one in the destination.
2175              *
2176              * Also If xbzrle is on, stop using the data compression at this
2177              * point. In theory, xbzrle can do better than compression.
2178              */
2179             flush_compressed_data(rs);
2180
2181             /* Hit the end of the list */
2182             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2183             /* Flag that we've looped */
2184             pss->complete_round = true;
2185             rs->ram_bulk_stage = false;
2186         }
2187         /* Didn't find anything this time, but try again on the new block */
2188         *again = true;
2189         return false;
2190     } else {
2191         /* Can go around again, but... */
2192         *again = true;
2193         /* We've found something so probably don't need to */
2194         return true;
2195     }
2196 }
2197
2198 /**
2199  * unqueue_page: gets a page of the queue
2200  *
2201  * Helper for 'get_queued_page' - gets a page off the queue
2202  *
2203  * Returns the block of the page (or NULL if none available)
2204  *
2205  * @rs: current RAM state
2206  * @offset: used to return the offset within the RAMBlock
2207  */
2208 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2209 {
2210     RAMBlock *block = NULL;
2211
2212     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2213         return NULL;
2214     }
2215
2216     qemu_mutex_lock(&rs->src_page_req_mutex);
2217     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2218         struct RAMSrcPageRequest *entry =
2219                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2220         block = entry->rb;
2221         *offset = entry->offset;
2222
2223         if (entry->len > TARGET_PAGE_SIZE) {
2224             entry->len -= TARGET_PAGE_SIZE;
2225             entry->offset += TARGET_PAGE_SIZE;
2226         } else {
2227             memory_region_unref(block->mr);
2228             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2229             g_free(entry);
2230             migration_consume_urgent_request();
2231         }
2232     }
2233     qemu_mutex_unlock(&rs->src_page_req_mutex);
2234
2235     return block;
2236 }
2237
2238 /**
2239  * get_queued_page: unqueue a page from the postcopy requests
2240  *
2241  * Skips pages that are already sent (!dirty)
2242  *
2243  * Returns true if a queued page is found
2244  *
2245  * @rs: current RAM state
2246  * @pss: data about the state of the current dirty page scan
2247  */
2248 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2249 {
2250     RAMBlock  *block;
2251     ram_addr_t offset;
2252     bool dirty;
2253
2254     do {
2255         block = unqueue_page(rs, &offset);
2256         /*
2257          * We're sending this page, and since it's postcopy nothing else
2258          * will dirty it, and we must make sure it doesn't get sent again
2259          * even if this queue request was received after the background
2260          * search already sent it.
2261          */
2262         if (block) {
2263             unsigned long page;
2264
2265             page = offset >> TARGET_PAGE_BITS;
2266             dirty = test_bit(page, block->bmap);
2267             if (!dirty) {
2268                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2269                        page, test_bit(page, block->unsentmap));
2270             } else {
2271                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2272             }
2273         }
2274
2275     } while (block && !dirty);
2276
2277     if (block) {
2278         /*
2279          * As soon as we start servicing pages out of order, then we have
2280          * to kill the bulk stage, since the bulk stage assumes
2281          * in (migration_bitmap_find_and_reset_dirty) that every page is
2282          * dirty, that's no longer true.
2283          */
2284         rs->ram_bulk_stage = false;
2285
2286         /*
2287          * We want the background search to continue from the queued page
2288          * since the guest is likely to want other pages near to the page
2289          * it just requested.
2290          */
2291         pss->block = block;
2292         pss->page = offset >> TARGET_PAGE_BITS;
2293     }
2294
2295     return !!block;
2296 }
2297
2298 /**
2299  * migration_page_queue_free: drop any remaining pages in the ram
2300  * request queue
2301  *
2302  * It should be empty at the end anyway, but in error cases there may
2303  * be some left.  in case that there is any page left, we drop it.
2304  *
2305  */
2306 static void migration_page_queue_free(RAMState *rs)
2307 {
2308     struct RAMSrcPageRequest *mspr, *next_mspr;
2309     /* This queue generally should be empty - but in the case of a failed
2310      * migration might have some droppings in.
2311      */
2312     rcu_read_lock();
2313     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2314         memory_region_unref(mspr->rb->mr);
2315         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2316         g_free(mspr);
2317     }
2318     rcu_read_unlock();
2319 }
2320
2321 /**
2322  * ram_save_queue_pages: queue the page for transmission
2323  *
2324  * A request from postcopy destination for example.
2325  *
2326  * Returns zero on success or negative on error
2327  *
2328  * @rbname: Name of the RAMBLock of the request. NULL means the
2329  *          same that last one.
2330  * @start: starting address from the start of the RAMBlock
2331  * @len: length (in bytes) to send
2332  */
2333 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2334 {
2335     RAMBlock *ramblock;
2336     RAMState *rs = ram_state;
2337
2338     ram_counters.postcopy_requests++;
2339     rcu_read_lock();
2340     if (!rbname) {
2341         /* Reuse last RAMBlock */
2342         ramblock = rs->last_req_rb;
2343
2344         if (!ramblock) {
2345             /*
2346              * Shouldn't happen, we can't reuse the last RAMBlock if
2347              * it's the 1st request.
2348              */
2349             error_report("ram_save_queue_pages no previous block");
2350             goto err;
2351         }
2352     } else {
2353         ramblock = qemu_ram_block_by_name(rbname);
2354
2355         if (!ramblock) {
2356             /* We shouldn't be asked for a non-existent RAMBlock */
2357             error_report("ram_save_queue_pages no block '%s'", rbname);
2358             goto err;
2359         }
2360         rs->last_req_rb = ramblock;
2361     }
2362     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2363     if (start+len > ramblock->used_length) {
2364         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2365                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2366                      __func__, start, len, ramblock->used_length);
2367         goto err;
2368     }
2369
2370     struct RAMSrcPageRequest *new_entry =
2371         g_malloc0(sizeof(struct RAMSrcPageRequest));
2372     new_entry->rb = ramblock;
2373     new_entry->offset = start;
2374     new_entry->len = len;
2375
2376     memory_region_ref(ramblock->mr);
2377     qemu_mutex_lock(&rs->src_page_req_mutex);
2378     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2379     migration_make_urgent_request();
2380     qemu_mutex_unlock(&rs->src_page_req_mutex);
2381     rcu_read_unlock();
2382
2383     return 0;
2384
2385 err:
2386     rcu_read_unlock();
2387     return -1;
2388 }
2389
2390 static bool save_page_use_compression(RAMState *rs)
2391 {
2392     if (!migrate_use_compression()) {
2393         return false;
2394     }
2395
2396     /*
2397      * If xbzrle is on, stop using the data compression after first
2398      * round of migration even if compression is enabled. In theory,
2399      * xbzrle can do better than compression.
2400      */
2401     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2402         return true;
2403     }
2404
2405     return false;
2406 }
2407
2408 /*
2409  * try to compress the page before posting it out, return true if the page
2410  * has been properly handled by compression, otherwise needs other
2411  * paths to handle it
2412  */
2413 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2414 {
2415     if (!save_page_use_compression(rs)) {
2416         return false;
2417     }
2418
2419     /*
2420      * When starting the process of a new block, the first page of
2421      * the block should be sent out before other pages in the same
2422      * block, and all the pages in last block should have been sent
2423      * out, keeping this order is important, because the 'cont' flag
2424      * is used to avoid resending the block name.
2425      *
2426      * We post the fist page as normal page as compression will take
2427      * much CPU resource.
2428      */
2429     if (block != rs->last_sent_block) {
2430         flush_compressed_data(rs);
2431         return false;
2432     }
2433
2434     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2435         return true;
2436     }
2437
2438     compression_counters.busy++;
2439     return false;
2440 }
2441
2442 /**
2443  * ram_save_target_page: save one target page
2444  *
2445  * Returns the number of pages written
2446  *
2447  * @rs: current RAM state
2448  * @pss: data about the page we want to send
2449  * @last_stage: if we are at the completion stage
2450  */
2451 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2452                                 bool last_stage)
2453 {
2454     RAMBlock *block = pss->block;
2455     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2456     int res;
2457
2458     if (control_save_page(rs, block, offset, &res)) {
2459         return res;
2460     }
2461
2462     if (save_compress_page(rs, block, offset)) {
2463         return 1;
2464     }
2465
2466     res = save_zero_page(rs, block, offset);
2467     if (res > 0) {
2468         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2469          * page would be stale
2470          */
2471         if (!save_page_use_compression(rs)) {
2472             XBZRLE_cache_lock();
2473             xbzrle_cache_zero_page(rs, block->offset + offset);
2474             XBZRLE_cache_unlock();
2475         }
2476         ram_release_pages(block->idstr, offset, res);
2477         return res;
2478     }
2479
2480     /*
2481      * do not use multifd for compression as the first page in the new
2482      * block should be posted out before sending the compressed page
2483      */
2484     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2485         return ram_save_multifd_page(rs, block, offset);
2486     }
2487
2488     return ram_save_page(rs, pss, last_stage);
2489 }
2490
2491 /**
2492  * ram_save_host_page: save a whole host page
2493  *
2494  * Starting at *offset send pages up to the end of the current host
2495  * page. It's valid for the initial offset to point into the middle of
2496  * a host page in which case the remainder of the hostpage is sent.
2497  * Only dirty target pages are sent. Note that the host page size may
2498  * be a huge page for this block.
2499  * The saving stops at the boundary of the used_length of the block
2500  * if the RAMBlock isn't a multiple of the host page size.
2501  *
2502  * Returns the number of pages written or negative on error
2503  *
2504  * @rs: current RAM state
2505  * @ms: current migration state
2506  * @pss: data about the page we want to send
2507  * @last_stage: if we are at the completion stage
2508  */
2509 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2510                               bool last_stage)
2511 {
2512     int tmppages, pages = 0;
2513     size_t pagesize_bits =
2514         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2515
2516     if (ramblock_is_ignored(pss->block)) {
2517         error_report("block %s should not be migrated !", pss->block->idstr);
2518         return 0;
2519     }
2520
2521     do {
2522         /* Check the pages is dirty and if it is send it */
2523         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2524             pss->page++;
2525             continue;
2526         }
2527
2528         tmppages = ram_save_target_page(rs, pss, last_stage);
2529         if (tmppages < 0) {
2530             return tmppages;
2531         }
2532
2533         pages += tmppages;
2534         if (pss->block->unsentmap) {
2535             clear_bit(pss->page, pss->block->unsentmap);
2536         }
2537
2538         pss->page++;
2539     } while ((pss->page & (pagesize_bits - 1)) &&
2540              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2541
2542     /* The offset we leave with is the last one we looked at */
2543     pss->page--;
2544     return pages;
2545 }
2546
2547 /**
2548  * ram_find_and_save_block: finds a dirty page and sends it to f
2549  *
2550  * Called within an RCU critical section.
2551  *
2552  * Returns the number of pages written where zero means no dirty pages,
2553  * or negative on error
2554  *
2555  * @rs: current RAM state
2556  * @last_stage: if we are at the completion stage
2557  *
2558  * On systems where host-page-size > target-page-size it will send all the
2559  * pages in a host page that are dirty.
2560  */
2561
2562 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2563 {
2564     PageSearchStatus pss;
2565     int pages = 0;
2566     bool again, found;
2567
2568     /* No dirty page as there is zero RAM */
2569     if (!ram_bytes_total()) {
2570         return pages;
2571     }
2572
2573     pss.block = rs->last_seen_block;
2574     pss.page = rs->last_page;
2575     pss.complete_round = false;
2576
2577     if (!pss.block) {
2578         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2579     }
2580
2581     do {
2582         again = true;
2583         found = get_queued_page(rs, &pss);
2584
2585         if (!found) {
2586             /* priority queue empty, so just search for something dirty */
2587             found = find_dirty_block(rs, &pss, &again);
2588         }
2589
2590         if (found) {
2591             pages = ram_save_host_page(rs, &pss, last_stage);
2592         }
2593     } while (!pages && again);
2594
2595     rs->last_seen_block = pss.block;
2596     rs->last_page = pss.page;
2597
2598     return pages;
2599 }
2600
2601 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2602 {
2603     uint64_t pages = size / TARGET_PAGE_SIZE;
2604
2605     if (zero) {
2606         ram_counters.duplicate += pages;
2607     } else {
2608         ram_counters.normal += pages;
2609         ram_counters.transferred += size;
2610         qemu_update_position(f, size);
2611     }
2612 }
2613
2614 static uint64_t ram_bytes_total_common(bool count_ignored)
2615 {
2616     RAMBlock *block;
2617     uint64_t total = 0;
2618
2619     rcu_read_lock();
2620     if (count_ignored) {
2621         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2622             total += block->used_length;
2623         }
2624     } else {
2625         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2626             total += block->used_length;
2627         }
2628     }
2629     rcu_read_unlock();
2630     return total;
2631 }
2632
/* Total used length, in bytes, of all RAMBlocks that are not ignored. */
uint64_t ram_bytes_total(void)
{
    uint64_t total = ram_bytes_total_common(false);

    return total;
}
2637
2638 static void xbzrle_load_setup(void)
2639 {
2640     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2641 }
2642
2643 static void xbzrle_load_cleanup(void)
2644 {
2645     g_free(XBZRLE.decoded_buf);
2646     XBZRLE.decoded_buf = NULL;
2647 }
2648
2649 static void ram_state_cleanup(RAMState **rsp)
2650 {
2651     if (*rsp) {
2652         migration_page_queue_free(*rsp);
2653         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2654         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2655         g_free(*rsp);
2656         *rsp = NULL;
2657     }
2658 }
2659
2660 static void xbzrle_cleanup(void)
2661 {
2662     XBZRLE_cache_lock();
2663     if (XBZRLE.cache) {
2664         cache_fini(XBZRLE.cache);
2665         g_free(XBZRLE.encoded_buf);
2666         g_free(XBZRLE.current_buf);
2667         g_free(XBZRLE.zero_target_page);
2668         XBZRLE.cache = NULL;
2669         XBZRLE.encoded_buf = NULL;
2670         XBZRLE.current_buf = NULL;
2671         XBZRLE.zero_target_page = NULL;
2672     }
2673     XBZRLE_cache_unlock();
2674 }
2675
2676 static void ram_save_cleanup(void *opaque)
2677 {
2678     RAMState **rsp = opaque;
2679     RAMBlock *block;
2680
2681     /* caller have hold iothread lock or is in a bh, so there is
2682      * no writing race against the migration bitmap
2683      */
2684     memory_global_dirty_log_stop();
2685
2686     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2687         g_free(block->bmap);
2688         block->bmap = NULL;
2689         g_free(block->unsentmap);
2690         block->unsentmap = NULL;
2691     }
2692
2693     xbzrle_cleanup();
2694     compress_threads_save_cleanup();
2695     ram_state_cleanup(rsp);
2696 }
2697
2698 static void ram_state_reset(RAMState *rs)
2699 {
2700     rs->last_seen_block = NULL;
2701     rs->last_sent_block = NULL;
2702     rs->last_page = 0;
2703     rs->last_version = ram_list.version;
2704     rs->ram_bulk_stage = true;
2705     rs->fpo_enabled = false;
2706 }
2707
2708 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2709
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * Set bits are printed as '1' and clear bits as '.'.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: the bitmap to dump; if it is NULL nothing is printed
 * @expected: the value most bits are expected to have
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* There is no bitmap to fall back to, so a NULL bitmap dumps nothing */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2743
2744 /* **** functions for postcopy ***** */
2745
2746 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2747 {
2748     struct RAMBlock *block;
2749
2750     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2751         unsigned long *bitmap = block->bmap;
2752         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2753         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2754
2755         while (run_start < range) {
2756             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2757             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2758                               (run_end - run_start) << TARGET_PAGE_BITS);
2759             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2760         }
2761     }
2762 }
2763
2764 /**
2765  * postcopy_send_discard_bm_ram: discard a RAMBlock
2766  *
2767  * Returns zero on success
2768  *
2769  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2770  * Note: At this point the 'unsentmap' is the processed bitmap combined
2771  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2772  *
2773  * @ms: current migration state
2774  * @pds: state for postcopy
2775  * @start: RAMBlock starting page
2776  * @length: RAMBlock size
2777  */
2778 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2779                                         PostcopyDiscardState *pds,
2780                                         RAMBlock *block)
2781 {
2782     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2783     unsigned long current;
2784     unsigned long *unsentmap = block->unsentmap;
2785
2786     for (current = 0; current < end; ) {
2787         unsigned long one = find_next_bit(unsentmap, end, current);
2788
2789         if (one <= end) {
2790             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2791             unsigned long discard_length;
2792
2793             if (zero >= end) {
2794                 discard_length = end - one;
2795             } else {
2796                 discard_length = zero - one;
2797             }
2798             if (discard_length) {
2799                 postcopy_discard_send_range(ms, pds, one, discard_length);
2800             }
2801             current = one + discard_length;
2802         } else {
2803             current = one;
2804         }
2805     }
2806
2807     return 0;
2808 }
2809
2810 /**
2811  * postcopy_each_ram_send_discard: discard all RAMBlocks
2812  *
2813  * Returns 0 for success or negative for error
2814  *
2815  * Utility for the outgoing postcopy code.
2816  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2817  *   passing it bitmap indexes and name.
2818  * (qemu_ram_foreach_block ends up passing unscaled lengths
2819  *  which would mean postcopy code would have to deal with target page)
2820  *
2821  * @ms: current migration state
2822  */
2823 static int postcopy_each_ram_send_discard(MigrationState *ms)
2824 {
2825     struct RAMBlock *block;
2826     int ret;
2827
2828     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2829         PostcopyDiscardState *pds =
2830             postcopy_discard_send_init(ms, block->idstr);
2831
2832         /*
2833          * Postcopy sends chunks of bitmap over the wire, but it
2834          * just needs indexes at this point, avoids it having
2835          * target page specific code.
2836          */
2837         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2838         postcopy_discard_send_finish(ms, pds);
2839         if (ret) {
2840             return ret;
2841         }
2842     }
2843
2844     return 0;
2845 }
2846
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * Any host page found to be only partially sent (unsent pass) or only
 * partially dirty (dirty pass) ends up fully marked in both
 * block->unsentmap and block->bmap, and rs->migration_dirty_pages is
 * increased for each page that was not dirty before.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* Only meaningful when do_fixup is true */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /*
         * run_start currently holds the position to resume scanning from;
         * turn it into the start of the next run (or >= pages to stop).
         */
        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
2973
2974 /**
2975  * postcopy_chuck_hostpages: discrad any partially sent host page
2976  *
2977  * Utility for the outgoing postcopy code.
2978  *
2979  * Discard any partially sent host-page size chunks, mark any partially
2980  * dirty host-page size chunks as all dirty.  In this case the host-page
2981  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2982  *
2983  * Returns zero on success
2984  *
2985  * @ms: current migration state
2986  * @block: block we want to work with
2987  */
2988 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2989 {
2990     PostcopyDiscardState *pds =
2991         postcopy_discard_send_init(ms, block->idstr);
2992
2993     /* First pass: Discard all partially sent host pages */
2994     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2995     /*
2996      * Second pass: Ensure that all partially dirty host pages are made
2997      * fully dirty.
2998      */
2999     postcopy_chunk_hostpages_pass(ms, false, block, pds);
3000
3001     postcopy_discard_send_finish(ms, pds);
3002     return 0;
3003 }
3004
3005 /**
3006  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3007  *
3008  * Returns zero on success
3009  *
3010  * Transmit the set of pages to be discarded after precopy to the target
3011  * these are pages that:
3012  *     a) Have been previously transmitted but are now dirty again
3013  *     b) Pages that have never been transmitted, this ensures that
3014  *        any pages on the destination that have been mapped by background
3015  *        tasks get discarded (transparent huge pages is the specific concern)
3016  * Hopefully this is pretty sparse
3017  *
3018  * @ms: current migration state
3019  */
3020 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3021 {
3022     RAMState *rs = ram_state;
3023     RAMBlock *block;
3024     int ret;
3025
3026     rcu_read_lock();
3027
3028     /* This should be our last sync, the src is now paused */
3029     migration_bitmap_sync(rs);
3030
3031     /* Easiest way to make sure we don't resume in the middle of a host-page */
3032     rs->last_seen_block = NULL;
3033     rs->last_sent_block = NULL;
3034     rs->last_page = 0;
3035
3036     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3037         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3038         unsigned long *bitmap = block->bmap;
3039         unsigned long *unsentmap = block->unsentmap;
3040
3041         if (!unsentmap) {
3042             /* We don't have a safe way to resize the sentmap, so
3043              * if the bitmap was resized it will be NULL at this
3044              * point.
3045              */
3046             error_report("migration ram resized during precopy phase");
3047             rcu_read_unlock();
3048             return -EINVAL;
3049         }
3050         /* Deal with TPS != HPS and huge pages */
3051         ret = postcopy_chunk_hostpages(ms, block);
3052         if (ret) {
3053             rcu_read_unlock();
3054             return ret;
3055         }
3056
3057         /*
3058          * Update the unsentmap to be unsentmap = unsentmap | dirty
3059          */
3060         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3061 #ifdef DEBUG_POSTCOPY
3062         ram_debug_dump_bitmap(unsentmap, true, pages);
3063 #endif
3064     }
3065     trace_ram_postcopy_send_discard_bitmap();
3066
3067     ret = postcopy_each_ram_send_discard(ms);
3068     rcu_read_unlock();
3069
3070     return ret;
3071 }
3072
3073 /**
3074  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3075  *
3076  * Returns zero on success
3077  *
3078  * @rbname: name of the RAMBlock of the request. NULL means the
3079  *          same that last one.
3080  * @start: RAMBlock starting page
3081  * @length: RAMBlock size
3082  */
3083 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3084 {
3085     int ret = -1;
3086
3087     trace_ram_discard_range(rbname, start, length);
3088
3089     rcu_read_lock();
3090     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3091
3092     if (!rb) {
3093         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3094         goto err;
3095     }
3096
3097     /*
3098      * On source VM, we don't need to update the received bitmap since
3099      * we don't even have one.
3100      */
3101     if (rb->receivedmap) {
3102         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3103                      length >> qemu_target_page_bits());
3104     }
3105
3106     ret = ram_block_discard_range(rb, start, length);
3107
3108 err:
3109     rcu_read_unlock();
3110
3111     return ret;
3112 }
3113
3114 /*
3115  * For every allocation, we will try not to crash the VM if the
3116  * allocation failed.
3117  */
3118 static int xbzrle_init(void)
3119 {
3120     Error *local_err = NULL;
3121
3122     if (!migrate_use_xbzrle()) {
3123         return 0;
3124     }
3125
3126     XBZRLE_cache_lock();
3127
3128     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3129     if (!XBZRLE.zero_target_page) {
3130         error_report("%s: Error allocating zero page", __func__);
3131         goto err_out;
3132     }
3133
3134     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3135                               TARGET_PAGE_SIZE, &local_err);
3136     if (!XBZRLE.cache) {
3137         error_report_err(local_err);
3138         goto free_zero_page;
3139     }
3140
3141     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3142     if (!XBZRLE.encoded_buf) {
3143         error_report("%s: Error allocating encoded_buf", __func__);
3144         goto free_cache;
3145     }
3146
3147     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3148     if (!XBZRLE.current_buf) {
3149         error_report("%s: Error allocating current_buf", __func__);
3150         goto free_encoded_buf;
3151     }
3152
3153     /* We are all good */
3154     XBZRLE_cache_unlock();
3155     return 0;
3156
3157 free_encoded_buf:
3158     g_free(XBZRLE.encoded_buf);
3159     XBZRLE.encoded_buf = NULL;
3160 free_cache:
3161     cache_fini(XBZRLE.cache);
3162     XBZRLE.cache = NULL;
3163 free_zero_page:
3164     g_free(XBZRLE.zero_target_page);
3165     XBZRLE.zero_target_page = NULL;
3166 err_out:
3167     XBZRLE_cache_unlock();
3168     return -ENOMEM;
3169 }
3170
3171 static int ram_state_init(RAMState **rsp)
3172 {
3173     *rsp = g_try_new0(RAMState, 1);
3174
3175     if (!*rsp) {
3176         error_report("%s: Init ramstate fail", __func__);
3177         return -1;
3178     }
3179
3180     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3181     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3182     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3183
3184     /*
3185      * Count the total number of pages used by ram blocks not including any
3186      * gaps due to alignment or unplugs.
3187      */
3188     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3189
3190     ram_state_reset(*rsp);
3191
3192     return 0;
3193 }
3194
3195 static void ram_list_init_bitmaps(void)
3196 {
3197     RAMBlock *block;
3198     unsigned long pages;
3199
3200     /* Skip setting bitmap if there is no RAM */
3201     if (ram_bytes_total()) {
3202         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3203             pages = block->max_length >> TARGET_PAGE_BITS;
3204             block->bmap = bitmap_new(pages);
3205             bitmap_set(block->bmap, 0, pages);
3206             if (migrate_postcopy_ram()) {
3207                 block->unsentmap = bitmap_new(pages);
3208                 bitmap_set(block->unsentmap, 0, pages);
3209             }
3210         }
3211     }
3212 }
3213
/*
 * Initialise the RAMBlock bitmaps, start global dirty logging and run a
 * first precopy bitmap sync for @rs.
 *
 * All three steps run with the iothread lock, the ramlist lock and an
 * RCU read-side critical section held, taken in that order.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    /* Release in the reverse order of acquisition */
    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3229
3230 static int ram_init_all(RAMState **rsp)
3231 {
3232     if (ram_state_init(rsp)) {
3233         return -1;
3234     }
3235
3236     if (xbzrle_init()) {
3237         ram_state_cleanup(rsp);
3238         return -1;
3239     }
3240
3241     ram_init_bitmaps(*rsp);
3242
3243     return 0;
3244 }
3245
3246 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3247 {
3248     RAMBlock *block;
3249     uint64_t pages = 0;
3250
3251     /*
3252      * Postcopy is not using xbzrle/compression, so no need for that.
3253      * Also, since source are already halted, we don't need to care
3254      * about dirty page logging as well.
3255      */
3256
3257     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3258         pages += bitmap_count_one(block->bmap,
3259                                   block->used_length >> TARGET_PAGE_BITS);
3260     }
3261
3262     /* This may not be aligned with current bitmaps. Recalculate. */
3263     rs->migration_dirty_pages = pages;
3264
3265     rs->last_seen_block = NULL;
3266     rs->last_sent_block = NULL;
3267     rs->last_page = 0;
3268     rs->last_version = ram_list.version;
3269     /*
3270      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3271      * matter what we have sent.
3272      */
3273     rs->ram_bulk_stage = false;
3274
3275     /* Update RAMState cache of output QEMUFile */
3276     rs->f = out;
3277
3278     trace_ram_state_resume_prepare(pages);
3279 }
3280
3281 /*
3282  * This function clears bits of the free pages reported by the caller from the
3283  * migration dirty bitmap. @addr is the host address corresponding to the
3284  * start of the continuous guest free pages, and @len is the total bytes of
3285  * those pages.
3286  */
3287 void qemu_guest_free_page_hint(void *addr, size_t len)
3288 {
3289     RAMBlock *block;
3290     ram_addr_t offset;
3291     size_t used_len, start, npages;
3292     MigrationState *s = migrate_get_current();
3293
3294     /* This function is currently expected to be used during live migration */
3295     if (!migration_is_setup_or_active(s->state)) {
3296         return;
3297     }
3298
3299     for (; len > 0; len -= used_len, addr += used_len) {
3300         block = qemu_ram_block_from_host(addr, false, &offset);
3301         if (unlikely(!block || offset >= block->used_length)) {
3302             /*
3303              * The implementation might not support RAMBlock resize during
3304              * live migration, but it could happen in theory with future
3305              * updates. So we add a check here to capture that case.
3306              */
3307             error_report_once("%s unexpected error", __func__);
3308             return;
3309         }
3310
3311         if (len <= block->used_length - offset) {
3312             used_len = len;
3313         } else {
3314             used_len = block->used_length - offset;
3315         }
3316
3317         start = offset >> TARGET_PAGE_BITS;
3318         npages = used_len >> TARGET_PAGE_BITS;
3319
3320         qemu_mutex_lock(&ram_state->bitmap_mutex);
3321         ram_state->migration_dirty_pages -=
3322                       bitmap_count_one_with_offset(block->bmap, start, npages);
3323         bitmap_clear(block->bmap, start, npages);
3324         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3325     }
3326 }
3327
3328 /*
3329  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3330  * long-running RCU critical section.  When rcu-reclaims in the code
3331  * start to become numerous it will be necessary to reduce the
3332  * granularity of these critical sections.
3333  */
3334
3335 /**
3336  * ram_save_setup: Setup RAM for migration
3337  *
3338  * Returns zero to indicate success and negative for error
3339  *
3340  * @f: QEMUFile where to send the data
3341  * @opaque: RAMState pointer
3342  */
3343 static int ram_save_setup(QEMUFile *f, void *opaque)
3344 {
3345     RAMState **rsp = opaque;
3346     RAMBlock *block;
3347
3348     if (compress_threads_save_setup()) {
3349         return -1;
3350     }
3351
3352     /* migration has already setup the bitmap, reuse it. */
3353     if (!migration_in_colo_state()) {
3354         if (ram_init_all(rsp) != 0) {
3355             compress_threads_save_cleanup();
3356             return -1;
3357         }
3358     }
3359     (*rsp)->f = f;
3360
3361     rcu_read_lock();
3362
3363     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3364
3365     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3366         qemu_put_byte(f, strlen(block->idstr));
3367         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3368         qemu_put_be64(f, block->used_length);
3369         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3370             qemu_put_be64(f, block->page_size);
3371         }
3372         if (migrate_ignore_shared()) {
3373             qemu_put_be64(f, block->mr->addr);
3374             qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3375         }
3376     }
3377
3378     rcu_read_unlock();
3379
3380     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3381     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3382
3383     multifd_send_sync_main();
3384     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3385     qemu_fflush(f);
3386
3387     return 0;
3388 }
3389
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns a negative value if the QEMUFile reports an error; otherwise
 * returns 1 if ram_find_and_save_block() reported that no pages are left
 * to send during this call, and 0 if not.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    /* Set to 1 once ram_find_and_save_block() returns 0 */
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    /*
     * Keep sending while the rate limit allows it, or while there are
     * still queued page requests in rs->src_page_requests.
     */
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }

        if (pages < 0) {
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            /* Elapsed time since t0, converted from ns to ms */
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    multifd_send_sync_main();
out:
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8-byte EOS marker just written */
    ram_counters.transferred += 8;

    /* The loop's rate-limit value in ret is replaced by the file status */
    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3483
3484 /**
3485  * ram_save_complete: function called to send the remaining amount of ram
3486  *
3487  * Returns zero to indicate success or negative on error
3488  *
3489  * Called with iothread lock
3490  *
3491  * @f: QEMUFile where to send the data
3492  * @opaque: RAMState pointer
3493  */
3494 static int ram_save_complete(QEMUFile *f, void *opaque)
3495 {
3496     RAMState **temp = opaque;
3497     RAMState *rs = *temp;
3498     int ret = 0;
3499
3500     rcu_read_lock();
3501
3502     if (!migration_in_postcopy()) {
3503         migration_bitmap_sync_precopy(rs);
3504     }
3505
3506     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3507
3508     /* try transferring iterative blocks of memory */
3509
3510     /* flush all remaining blocks regardless of rate limiting */
3511     while (true) {
3512         int pages;
3513
3514         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3515         /* no more blocks to sent */
3516         if (pages == 0) {
3517             break;
3518         }
3519         if (pages < 0) {
3520             ret = pages;
3521             break;
3522         }
3523     }
3524
3525     flush_compressed_data(rs);
3526     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3527
3528     rcu_read_unlock();
3529
3530     multifd_send_sync_main();
3531     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3532     qemu_fflush(f);
3533
3534     return ret;
3535 }
3536
3537 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3538                              uint64_t *res_precopy_only,
3539                              uint64_t *res_compatible,
3540                              uint64_t *res_postcopy_only)
3541 {
3542     RAMState **temp = opaque;
3543     RAMState *rs = *temp;
3544     uint64_t remaining_size;
3545
3546     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3547
3548     if (!migration_in_postcopy() &&
3549         remaining_size < max_size) {
3550         qemu_mutex_lock_iothread();
3551         rcu_read_lock();
3552         migration_bitmap_sync_precopy(rs);
3553         rcu_read_unlock();
3554         qemu_mutex_unlock_iothread();
3555         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3556     }
3557
3558     if (migrate_postcopy_ram()) {
3559         /* We can do postcopy, and all the data is postcopiable */
3560         *res_compatible += remaining_size;
3561     } else {
3562         *res_precopy_only += remaining_size;
3563     }
3564 }
3565
3566 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3567 {
3568     unsigned int xh_len;
3569     int xh_flags;
3570     uint8_t *loaded_data;
3571
3572     /* extract RLE header */
3573     xh_flags = qemu_get_byte(f);
3574     xh_len = qemu_get_be16(f);
3575
3576     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3577         error_report("Failed to load XBZRLE page - wrong compression!");
3578         return -1;
3579     }
3580
3581     if (xh_len > TARGET_PAGE_SIZE) {
3582         error_report("Failed to load XBZRLE page - len overflow!");
3583         return -1;
3584     }
3585     loaded_data = XBZRLE.decoded_buf;
3586     /* load data and decode */
3587     /* it can change loaded_data to point to an internal buffer */
3588     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3589
3590     /* decode RLE */
3591     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3592                              TARGET_PAGE_SIZE) == -1) {
3593         error_report("Failed to load XBZRLE page - decode error!");
3594         return -1;
3595     }
3596
3597     return 0;
3598 }
3599
3600 /**
3601  * ram_block_from_stream: read a RAMBlock id from the migration stream
3602  *
3603  * Must be called from within a rcu critical section.
3604  *
3605  * Returns a pointer from within the RCU-protected ram_list.
3606  *
3607  * @f: QEMUFile where to read the data from
3608  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3609  */
3610 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3611 {
3612     static RAMBlock *block = NULL;
3613     char id[256];
3614     uint8_t len;
3615
3616     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3617         if (!block) {
3618             error_report("Ack, bad migration stream!");
3619             return NULL;
3620         }
3621         return block;
3622     }
3623
3624     len = qemu_get_byte(f);
3625     qemu_get_buffer(f, (uint8_t *)id, len);
3626     id[len] = 0;
3627
3628     block = qemu_ram_block_by_name(id);
3629     if (!block) {
3630         error_report("Can't find block %s", id);
3631         return NULL;
3632     }
3633
3634     if (ramblock_is_ignored(block)) {
3635         error_report("block %s should not be migrated !", id);
3636         return NULL;
3637     }
3638
3639     return block;
3640 }
3641
3642 static inline void *host_from_ram_block_offset(RAMBlock *block,
3643                                                ram_addr_t offset)
3644 {
3645     if (!offset_in_ramblock(block, offset)) {
3646         return NULL;
3647     }
3648
3649     return block->host + offset;
3650 }
3651
3652 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3653                                                  ram_addr_t offset)
3654 {
3655     if (!offset_in_ramblock(block, offset)) {
3656         return NULL;
3657     }
3658     if (!block->colo_cache) {
3659         error_report("%s: colo_cache is NULL in block :%s",
3660                      __func__, block->idstr);
3661         return NULL;
3662     }
3663
3664     /*
3665     * During colo checkpoint, we need bitmap of these migrated pages.
3666     * It help us to decide which pages in ram cache should be flushed
3667     * into VM's RAM later.
3668     */
3669     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3670         ram_state->migration_dirty_pages++;
3671     }
3672     return block->colo_cache + offset;
3673 }
3674
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with the byte @ch.  When @ch is zero and the
 * range already reads as zero, the memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already all zeroes and zero is wanted: skip the write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3691
3692 /* return the size after decompression, or negative value on error */
3693 static int
3694 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3695                      const uint8_t *source, size_t source_len)
3696 {
3697     int err;
3698
3699     err = inflateReset(stream);
3700     if (err != Z_OK) {
3701         return -1;
3702     }
3703
3704     stream->avail_in = source_len;
3705     stream->next_in = (uint8_t *)source;
3706     stream->avail_out = dest_len;
3707     stream->next_out = dest;
3708
3709     err = inflate(stream, Z_NO_FLUSH);
3710     if (err != Z_STREAM_END) {
3711         return -1;
3712     }
3713
3714     return stream->total_out;
3715 }
3716
/*
 * Body of a decompression worker.
 *
 * @opaque is the worker's DecompressParam.  Until param->quit is set the
 * worker waits on param->cond for a destination to appear in param->des,
 * decompresses param->compbuf into it (one target page), then sets
 * param->done and signals decomp_done_cond under decomp_done_lock.
 *
 * Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the request and clear it before dropping the lock */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* A failure is only reported when error checking is enabled */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Publish completion under the shared done lock */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take the per-worker lock before re-checking quit */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3755
3756 static int wait_for_decompress_done(void)
3757 {
3758     int idx, thread_count;
3759
3760     if (!migrate_use_compression()) {
3761         return 0;
3762     }
3763
3764     thread_count = migrate_decompress_threads();
3765     qemu_mutex_lock(&decomp_done_lock);
3766     for (idx = 0; idx < thread_count; idx++) {
3767         while (!decomp_param[idx].done) {
3768             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3769         }
3770     }
3771     qemu_mutex_unlock(&decomp_done_lock);
3772     return qemu_file_get_error(decomp_file);
3773 }
3774
3775 static void compress_threads_load_cleanup(void)
3776 {
3777     int i, thread_count;
3778
3779     if (!migrate_use_compression()) {
3780         return;
3781     }
3782     thread_count = migrate_decompress_threads();
3783     for (i = 0; i < thread_count; i++) {
3784         /*
3785          * we use it as a indicator which shows if the thread is
3786          * properly init'd or not
3787          */
3788         if (!decomp_param[i].compbuf) {
3789             break;
3790         }
3791
3792         qemu_mutex_lock(&decomp_param[i].mutex);
3793         decomp_param[i].quit = true;
3794         qemu_cond_signal(&decomp_param[i].cond);
3795         qemu_mutex_unlock(&decomp_param[i].mutex);
3796     }
3797     for (i = 0; i < thread_count; i++) {
3798         if (!decomp_param[i].compbuf) {
3799             break;
3800         }
3801
3802         qemu_thread_join(decompress_threads + i);
3803         qemu_mutex_destroy(&decomp_param[i].mutex);
3804         qemu_cond_destroy(&decomp_param[i].cond);
3805         inflateEnd(&decomp_param[i].stream);
3806         g_free(decomp_param[i].compbuf);
3807         decomp_param[i].compbuf = NULL;
3808     }
3809     g_free(decompress_threads);
3810     g_free(decomp_param);
3811     decompress_threads = NULL;
3812     decomp_param = NULL;
3813     decomp_file = NULL;
3814 }
3815
3816 static int compress_threads_load_setup(QEMUFile *f)
3817 {
3818     int i, thread_count;
3819
3820     if (!migrate_use_compression()) {
3821         return 0;
3822     }
3823
3824     thread_count = migrate_decompress_threads();
3825     decompress_threads = g_new0(QemuThread, thread_count);
3826     decomp_param = g_new0(DecompressParam, thread_count);
3827     qemu_mutex_init(&decomp_done_lock);
3828     qemu_cond_init(&decomp_done_cond);
3829     decomp_file = f;
3830     for (i = 0; i < thread_count; i++) {
3831         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3832             goto exit;
3833         }
3834
3835         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3836         qemu_mutex_init(&decomp_param[i].mutex);
3837         qemu_cond_init(&decomp_param[i].cond);
3838         decomp_param[i].done = true;
3839         decomp_param[i].quit = false;
3840         qemu_thread_create(decompress_threads + i, "decompress",
3841                            do_data_decompress, decomp_param + i,
3842                            QEMU_THREAD_JOINABLE);
3843     }
3844     return 0;
3845 exit:
3846     compress_threads_load_cleanup();
3847     return -1;
3848 }
3849
3850 static void decompress_data_with_multi_threads(QEMUFile *f,
3851                                                void *host, int len)
3852 {
3853     int idx, thread_count;
3854
3855     thread_count = migrate_decompress_threads();
3856     qemu_mutex_lock(&decomp_done_lock);
3857     while (true) {
3858         for (idx = 0; idx < thread_count; idx++) {
3859             if (decomp_param[idx].done) {
3860                 decomp_param[idx].done = false;
3861                 qemu_mutex_lock(&decomp_param[idx].mutex);
3862                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3863                 decomp_param[idx].des = host;
3864                 decomp_param[idx].len = len;
3865                 qemu_cond_signal(&decomp_param[idx].cond);
3866                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3867                 break;
3868             }
3869         }
3870         if (idx < thread_count) {
3871             break;
3872         } else {
3873             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3874         }
3875     }
3876     qemu_mutex_unlock(&decomp_done_lock);
3877 }
3878
3879 /*
3880  * colo cache: this is for secondary VM, we cache the whole
3881  * memory of the secondary VM, it is need to hold the global lock
3882  * to call this helper.
3883  */
3884 int colo_init_ram_cache(void)
3885 {
3886     RAMBlock *block;
3887
3888     rcu_read_lock();
3889     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3890         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3891                                                 NULL,
3892                                                 false);
3893         if (!block->colo_cache) {
3894             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3895                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3896                          block->used_length);
3897             goto out_locked;
3898         }
3899         memcpy(block->colo_cache, block->host, block->used_length);
3900     }
3901     rcu_read_unlock();
3902     /*
3903     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3904     * with to decide which page in cache should be flushed into SVM's RAM. Here
3905     * we use the same name 'ram_bitmap' as for migration.
3906     */
3907     if (ram_bytes_total()) {
3908         RAMBlock *block;
3909
3910         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3911             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3912
3913             block->bmap = bitmap_new(pages);
3914             bitmap_set(block->bmap, 0, pages);
3915         }
3916     }
3917     ram_state = g_new0(RAMState, 1);
3918     ram_state->migration_dirty_pages = 0;
3919     qemu_mutex_init(&ram_state->bitmap_mutex);
3920     memory_global_dirty_log_start();
3921
3922     return 0;
3923
3924 out_locked:
3925
3926     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3927         if (block->colo_cache) {
3928             qemu_anon_ram_free(block->colo_cache, block->used_length);
3929             block->colo_cache = NULL;
3930         }
3931     }
3932
3933     rcu_read_unlock();
3934     return -errno;
3935 }
3936
3937 /* It is need to hold the global lock to call this helper */
3938 void colo_release_ram_cache(void)
3939 {
3940     RAMBlock *block;
3941
3942     memory_global_dirty_log_stop();
3943     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944         g_free(block->bmap);
3945         block->bmap = NULL;
3946     }
3947
3948     rcu_read_lock();
3949
3950     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3951         if (block->colo_cache) {
3952             qemu_anon_ram_free(block->colo_cache, block->used_length);
3953             block->colo_cache = NULL;
3954         }
3955     }
3956
3957     rcu_read_unlock();
3958     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3959     g_free(ram_state);
3960     ram_state = NULL;
3961 }
3962
3963 /**
3964  * ram_load_setup: Setup RAM for migration incoming side
3965  *
3966  * Returns zero to indicate success and negative for error
3967  *
3968  * @f: QEMUFile where to receive the data
3969  * @opaque: RAMState pointer
3970  */
3971 static int ram_load_setup(QEMUFile *f, void *opaque)
3972 {
3973     if (compress_threads_load_setup(f)) {
3974         return -1;
3975     }
3976
3977     xbzrle_load_setup();
3978     ramblock_recv_map_init();
3979
3980     return 0;
3981 }
3982
3983 static int ram_load_cleanup(void *opaque)
3984 {
3985     RAMBlock *rb;
3986
3987     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3988         if (ramblock_is_pmem(rb)) {
3989             pmem_persist(rb->host, rb->used_length);
3990         }
3991     }
3992
3993     xbzrle_load_cleanup();
3994     compress_threads_load_cleanup();
3995
3996     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3997         g_free(rb->receivedmap);
3998         rb->receivedmap = NULL;
3999     }
4000
4001     return 0;
4002 }
4003
4004 /**
4005  * ram_postcopy_incoming_init: allocate postcopy data structures
4006  *
4007  * Returns 0 for success and negative if there was one error
4008  *
4009  * @mis: current migration incoming state
4010  *
4011  * Allocate data structures etc needed by incoming migration with
4012  * postcopy-ram. postcopy-ram's similarly names
4013  * postcopy_ram_incoming_init does the work.
4014  */
4015 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4016 {
4017     return postcopy_ram_incoming_init(mis);
4018 }
4019
4020 /**
4021  * ram_load_postcopy: load a page in postcopy case
4022  *
4023  * Returns 0 for success or -errno in case of error
4024  *
4025  * Called in postcopy mode by ram_load().
4026  * rcu_read_lock is taken prior to this being called.
4027  *
4028  * @f: QEMUFile where to send the data
4029  */
4030 static int ram_load_postcopy(QEMUFile *f)
4031 {
4032     int flags = 0, ret = 0;
4033     bool place_needed = false;
4034     bool matches_target_page_size = false;
4035     MigrationIncomingState *mis = migration_incoming_get_current();
4036     /* Temporary page that is later 'placed' */
4037     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4038     void *last_host = NULL;
4039     bool all_zero = false;
4040
4041     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4042         ram_addr_t addr;
4043         void *host = NULL;
4044         void *page_buffer = NULL;
4045         void *place_source = NULL;
4046         RAMBlock *block = NULL;
4047         uint8_t ch;
4048
4049         addr = qemu_get_be64(f);
4050
4051         /*
4052          * If qemu file error, we should stop here, and then "addr"
4053          * may be invalid
4054          */
4055         ret = qemu_file_get_error(f);
4056         if (ret) {
4057             break;
4058         }
4059
4060         flags = addr & ~TARGET_PAGE_MASK;
4061         addr &= TARGET_PAGE_MASK;
4062
4063         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4064         place_needed = false;
4065         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4066             block = ram_block_from_stream(f, flags);
4067
4068             host = host_from_ram_block_offset(block, addr);
4069             if (!host) {
4070                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4071                 ret = -EINVAL;
4072                 break;
4073             }
4074             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4075             /*
4076              * Postcopy requires that we place whole host pages atomically;
4077              * these may be huge pages for RAMBlocks that are backed by
4078              * hugetlbfs.
4079              * To make it atomic, the data is read into a temporary page
4080              * that's moved into place later.
4081              * The migration protocol uses,  possibly smaller, target-pages
4082              * however the source ensures it always sends all the components
4083              * of a host page in order.
4084              */
4085             page_buffer = postcopy_host_page +
4086                           ((uintptr_t)host & (block->page_size - 1));
4087             /* If all TP are zero then we can optimise the place */
4088             if (!((uintptr_t)host & (block->page_size - 1))) {
4089                 all_zero = true;
4090             } else {
4091                 /* not the 1st TP within the HP */
4092                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4093                     error_report("Non-sequential target page %p/%p",
4094                                   host, last_host);
4095                     ret = -EINVAL;
4096                     break;
4097                 }
4098             }
4099
4100
4101             /*
4102              * If it's the last part of a host page then we place the host
4103              * page
4104              */
4105             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4106                                      (block->page_size - 1)) == 0;
4107             place_source = postcopy_host_page;
4108         }
4109         last_host = host;
4110
4111         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4112         case RAM_SAVE_FLAG_ZERO:
4113             ch = qemu_get_byte(f);
4114             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4115             if (ch) {
4116                 all_zero = false;
4117             }
4118             break;
4119
4120         case RAM_SAVE_FLAG_PAGE:
4121             all_zero = false;
4122             if (!matches_target_page_size) {
4123                 /* For huge pages, we always use temporary buffer */
4124                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4125             } else {
4126                 /*
4127                  * For small pages that matches target page size, we
4128                  * avoid the qemu_file copy.  Instead we directly use
4129                  * the buffer of QEMUFile to place the page.  Note: we
4130                  * cannot do any QEMUFile operation before using that
4131                  * buffer to make sure the buffer is valid when
4132                  * placing the page.
4133                  */
4134                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4135                                          TARGET_PAGE_SIZE);
4136             }
4137             break;
4138         case RAM_SAVE_FLAG_EOS:
4139             /* normal exit */
4140             multifd_recv_sync_main();
4141             break;
4142         default:
4143             error_report("Unknown combination of migration flags: %#x"
4144                          " (postcopy mode)", flags);
4145             ret = -EINVAL;
4146             break;
4147         }
4148
4149         /* Detect for any possible file errors */
4150         if (!ret && qemu_file_get_error(f)) {
4151             ret = qemu_file_get_error(f);
4152         }
4153
4154         if (!ret && place_needed) {
4155             /* This gets called at the last target page in the host page */
4156             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4157
4158             if (all_zero) {
4159                 ret = postcopy_place_page_zero(mis, place_dest,
4160                                                block);
4161             } else {
4162                 ret = postcopy_place_page(mis, place_dest,
4163                                           place_source, block);
4164             }
4165         }
4166     }
4167
4168     return ret;
4169 }
4170
4171 static bool postcopy_is_advised(void)
4172 {
4173     PostcopyState ps = postcopy_state_get();
4174     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4175 }
4176
4177 static bool postcopy_is_running(void)
4178 {
4179     PostcopyState ps = postcopy_state_get();
4180     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4181 }
4182
4183 /*
4184  * Flush content of RAM cache into SVM's memory.
4185  * Only flush the pages that be dirtied by PVM or SVM or both.
4186  */
4187 static void colo_flush_ram_cache(void)
4188 {
4189     RAMBlock *block = NULL;
4190     void *dst_host;
4191     void *src_host;
4192     unsigned long offset = 0;
4193
4194     memory_global_dirty_log_sync();
4195     rcu_read_lock();
4196     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4197         migration_bitmap_sync_range(ram_state, block, block->used_length);
4198     }
4199     rcu_read_unlock();
4200
4201     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4202     rcu_read_lock();
4203     block = QLIST_FIRST_RCU(&ram_list.blocks);
4204
4205     while (block) {
4206         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4207
4208         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4209             offset = 0;
4210             block = QLIST_NEXT_RCU(block, next);
4211         } else {
4212             migration_bitmap_clear_dirty(ram_state, block, offset);
4213             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4214             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4215             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4216         }
4217     }
4218
4219     rcu_read_unlock();
4220     trace_colo_flush_ram_cache_end();
4221 }
4222
4223 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4224 {
4225     int flags = 0, ret = 0, invalid_flags = 0;
4226     static uint64_t seq_iter;
4227     int len = 0;
4228     /*
4229      * If system is running in postcopy mode, page inserts to host memory must
4230      * be atomic
4231      */
4232     bool postcopy_running = postcopy_is_running();
4233     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4234     bool postcopy_advised = postcopy_is_advised();
4235
4236     seq_iter++;
4237
4238     if (version_id != 4) {
4239         ret = -EINVAL;
4240     }
4241
4242     if (!migrate_use_compression()) {
4243         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4244     }
4245     /* This RCU critical section can be very long running.
4246      * When RCU reclaims in the code start to become numerous,
4247      * it will be necessary to reduce the granularity of this
4248      * critical section.
4249      */
4250     rcu_read_lock();
4251
4252     if (postcopy_running) {
4253         ret = ram_load_postcopy(f);
4254     }
4255
4256     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4257         ram_addr_t addr, total_ram_bytes;
4258         void *host = NULL;
4259         uint8_t ch;
4260
4261         addr = qemu_get_be64(f);
4262         flags = addr & ~TARGET_PAGE_MASK;
4263         addr &= TARGET_PAGE_MASK;
4264
4265         if (flags & invalid_flags) {
4266             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4267                 error_report("Received an unexpected compressed page");
4268             }
4269
4270             ret = -EINVAL;
4271             break;
4272         }
4273
4274         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4275                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4276             RAMBlock *block = ram_block_from_stream(f, flags);
4277
4278             /*
4279              * After going into COLO, we should load the Page into colo_cache.
4280              */
4281             if (migration_incoming_in_colo_state()) {
4282                 host = colo_cache_from_block_offset(block, addr);
4283             } else {
4284                 host = host_from_ram_block_offset(block, addr);
4285             }
4286             if (!host) {
4287                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4288                 ret = -EINVAL;
4289                 break;
4290             }
4291
4292             if (!migration_incoming_in_colo_state()) {
4293                 ramblock_recv_bitmap_set(block, host);
4294             }
4295
4296             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4297         }
4298
4299         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4300         case RAM_SAVE_FLAG_MEM_SIZE:
4301             /* Synchronize RAM block list */
4302             total_ram_bytes = addr;
4303             while (!ret && total_ram_bytes) {
4304                 RAMBlock *block;
4305                 char id[256];
4306                 ram_addr_t length;
4307
4308                 len = qemu_get_byte(f);
4309                 qemu_get_buffer(f, (uint8_t *)id, len);
4310                 id[len] = 0;
4311                 length = qemu_get_be64(f);
4312
4313                 block = qemu_ram_block_by_name(id);
4314                 if (block && !qemu_ram_is_migratable(block)) {
4315                     error_report("block %s should not be migrated !", id);
4316                     ret = -EINVAL;
4317                 } else if (block) {
4318                     if (length != block->used_length) {
4319                         Error *local_err = NULL;
4320
4321                         ret = qemu_ram_resize(block, length,
4322                                               &local_err);
4323                         if (local_err) {
4324                             error_report_err(local_err);
4325                         }
4326                     }
4327                     /* For postcopy we need to check hugepage sizes match */
4328                     if (postcopy_advised &&
4329                         block->page_size != qemu_host_page_size) {
4330                         uint64_t remote_page_size = qemu_get_be64(f);
4331                         if (remote_page_size != block->page_size) {
4332                             error_report("Mismatched RAM page size %s "
4333                                          "(local) %zd != %" PRId64,
4334                                          id, block->page_size,
4335                                          remote_page_size);
4336                             ret = -EINVAL;
4337                         }
4338                     }
4339                     if (migrate_ignore_shared()) {
4340                         hwaddr addr = qemu_get_be64(f);
4341                         bool ignored = qemu_get_byte(f);
4342                         if (ignored != ramblock_is_ignored(block)) {
4343                             error_report("RAM block %s should %s be migrated",
4344                                          id, ignored ? "" : "not");
4345                             ret = -EINVAL;
4346                         }
4347                         if (ramblock_is_ignored(block) &&
4348                             block->mr->addr != addr) {
4349                             error_report("Mismatched GPAs for block %s "
4350                                          "%" PRId64 "!= %" PRId64,
4351                                          id, (uint64_t)addr,
4352                                          (uint64_t)block->mr->addr);
4353                             ret = -EINVAL;
4354                         }
4355                     }
4356                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4357                                           block->idstr);
4358                 } else {
4359                     error_report("Unknown ramblock \"%s\", cannot "
4360                                  "accept migration", id);
4361                     ret = -EINVAL;
4362                 }
4363
4364                 total_ram_bytes -= length;
4365             }
4366             break;
4367
4368         case RAM_SAVE_FLAG_ZERO:
4369             ch = qemu_get_byte(f);
4370             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4371             break;
4372
4373         case RAM_SAVE_FLAG_PAGE:
4374             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4375             break;
4376
4377         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4378             len = qemu_get_be32(f);
4379             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4380                 error_report("Invalid compressed data length: %d", len);
4381                 ret = -EINVAL;
4382                 break;
4383             }
4384             decompress_data_with_multi_threads(f, host, len);
4385             break;
4386
4387         case RAM_SAVE_FLAG_XBZRLE:
4388             if (load_xbzrle(f, addr, host) < 0) {
4389                 error_report("Failed to decompress XBZRLE page at "
4390                              RAM_ADDR_FMT, addr);
4391                 ret = -EINVAL;
4392                 break;
4393             }
4394             break;
4395         case RAM_SAVE_FLAG_EOS:
4396             /* normal exit */
4397             multifd_recv_sync_main();
4398             break;
4399         default:
4400             if (flags & RAM_SAVE_FLAG_HOOK) {
4401                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4402             } else {
4403                 error_report("Unknown combination of migration flags: %#x",
4404                              flags);
4405                 ret = -EINVAL;
4406             }
4407         }
4408         if (!ret) {
4409             ret = qemu_file_get_error(f);
4410         }
4411     }
4412
4413     ret |= wait_for_decompress_done();
4414     rcu_read_unlock();
4415     trace_ram_load_complete(ret, seq_iter);
4416
4417     if (!ret  && migration_incoming_in_colo_state()) {
4418         colo_flush_ram_cache();
4419     }
4420     return ret;
4421 }
4422
4423 static bool ram_has_postcopy(void *opaque)
4424 {
4425     RAMBlock *rb;
4426     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4427         if (ramblock_is_pmem(rb)) {
4428             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4429                          "is not supported now!", rb->idstr, rb->host);
4430             return false;
4431         }
4432     }
4433
4434     return migrate_postcopy_ram();
4435 }
4436
4437 /* Sync all the dirty bitmap with destination VM.  */
4438 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4439 {
4440     RAMBlock *block;
4441     QEMUFile *file = s->to_dst_file;
4442     int ramblock_count = 0;
4443
4444     trace_ram_dirty_bitmap_sync_start();
4445
4446     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4447         qemu_savevm_send_recv_bitmap(file, block->idstr);
4448         trace_ram_dirty_bitmap_request(block->idstr);
4449         ramblock_count++;
4450     }
4451
4452     trace_ram_dirty_bitmap_sync_wait();
4453
4454     /* Wait until all the ramblocks' dirty bitmap synced */
4455     while (ramblock_count--) {
4456         qemu_sem_wait(&s->rp_state.rp_sem);
4457     }
4458
4459     trace_ram_dirty_bitmap_sync_complete();
4460
4461     return 0;
4462 }
4463
4464 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4465 {
4466     qemu_sem_post(&s->rp_state.rp_sem);
4467 }
4468
4469 /*
4470  * Read the received bitmap, revert it as the initial dirty bitmap.
4471  * This is only used when the postcopy migration is paused but wants
4472  * to resume from a middle point.
4473  */
4474 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4475 {
4476     int ret = -EINVAL;
4477     QEMUFile *file = s->rp_state.from_dst_file;
4478     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4479     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4480     uint64_t size, end_mark;
4481
4482     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4483
4484     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4485         error_report("%s: incorrect state %s", __func__,
4486                      MigrationStatus_str(s->state));
4487         return -EINVAL;
4488     }
4489
4490     /*
4491      * Note: see comments in ramblock_recv_bitmap_send() on why we
4492      * need the endianess convertion, and the paddings.
4493      */
4494     local_size = ROUND_UP(local_size, 8);
4495
4496     /* Add paddings */
4497     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4498
4499     size = qemu_get_be64(file);
4500
4501     /* The size of the bitmap should match with our ramblock */
4502     if (size != local_size) {
4503         error_report("%s: ramblock '%s' bitmap size mismatch "
4504                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4505                      block->idstr, size, local_size);
4506         ret = -EINVAL;
4507         goto out;
4508     }
4509
4510     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4511     end_mark = qemu_get_be64(file);
4512
4513     ret = qemu_file_get_error(file);
4514     if (ret || size != local_size) {
4515         error_report("%s: read bitmap failed for ramblock '%s': %d"
4516                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4517                      __func__, block->idstr, ret, local_size, size);
4518         ret = -EIO;
4519         goto out;
4520     }
4521
4522     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4523         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4524                      __func__, block->idstr, end_mark);
4525         ret = -EINVAL;
4526         goto out;
4527     }
4528
4529     /*
4530      * Endianess convertion. We are during postcopy (though paused).
4531      * The dirty bitmap won't change. We can directly modify it.
4532      */
4533     bitmap_from_le(block->bmap, le_bitmap, nbits);
4534
4535     /*
4536      * What we received is "received bitmap". Revert it as the initial
4537      * dirty bitmap for this ramblock.
4538      */
4539     bitmap_complement(block->bmap, block->bmap, nbits);
4540
4541     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4542
4543     /*
4544      * We succeeded to sync bitmap for current ramblock. If this is
4545      * the last one to sync, we need to notify the main send thread.
4546      */
4547     ram_dirty_bitmap_reload_notify(s);
4548
4549     ret = 0;
4550 out:
4551     g_free(le_bitmap);
4552     return ret;
4553 }
4554
4555 static int ram_resume_prepare(MigrationState *s, void *opaque)
4556 {
4557     RAMState *rs = *(RAMState **)opaque;
4558     int ret;
4559
4560     ret = ram_dirty_bitmap_sync_all(s, rs);
4561     if (ret) {
4562         return ret;
4563     }
4564
4565     ram_state_resume_prepare(rs, s->to_dst_file);
4566
4567     return 0;
4568 }
4569
4570 static SaveVMHandlers savevm_ram_handlers = {
4571     .save_setup = ram_save_setup,
4572     .save_live_iterate = ram_save_iterate,
4573     .save_live_complete_postcopy = ram_save_complete,
4574     .save_live_complete_precopy = ram_save_complete,
4575     .has_postcopy = ram_has_postcopy,
4576     .save_live_pending = ram_save_pending,
4577     .load_state = ram_load,
4578     .save_cleanup = ram_save_cleanup,
4579     .load_setup = ram_load_setup,
4580     .load_cleanup = ram_load_cleanup,
4581     .resume_prepare = ram_resume_prepare,
4582 };
4583
4584 void ram_mig_init(void)
4585 {
4586     qemu_mutex_init(&XBZRLE.lock);
4587     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4588 }