migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */

/* Flag bits; each flag below is a distinct bit so they can be combined. */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* XBZRLE statistics counters (non-static, so visible to other files) */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 static bool ramblock_is_ignored(RAMBlock *block)
 163 {
 164     return !qemu_ram_is_migratable(block) ||
 165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 166 }
 167
/*
 * Iterate over the RAMBlocks for which ramblock_is_ignored() is false.
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Iterate over the RAMBlocks for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably removed so that code below must pick one of
 * the filtered iterators above - confirm against the header defining it.
 */
#undef RAMBLOCK_FOREACH
 178
 179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 180 {
 181     RAMBlock *block;
 182     int ret = 0;
 183
 184     rcu_read_lock();
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     rcu_read_unlock();
 192     return ret;
 193 }
 194
 195 static void ramblock_recv_map_init(void)
 196 {
 197     RAMBlock *rb;
 198
 199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 200         assert(!rb->receivedmap);
 201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 202     }
 203 }
 204
 205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 206 {
 207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 208                     rb->receivedmap);
 209 }
 210
 211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 212 {
 213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 217 {
 218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 219 }
 220
 221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 222                                     size_t nr)
 223 {
 224     bitmap_set_atomic(rb->receivedmap,
 225                       ramblock_recv_bitmap_offset(host_addr, rb),
 226                       nr);
 227 }
 228
 229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 230
 231 /*
 232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 233  *
 234  * Returns >0 if success with sent bytes, or <0 if error.
 235  */
 236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 237                                   const char *block_name)
 238 {
 239     RAMBlock *block = qemu_ram_block_by_name(block_name);
 240     unsigned long *le_bitmap, nbits;
 241     uint64_t size;
 242
 243     if (!block) {
 244         error_report("%s: invalid block name: %s", __func__, block_name);
 245         return -1;
 246     }
 247
 248     nbits = block->used_length >> TARGET_PAGE_BITS;
 249
 250     /*
 251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252      * machines we may need 4 more bytes for padding (see below
 253      * comment). So extend it a bit before hand.
 254      */
 255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 256
 257     /*
 258      * Always use little endian when sending the bitmap. This is
 259      * required that when source and destination VMs are not using the
 260      * same endianess. (Note: big endian won't work.)
 261      */
 262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 263
 264     /* Size of the bitmap, in bytes */
 265     size = DIV_ROUND_UP(nbits, 8);
 266
 267     /*
 268      * size is always aligned to 8 bytes for 64bit machines, but it
 269      * may not be true for 32bit machines. We need this padding to
 270      * make sure the migration can survive even between 32bit and
 271      * 64bit machines.
 272      */
 273     size = ROUND_UP(size, 8);
 274
 275     qemu_put_be64(file, size);
 276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 277     /*
 278      * Mark as an end, in case the middle part is screwed up due to
 279      * some "misterious" reason.
 280      */
 281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 282     qemu_fflush(file);
 283
 284     g_free(le_bitmap);
 285
 286     if (qemu_file_get_error(file)) {
 287         return qemu_file_get_error(file);
 288     }
 289
 290     return size + sizeof(size);
 291 }
 292
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start offset within rb - confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length in bytes - confirm */
    hwaddr    len;

    /* linkage for the RAMState.src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 304
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have seen too many dirty pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The single RAMState instance; readers in this file treat NULL as "no state" */
static RAMState *ram_state;
 358
 359 static NotifierWithReturnList precopy_notifier_list;
 360
 361 void precopy_infrastructure_init(void)
 362 {
 363     notifier_with_return_list_init(&precopy_notifier_list);
 364 }
 365
 366 void precopy_add_notifier(NotifierWithReturn *n)
 367 {
 368     notifier_with_return_list_add(&precopy_notifier_list, n);
 369 }
 370
 371 void precopy_remove_notifier(NotifierWithReturn *n)
 372 {
 373     notifier_with_return_remove(n);
 374 }
 375
 376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 377 {
 378     PrecopyNotifyData pnd;
 379     pnd.reason = reason;
 380     pnd.errp = errp;
 381
 382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 383 }
 384
 385 void precopy_enable_free_page_optimization(void)
 386 {
 387     if (!ram_state) {
 388         return;
 389     }
 390
 391     ram_state->fpo_enabled = true;
 392 }
 393
 394 uint64_t ram_bytes_remaining(void)
 395 {
 396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 397                        0;
 398 }
 399
/* RAM migration statistics (non-static, so visible to other files) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 412
/* Compression statistics (non-static, so visible to other files) */
CompressionStats compression_counters;

/* Per-thread state of a compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* set, under mutex, to ask the worker to leave its loop */
    bool quit;
    /* result of do_compress_ram_page() for the last page, set with done */
    bool zero_page;
    /* dummy buffer file receiving the compressed data */
    QEMUFile *file;
    /* mutex/cond pair the worker waits on for a new block or for quit */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress: non-NULL block means work is pending */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 430
/*
 * Per-thread state of a decompression worker.
 * NOTE(review): field roles below are inferred from names and from the
 * parallel CompressParam layout - confirm against the worker code.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the buffer holding the compressed input, of length len */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 442
/*
 * Arrays with one entry per compression thread, allocated in
 * compress_threads_save_setup() and freed (and reset to NULL) in
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/*
 * Forward declaration; do_data_compress() stores the returned value in
 * CompressParam.zero_page.
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 462
 463 static void *do_data_compress(void *opaque)
 464 {
 465     CompressParam *param = opaque;
 466     RAMBlock *block;
 467     ram_addr_t offset;
 468     bool zero_page;
 469
 470     qemu_mutex_lock(&param->mutex);
 471     while (!param->quit) {
 472         if (param->block) {
 473             block = param->block;
 474             offset = param->offset;
 475             param->block = NULL;
 476             qemu_mutex_unlock(&param->mutex);
 477
 478             zero_page = do_compress_ram_page(param->file, &param->stream,
 479                                              block, offset, param->originbuf);
 480
 481             qemu_mutex_lock(&comp_done_lock);
 482             param->done = true;
 483             param->zero_page = zero_page;
 484             qemu_cond_signal(&comp_done_cond);
 485             qemu_mutex_unlock(&comp_done_lock);
 486
 487             qemu_mutex_lock(&param->mutex);
 488         } else {
 489             qemu_cond_wait(&param->cond, &param->mutex);
 490         }
 491     }
 492     qemu_mutex_unlock(&param->mutex);
 493
 494     return NULL;
 495 }
 496
 497 static void compress_threads_save_cleanup(void)
 498 {
 499     int i, thread_count;
 500
 501     if (!migrate_use_compression() || !comp_param) {
 502         return;
 503     }
 504
 505     thread_count = migrate_compress_threads();
 506     for (i = 0; i < thread_count; i++) {
 507         /*
 508          * we use it as a indicator which shows if the thread is
 509          * properly init'd or not
 510          */
 511         if (!comp_param[i].file) {
 512             break;
 513         }
 514
 515         qemu_mutex_lock(&comp_param[i].mutex);
 516         comp_param[i].quit = true;
 517         qemu_cond_signal(&comp_param[i].cond);
 518         qemu_mutex_unlock(&comp_param[i].mutex);
 519
 520         qemu_thread_join(compress_threads + i);
 521         qemu_mutex_destroy(&comp_param[i].mutex);
 522         qemu_cond_destroy(&comp_param[i].cond);
 523         deflateEnd(&comp_param[i].stream);
 524         g_free(comp_param[i].originbuf);
 525         qemu_fclose(comp_param[i].file);
 526         comp_param[i].file = NULL;
 527     }
 528     qemu_mutex_destroy(&comp_done_lock);
 529     qemu_cond_destroy(&comp_done_cond);
 530     g_free(compress_threads);
 531     g_free(comp_param);
 532     compress_threads = NULL;
 533     comp_param = NULL;
 534 }
 535
 536 static int compress_threads_save_setup(void)
 537 {
 538     int i, thread_count;
 539
 540     if (!migrate_use_compression()) {
 541         return 0;
 542     }
 543     thread_count = migrate_compress_threads();
 544     compress_threads = g_new0(QemuThread, thread_count);
 545     comp_param = g_new0(CompressParam, thread_count);
 546     qemu_cond_init(&comp_done_cond);
 547     qemu_mutex_init(&comp_done_lock);
 548     for (i = 0; i < thread_count; i++) {
 549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 550         if (!comp_param[i].originbuf) {
 551             goto exit;
 552         }
 553
 554         if (deflateInit(&comp_param[i].stream,
 555                         migrate_compress_level()) != Z_OK) {
 556             g_free(comp_param[i].originbuf);
 557             goto exit;
 558         }
 559
 560         /* comp_param[i].file is just used as a dummy buffer to save data,
 561          * set its ops to empty.
 562          */
 563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 564         comp_param[i].done = true;
 565         comp_param[i].quit = false;
 566         qemu_mutex_init(&comp_param[i].mutex);
 567         qemu_cond_init(&comp_param[i].cond);
 568         qemu_thread_create(compress_threads + i, "compress",
 569                            do_data_compress, comp_param + i,
 570                            QEMU_THREAD_JOINABLE);
 571     }
 572     return 0;
 573
 574 exit:
 575     compress_threads_save_cleanup();
 576     return -1;
 577 }
 578
/* Multiple fd's */

/* Magic and version carried by every multifd init message and packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Bit in the packet "flags" field */
#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 588
/*
 * Initial message of a multifd channel, written by
 * multifd_send_initial_packet() and checked by
 * multifd_recv_initial_packet().  magic and version travel big endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;             /* channel number */
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 597
/*
 * Header of a multifd packet, filled by multifd_send_fill_packet() and
 * decoded by multifd_recv_unfill_packet().  Integer fields travel big
 * endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of entries of offset[] in use */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* name (idstr) of the RAMBlock the offsets refer to */
    char ramblock[256];
    /* offset of each page inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 612
/*
 * A batch of pages, all from the same RAMBlock; created by
 * multifd_pages_init() and released by multifd_pages_clear().
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* block the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 626
/* Per-channel state of a multifd send channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 667
/* Per-channel state of a multifd receive channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 704
 705 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 706 {
 707     MultiFDInit_t msg;
 708     int ret;
 709
 710     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 711     msg.version = cpu_to_be32(MULTIFD_VERSION);
 712     msg.id = p->id;
 713     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 714
 715     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 716     if (ret != 0) {
 717         return -1;
 718     }
 719     return 0;
 720 }
 721
 722 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 723 {
 724     MultiFDInit_t msg;
 725     int ret;
 726
 727     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 728     if (ret != 0) {
 729         return -1;
 730     }
 731
 732     msg.magic = be32_to_cpu(msg.magic);
 733     msg.version = be32_to_cpu(msg.version);
 734
 735     if (msg.magic != MULTIFD_MAGIC) {
 736         error_setg(errp, "multifd: received packet magic %x "
 737                    "expected %x", msg.magic, MULTIFD_MAGIC);
 738         return -1;
 739     }
 740
 741     if (msg.version != MULTIFD_VERSION) {
 742         error_setg(errp, "multifd: received packet version %d "
 743                    "expected %d", msg.version, MULTIFD_VERSION);
 744         return -1;
 745     }
 746
 747     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 748         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 749         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 750
 751         error_setg(errp, "multifd: received uuid '%s' and expected "
 752                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 753         g_free(uuid);
 754         g_free(msg_uuid);
 755         return -1;
 756     }
 757
 758     if (msg.id > migrate_multifd_channels()) {
 759         error_setg(errp, "multifd: received channel version %d "
 760                    "expected %d", msg.version, MULTIFD_VERSION);
 761         return -1;
 762     }
 763
 764     return msg.id;
 765 }
 766
 767 static MultiFDPages_t *multifd_pages_init(size_t size)
 768 {
 769     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 770
 771     pages->allocated = size;
 772     pages->iov = g_new0(struct iovec, size);
 773     pages->offset = g_new0(ram_addr_t, size);
 774
 775     return pages;
 776 }
 777
 778 static void multifd_pages_clear(MultiFDPages_t *pages)
 779 {
 780     pages->used = 0;
 781     pages->allocated = 0;
 782     pages->packet_num = 0;
 783     pages->block = NULL;
 784     g_free(pages->iov);
 785     pages->iov = NULL;
 786     g_free(pages->offset);
 787     pages->offset = NULL;
 788     g_free(pages);
 789 }
 790
 791 static void multifd_send_fill_packet(MultiFDSendParams *p)
 792 {
 793     MultiFDPacket_t *packet = p->packet;
 794     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 795     int i;
 796
 797     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 798     packet->version = cpu_to_be32(MULTIFD_VERSION);
 799     packet->flags = cpu_to_be32(p->flags);
 800     packet->pages_alloc = cpu_to_be32(page_max);
 801     packet->pages_used = cpu_to_be32(p->pages->used);
 802     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 803     packet->packet_num = cpu_to_be64(p->packet_num);
 804
 805     if (p->pages->block) {
 806         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 807     }
 808
 809     for (i = 0; i < p->pages->used; i++) {
 810         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 811     }
 812 }
 813
 814 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 815 {
 816     MultiFDPacket_t *packet = p->packet;
 817     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 818     RAMBlock *block;
 819     int i;
 820
 821     packet->magic = be32_to_cpu(packet->magic);
 822     if (packet->magic != MULTIFD_MAGIC) {
 823         error_setg(errp, "multifd: received packet "
 824                    "magic %x and expected magic %x",
 825                    packet->magic, MULTIFD_MAGIC);
 826         return -1;
 827     }
 828
 829     packet->version = be32_to_cpu(packet->version);
 830     if (packet->version != MULTIFD_VERSION) {
 831         error_setg(errp, "multifd: received packet "
 832                    "version %d and expected version %d",
 833                    packet->version, MULTIFD_VERSION);
 834         return -1;
 835     }
 836
 837     p->flags = be32_to_cpu(packet->flags);
 838
 839     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 840     /*
 841      * If we recevied a packet that is 100 times bigger than expected
 842      * just stop migration.  It is a magic number.
 843      */
 844     if (packet->pages_alloc > pages_max * 100) {
 845         error_setg(errp, "multifd: received packet "
 846                    "with size %d and expected a maximum size of %d",
 847                    packet->pages_alloc, pages_max * 100) ;
 848         return -1;
 849     }
 850     /*
 851      * We received a packet that is bigger than expected but inside
 852      * reasonable limits (see previous comment).  Just reallocate.
 853      */
 854     if (packet->pages_alloc > p->pages->allocated) {
 855         multifd_pages_clear(p->pages);
 856         p->pages = multifd_pages_init(packet->pages_alloc);
 857     }
 858
 859     p->pages->used = be32_to_cpu(packet->pages_used);
 860     if (p->pages->used > packet->pages_alloc) {
 861         error_setg(errp, "multifd: received packet "
 862                    "with %d pages and expected maximum pages are %d",
 863                    p->pages->used, packet->pages_alloc) ;
 864         return -1;
 865     }
 866
 867     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 868     p->packet_num = be64_to_cpu(packet->packet_num);
 869
 870     if (p->pages->used) {
 871         /* make sure that ramblock is 0 terminated */
 872         packet->ramblock[255] = 0;
 873         block = qemu_ram_block_by_name(packet->ramblock);
 874         if (!block) {
 875             error_setg(errp, "multifd: unknown ram block %s",
 876                        packet->ramblock);
 877             return -1;
 878         }
 879     }
 880
 881     for (i = 0; i < p->pages->used; i++) {
 882         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 883
 884         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 885             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 886                        " (max " RAM_ADDR_FMT ")",
 887                        offset, block->max_length);
 888             return -1;
 889         }
 890         p->pages->iov[i].iov_base = block->host + offset;
 891         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 892     }
 893
 894     return 0;
 895 }
 896
/* Global state of the multifd send side */
struct {
    /* per-channel parameters, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;

/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a "pages" struct for each channel, and a main one.  Each
 * time we need to send a batch of pages we swap the main one with the
 * one of the channel that is going to send it.  There are two reasons
 * for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs either to the migration
 * thread or to the channel thread.  Switching is safe because the
 * migration thread holds the channel mutex while changing it, and the
 * channel must have finished with its own batch, otherwise pending_job
 * could not be false.
 */
 924
 925 static int multifd_send_pages(RAMState *rs)
 926 {
 927     int i;
 928     static int next_channel;
 929     MultiFDSendParams *p = NULL; /* make happy gcc */
 930     MultiFDPages_t *pages = multifd_send_state->pages;
 931     uint64_t transferred;
 932
 933     qemu_sem_wait(&multifd_send_state->channels_ready);
 934     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 935         p = &multifd_send_state->params[i];
 936
 937         qemu_mutex_lock(&p->mutex);
 938         if (p->quit) {
 939             error_report("%s: channel %d has already quit!", __func__, i);
 940             qemu_mutex_unlock(&p->mutex);
 941             return -1;
 942         }
 943         if (!p->pending_job) {
 944             p->pending_job++;
 945             next_channel = (i + 1) % migrate_multifd_channels();
 946             break;
 947         }
 948         qemu_mutex_unlock(&p->mutex);
 949     }
 950     p->pages->used = 0;
 951
 952     p->packet_num = multifd_send_state->packet_num++;
 953     p->pages->block = NULL;
 954     multifd_send_state->pages = p->pages;
 955     p->pages = pages;
 956     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 957     qemu_file_update_transfer(rs->f, transferred);
 958     ram_counters.multifd_bytes += transferred;
 959     ram_counters.transferred += transferred;;
 960     qemu_mutex_unlock(&p->mutex);
 961     qemu_sem_post(&p->sem);
 962
 963     return 1;
 964 }
 965
/**
 * multifd_queue_page: add one page to the batch waiting to be sent
 *
 * The page is appended to the main "pages" struct as long as it belongs to
 * the same RAMBlock as the pages already queued.  The batch is handed to a
 * channel when it becomes full, or when the new page belongs to a different
 * block; in the latter case the page is then queued on the fresh batch.
 *
 * Returns 1 on success, -1 if multifd_send_pages() failed
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    MultiFDPages_t *pages = multifd_send_state->pages;

    /* an empty batch adopts the block of the first page queued on it */
    if (!pages->block) {
        pages->block = block;
    }

    if (pages->block == block) {
        pages->offset[pages->used] = offset;
        pages->iov[pages->used].iov_base = block->host + offset;
        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
        pages->used++;

        /* still room in the batch: nothing to send yet */
        if (pages->used < pages->allocated) {
            return 1;
        }
    }

    if (multifd_send_pages(rs) < 0) {
        return -1;
    }

    /*
     * 'pages' still points to the batch that was just handed over.  If its
     * block differs from ours, the page was not queued above, so queue it
     * now on the struct that multifd_send_pages() swapped in.
     */
    if (pages->block != block) {
        return  multifd_queue_page(rs, block, offset);
    }

    return 1;
}
 995
 996 static void multifd_send_terminate_threads(Error *err)
 997 {
 998     int i;
 999
1000     trace_multifd_send_terminate_threads(err != NULL);
1001
1002     if (err) {
1003         MigrationState *s = migrate_get_current();
1004         migrate_set_error(s, err);
1005         if (s->state == MIGRATION_STATUS_SETUP ||
1006             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1007             s->state == MIGRATION_STATUS_DEVICE ||
1008             s->state == MIGRATION_STATUS_ACTIVE) {
1009             migrate_set_state(&s->state, s->state,
1010                               MIGRATION_STATUS_FAILED);
1011         }
1012     }
1013
1014     for (i = 0; i < migrate_multifd_channels(); i++) {
1015         MultiFDSendParams *p = &multifd_send_state->params[i];
1016
1017         qemu_mutex_lock(&p->mutex);
1018         p->quit = true;
1019         qemu_sem_post(&p->sem);
1020         qemu_mutex_unlock(&p->mutex);
1021     }
1022 }
1023
/**
 * multifd_save_cleanup: tear down the multifd sending side
 *
 * Does nothing when multifd is not in use.  Otherwise asks every channel
 * thread to quit, joins the ones still marked as running, releases the
 * per-channel resources and finally frees the global multifd_send_state.
 */
void multifd_save_cleanup(void)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    /* no error to record: just set ->quit and wake up every channel */
    multifd_send_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        /* the thread must be gone before its resources are destroyed */
        if (p->running) {
            qemu_thread_join(&p->thread);
        }
        socket_send_channel_destroy(p->c);
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        qemu_sem_destroy(&p->sem_sync);
        g_free(p->name);
        p->name = NULL;
        multifd_pages_clear(p->pages);
        p->pages = NULL;
        p->packet_len = 0;
        g_free(p->packet);
        p->packet = NULL;
    }
    /* per-channel state is gone, now release the global state */
    qemu_sem_destroy(&multifd_send_state->channels_ready);
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    multifd_pages_clear(multifd_send_state->pages);
    multifd_send_state->pages = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
}
1059
/**
 * multifd_send_sync_main: synchronise the migration thread with all channels
 *
 * Does nothing when multifd is not in use.  Any partially filled batch is
 * sent first.  Then every channel is given a job flagged with
 * MULTIFD_FLAG_SYNC, and the function waits on each channel's sem_sync.
 * Failures are only reported with error_report(); nothing is returned to
 * the caller.
 *
 * @rs: current RAM state
 */
static void multifd_send_sync_main(RAMState *rs)
{
    int i;

    if (!migrate_use_multifd()) {
        return;
    }
    /* flush the pages that are queued but not yet handed to a channel */
    if (multifd_send_state->pages->used) {
        if (multifd_send_pages(rs) < 0) {
            error_report("%s: multifd_send_pages fail", __func__);
            return;
        }
    }
    /* first pass: queue a SYNC job on every channel */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        trace_multifd_send_sync_main_signal(p->id);

        qemu_mutex_lock(&p->mutex);

        if (p->quit) {
            error_report("%s: channel %d has already quit", __func__, i);
            qemu_mutex_unlock(&p->mutex);
            return;
        }

        p->packet_num = multifd_send_state->packet_num++;
        p->flags |= MULTIFD_FLAG_SYNC;
        p->pending_job++;
        /* account for the packet header that the channel will send */
        qemu_file_update_transfer(rs->f, p->packet_len);
        ram_counters.multifd_bytes += p->packet_len;
        ram_counters.transferred += p->packet_len;
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_post(&p->sem);
    }
    /* second pass: wait until every channel has posted its sem_sync */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        trace_multifd_send_sync_main_wait(p->id);
        qemu_sem_wait(&p->sem_sync);
    }
    trace_multifd_send_sync_main(multifd_send_state->packet_num);
}
1103
/**
 * multifd_send_thread: body of a multifd send channel thread
 *
 * Sends the initial packet, then loops waiting on the channel semaphore.
 * For every pending job it fills the packet header, writes it to the
 * channel followed by the queued pages (if any), and signals completion.
 * The loop ends when the channel is asked to quit or when a write fails.
 *
 * Always returns NULL
 *
 * @opaque: the MultiFDSendParams of this channel
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;
    int ret = 0;
    uint32_t flags = 0;

    trace_multifd_send_thread_start(p->id);
    rcu_register_thread();

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            /* snapshot the job under the mutex, the I/O is done without it */
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            flags = p->flags;

            p->next_packet_size = used * qemu_target_page_size();
            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags,
                               p->next_packet_size);

            /* packet header first ... */
            ret = qio_channel_write_all(p->c, (void *)p->packet,
                                        p->packet_len, &local_err);
            if (ret != 0) {
                break;
            }

            /* ... then the page contents, if this packet carries any */
            if (used) {
                ret = qio_channel_writev_all(p->c, p->pages->iov,
                                             used, &local_err);
                if (ret != 0) {
                    break;
                }
            }

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            /* wake up multifd_send_sync_main() if this was a sync job */
            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&p->sem_sync);
            }
            /* this channel can take a new job */
            qemu_sem_post(&multifd_send_state->channels_ready);
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    /* record the error and ask all the other channels to quit as well */
    if (local_err) {
        trace_multifd_send_error(p->id);
        multifd_send_terminate_threads(local_err);
    }

    /*
     * Error happen, I will exit, but I can't just leave, tell
     * who pay attention to me.
     */
    if (ret != 0) {
        if (flags & MULTIFD_FLAG_SYNC) {
            qemu_sem_post(&p->sem_sync);
        }
        qemu_sem_post(&multifd_send_state->channels_ready);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
1197
1198 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1199 {
1200     MultiFDSendParams *p = opaque;
1201     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1202     Error *local_err = NULL;
1203
1204     trace_multifd_new_send_channel_async(p->id);
1205     if (qio_task_propagate_error(task, &local_err)) {
1206         migrate_set_error(migrate_get_current(), local_err);
1207         multifd_save_cleanup();
1208     } else {
1209         p->c = QIO_CHANNEL(sioc);
1210         qio_channel_set_delay(p->c, false);
1211         p->running = true;
1212         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1213                            QEMU_THREAD_JOINABLE);
1214     }
1215 }
1216
1217 int multifd_save_setup(void)
1218 {
1219     int thread_count;
1220     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1221     uint8_t i;
1222
1223     if (!migrate_use_multifd()) {
1224         return 0;
1225     }
1226     thread_count = migrate_multifd_channels();
1227     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1228     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1229     multifd_send_state->pages = multifd_pages_init(page_count);
1230     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1231
1232     for (i = 0; i < thread_count; i++) {
1233         MultiFDSendParams *p = &multifd_send_state->params[i];
1234
1235         qemu_mutex_init(&p->mutex);
1236         qemu_sem_init(&p->sem, 0);
1237         qemu_sem_init(&p->sem_sync, 0);
1238         p->quit = false;
1239         p->pending_job = 0;
1240         p->id = i;
1241         p->pages = multifd_pages_init(page_count);
1242         p->packet_len = sizeof(MultiFDPacket_t)
1243                       + sizeof(ram_addr_t) * page_count;
1244         p->packet = g_malloc0(p->packet_len);
1245         p->name = g_strdup_printf("multifdsend_%d", i);
1246         socket_send_channel_create(multifd_new_send_channel_async, p);
1247     }
1248     return 0;
1249 }
1250
/* Global state of the multifd receiving side */
struct {
    /* per-channel receive parameters, indexed by channel id */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1260
1261 static void multifd_recv_terminate_threads(Error *err)
1262 {
1263     int i;
1264
1265     trace_multifd_recv_terminate_threads(err != NULL);
1266
1267     if (err) {
1268         MigrationState *s = migrate_get_current();
1269         migrate_set_error(s, err);
1270         if (s->state == MIGRATION_STATUS_SETUP ||
1271             s->state == MIGRATION_STATUS_ACTIVE) {
1272             migrate_set_state(&s->state, s->state,
1273                               MIGRATION_STATUS_FAILED);
1274         }
1275     }
1276
1277     for (i = 0; i < migrate_multifd_channels(); i++) {
1278         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1279
1280         qemu_mutex_lock(&p->mutex);
1281         p->quit = true;
1282         /* We could arrive here for two reasons:
1283            - normal quit, i.e. everything went fine, just finished
1284            - error quit: We close the channels so the channel threads
1285              finish the qio_channel_read_all_eof() */
1286         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1287         qemu_mutex_unlock(&p->mutex);
1288     }
1289 }
1290
/**
 * multifd_load_cleanup: tear down the multifd receiving side
 *
 * Does nothing when multifd is not in use.  Otherwise asks every channel
 * thread to quit, joins the ones still marked as running, releases the
 * per-channel resources and finally frees the global multifd_recv_state.
 *
 * Returns 0 (ret is never changed after its initialisation)
 *
 * @errp: not used by this function
 */
int multifd_load_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    /* no error to record: just set ->quit and shut the channels down */
    multifd_recv_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        if (p->running) {
            p->quit = true;
            /*
             * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
             * however try to wakeup it without harm in cleanup phase.
             */
            qemu_sem_post(&p->sem_sync);
            qemu_thread_join(&p->thread);
        }
        /* drop the channel reference held in p->c */
        object_unref(OBJECT(p->c));
        p->c = NULL;
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem_sync);
        g_free(p->name);
        p->name = NULL;
        multifd_pages_clear(p->pages);
        p->pages = NULL;
        p->packet_len = 0;
        g_free(p->packet);
        p->packet = NULL;
    }
    /* per-channel state is gone, now release the global state */
    qemu_sem_destroy(&multifd_recv_state->sem_sync);
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state);
    multifd_recv_state = NULL;

    return ret;
}
1332
1333 static void multifd_recv_sync_main(void)
1334 {
1335     int i;
1336
1337     if (!migrate_use_multifd()) {
1338         return;
1339     }
1340     for (i = 0; i < migrate_multifd_channels(); i++) {
1341         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1342
1343         trace_multifd_recv_sync_main_wait(p->id);
1344         qemu_sem_wait(&multifd_recv_state->sem_sync);
1345     }
1346     for (i = 0; i < migrate_multifd_channels(); i++) {
1347         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1348
1349         qemu_mutex_lock(&p->mutex);
1350         if (multifd_recv_state->packet_num < p->packet_num) {
1351             multifd_recv_state->packet_num = p->packet_num;
1352         }
1353         qemu_mutex_unlock(&p->mutex);
1354         trace_multifd_recv_sync_main_signal(p->id);
1355         qemu_sem_post(&p->sem_sync);
1356     }
1357     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1358 }
1359
/**
 * multifd_recv_thread: body of a multifd receive channel thread
 *
 * Loops reading a packet header from the channel, decoding it and then
 * reading the pages it announces.  After a packet flagged with
 * MULTIFD_FLAG_SYNC it notifies the main thread and waits to be released.
 * The loop ends on quit request, end of file or error.
 *
 * Always returns NULL
 *
 * @opaque: the MultiFDRecvParams of this channel
 */
static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_recv_thread_start(p->id);
    rcu_register_thread();

    while (true) {
        uint32_t used;
        uint32_t flags;

        if (p->quit) {
            break;
        }

        /* read one full packet header */
        ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
                                       p->packet_len, &local_err);
        if (ret == 0) {   /* EOF */
            break;
        }
        if (ret == -1) {   /* Error */
            break;
        }

        qemu_mutex_lock(&p->mutex);
        ret = multifd_recv_unfill_packet(p, &local_err);
        if (ret) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }

        /* snapshot under the mutex, the page I/O is done without it */
        used = p->pages->used;
        flags = p->flags;
        trace_multifd_recv(p->id, p->packet_num, used, flags,
                           p->next_packet_size);
        p->num_packets++;
        p->num_pages += used;
        qemu_mutex_unlock(&p->mutex);

        /* read the page contents through the iov of the pages struct */
        if (used) {
            ret = qio_channel_readv_all(p->c, p->pages->iov,
                                        used, &local_err);
            if (ret != 0) {
                break;
            }
        }

        /* tell multifd_recv_sync_main() and wait until it releases us */
        if (flags & MULTIFD_FLAG_SYNC) {
            qemu_sem_post(&multifd_recv_state->sem_sync);
            qemu_sem_wait(&p->sem_sync);
        }
    }

    /* record the error and ask all the other channels to quit as well */
    if (local_err) {
        multifd_recv_terminate_threads(local_err);
    }
    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
1427
1428 int multifd_load_setup(void)
1429 {
1430     int thread_count;
1431     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1432     uint8_t i;
1433
1434     if (!migrate_use_multifd()) {
1435         return 0;
1436     }
1437     thread_count = migrate_multifd_channels();
1438     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1439     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1440     atomic_set(&multifd_recv_state->count, 0);
1441     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1442
1443     for (i = 0; i < thread_count; i++) {
1444         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1445
1446         qemu_mutex_init(&p->mutex);
1447         qemu_sem_init(&p->sem_sync, 0);
1448         p->quit = false;
1449         p->id = i;
1450         p->pages = multifd_pages_init(page_count);
1451         p->packet_len = sizeof(MultiFDPacket_t)
1452                       + sizeof(ram_addr_t) * page_count;
1453         p->packet = g_malloc0(p->packet_len);
1454         p->name = g_strdup_printf("multifdrecv_%d", i);
1455     }
1456     return 0;
1457 }
1458
1459 bool multifd_recv_all_channels_created(void)
1460 {
1461     int thread_count = migrate_multifd_channels();
1462
1463     if (!migrate_use_multifd()) {
1464         return true;
1465     }
1466
1467     return thread_count == atomic_read(&multifd_recv_state->count);
1468 }
1469
/*
 * Try to receive all multifd channels to get ready for the migration.
 * - Return true and do not set @errp when correctly receiving all channels;
 * - Return false and do not set @errp when correctly receiving the current one;
 * - Return false and set @errp when failing to receive the current channel.
 *
 * @ioc: the newly connected channel
 * @errp: where to store the error, if any
 */
bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
    MultiFDRecvParams *p;
    Error *local_err = NULL;
    int id;

    /* the initial packet tells us which channel id this connection is */
    id = multifd_recv_initial_packet(ioc, &local_err);
    if (id < 0) {
        multifd_recv_terminate_threads(local_err);
        error_propagate_prepend(errp, local_err,
                                "failed to receive packet"
                                " via multifd channel %d: ",
                                atomic_read(&multifd_recv_state->count));
        return false;
    }
    trace_multifd_recv_new_channel(id);

    p = &multifd_recv_state->params[id];
    /* a channel that already has an I/O channel means a duplicated id */
    if (p->c != NULL) {
        error_setg(&local_err, "multifd: received id '%d' already setup'",
                   id);
        multifd_recv_terminate_threads(local_err);
        error_propagate(errp, local_err);
        return false;
    }
    p->c = ioc;
    /* take a reference for p->c; multifd_load_cleanup() drops it */
    object_ref(OBJECT(ioc));
    /* initial packet */
    p->num_packets = 1;

    p->running = true;
    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                       QEMU_THREAD_JOINABLE);
    atomic_inc(&multifd_recv_state->count);
    return atomic_read(&multifd_recv_state->count) ==
           migrate_multifd_channels();
}
1513
1514 /**
1515  * save_page_header: write page header to wire
1516  *
1517  * If this is the 1st block, it also writes the block identification
1518  *
1519  * Returns the number of bytes written
1520  *
1521  * @f: QEMUFile where to send the data
1522  * @block: block that contains the page we want to send
1523  * @offset: offset inside the block for the page
1524  *          in the lower bits, it contains flags
1525  */
1526 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1527                                ram_addr_t offset)
1528 {
1529     size_t size, len;
1530
1531     if (block == rs->last_sent_block) {
1532         offset |= RAM_SAVE_FLAG_CONTINUE;
1533     }
1534     qemu_put_be64(f, offset);
1535     size = 8;
1536
1537     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1538         len = strlen(block->idstr);
1539         qemu_put_byte(f, len);
1540         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1541         size += 1 + len;
1542         rs->last_sent_block = block;
1543     }
1544     return size;
1545 }
1546
1547 /**
1548  * mig_throttle_guest_down: throotle down the guest
1549  *
1550  * Reduce amount of guest cpu execution to hopefully slow down memory
1551  * writes. If guest dirty memory rate is reduced below the rate at
1552  * which we can transfer pages to the destination then we should be
1553  * able to complete migration. Some workloads dirty memory way too
1554  * fast and will not effectively converge, even with auto-converge.
1555  */
1556 static void mig_throttle_guest_down(void)
1557 {
1558     MigrationState *s = migrate_get_current();
1559     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1560     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1561     int pct_max = s->parameters.max_cpu_throttle;
1562
1563     /* We have not started throttling yet. Let's start it. */
1564     if (!cpu_throttle_active()) {
1565         cpu_throttle_set(pct_initial);
1566     } else {
1567         /* Throttling already on, just increase the rate */
1568         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1569                          pct_max));
1570     }
1571 }
1572
1573 /**
1574  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1575  *
1576  * @rs: current RAM state
1577  * @current_addr: address for the zero page
1578  *
1579  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1580  * The important thing is that a stale (not-yet-0'd) page be replaced
1581  * by the new data.
1582  * As a bonus, if the page wasn't in the cache it gets added so that
1583  * when a small write is made into the 0'd page it gets XBZRLE sent.
1584  */
1585 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1586 {
1587     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1588         return;
1589     }
1590
1591     /* We don't care if this fails to allocate a new cache page
1592      * as long as it updated an old one */
1593     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1594                  ram_counters.dirty_sync_count);
1595 }
1596
1597 #define ENCODING_FLAG_XBZRLE 0x1
1598
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal, or that the
 *             page was not in the cache; nothing has been written
 *
 * On return *@current_data may have been redirected to the cached copy of
 * the page.
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* cache miss: nothing to diff against */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        /* the cache is not populated any more during the last stage */
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* header + payload + 1 byte encoding flag + 2 bytes length */
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
1682
1683 /**
1684  * migration_bitmap_find_dirty: find the next dirty page from start
1685  *
1686  * Returns the page offset within memory region of the start of a dirty page
1687  *
1688  * @rs: current RAM state
1689  * @rb: RAMBlock where to search for dirty pages
1690  * @start: page where we start the search
1691  */
1692 static inline
1693 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1694                                           unsigned long start)
1695 {
1696     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1697     unsigned long *bitmap = rb->bmap;
1698     unsigned long next;
1699
1700     if (ramblock_is_ignored(rb)) {
1701         return size;
1702     }
1703
1704     /*
1705      * When the free page optimization is enabled, we need to check the bitmap
1706      * to send the non-free pages rather than all the pages in the bulk stage.
1707      */
1708     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1709         next = start + 1;
1710     } else {
1711         next = find_next_bit(bitmap, size, start);
1712     }
1713
1714     return next;
1715 }
1716
1717 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1718                                                 RAMBlock *rb,
1719                                                 unsigned long page)
1720 {
1721     bool ret;
1722
1723     qemu_mutex_lock(&rs->bitmap_mutex);
1724
1725     /*
1726      * Clear dirty bitmap if needed.  This _must_ be called before we
1727      * send any of the page in the chunk because we need to make sure
1728      * we can capture further page content changes when we sync dirty
1729      * log the next time.  So as long as we are going to send any of
1730      * the page in the chunk we clear the remote dirty bitmap for all.
1731      * Clearing it earlier won't be a problem, but too late will.
1732      */
1733     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1734         uint8_t shift = rb->clear_bmap_shift;
1735         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1736         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1737
1738         /*
1739          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1740          * can make things easier sometimes since then start address
1741          * of the small chunk will always be 64 pages aligned so the
1742          * bitmap will always be aligned to unsigned long.  We should
1743          * even be able to remove this restriction but I'm simply
1744          * keeping it.
1745          */
1746         assert(shift >= 6);
1747         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1748         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1749     }
1750
1751     ret = test_and_clear_bit(page, rb->bmap);
1752
1753     if (ret) {
1754         rs->migration_dirty_pages--;
1755     }
1756     qemu_mutex_unlock(&rs->bitmap_mutex);
1757
1758     return ret;
1759 }
1760
1761 /* Called with RCU critical section */
1762 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1763 {
1764     rs->migration_dirty_pages +=
1765         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1766                                               &rs->num_dirty_pages_period);
1767 }
1768
1769 /**
1770  * ram_pagesize_summary: calculate all the pagesizes of a VM
1771  *
1772  * Returns a summary bitmap of the page sizes of all RAMBlocks
1773  *
1774  * For VMs with just normal pages this is equivalent to the host page
1775  * size. If it's got some huge pages then it's the OR of all the
1776  * different page sizes.
1777  */
1778 uint64_t ram_pagesize_summary(void)
1779 {
1780     RAMBlock *block;
1781     uint64_t summary = 0;
1782
1783     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1784         summary |= block->page_size;
1785     }
1786
1787     return summary;
1788 }
1789
1790 uint64_t ram_get_total_transferred_pages(void)
1791 {
1792     return  ram_counters.normal + ram_counters.duplicate +
1793                 compression_counters.pages + xbzrle_counters.pages;
1794 }
1795
1796 static void migration_update_rates(RAMState *rs, int64_t end_time)
1797 {
1798     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1799     double compressed_size;
1800
1801     /* calculate period counters */
1802     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1803                 / (end_time - rs->time_last_bitmap_sync);
1804
1805     if (!page_count) {
1806         return;
1807     }
1808
1809     if (migrate_use_xbzrle()) {
1810         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1811             rs->xbzrle_cache_miss_prev) / page_count;
1812         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1813     }
1814
1815     if (migrate_use_compression()) {
1816         compression_counters.busy_rate = (double)(compression_counters.busy -
1817             rs->compress_thread_busy_prev) / page_count;
1818         rs->compress_thread_busy_prev = compression_counters.busy;
1819
1820         compressed_size = compression_counters.compressed_size -
1821                           rs->compressed_size_prev;
1822         if (compressed_size) {
1823             double uncompressed_size = (compression_counters.pages -
1824                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1825
1826             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1827             compression_counters.compression_rate =
1828                                         uncompressed_size / compressed_size;
1829
1830             rs->compress_pages_prev = compression_counters.pages;
1831             rs->compressed_size_prev = compression_counters.compressed_size;
1832         }
1833     }
1834 }
1835
/*
 * migration_bitmap_sync: synchronise the dirty bitmap of every non-ignored
 * RAMBlock and update the per-period statistics
 *
 * Each call bumps ram_counters.dirty_sync_count.  The per-block sync and the
 * update of ram_counters.remaining run with rs->bitmap_mutex held inside an
 * RCU read-side section.  When more than one second has elapsed since the
 * last period boundary, the auto-converge check runs, the rates are
 * recomputed and the period counters are reset.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* First call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ramblock_sync_dirty_bitmap(rs, block);
    }
    ram_counters.remaining = ram_bytes_remaining();
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes exceed half of the approx.
               amount of bytes that just got transferred since the last time
               we were in this routine. The second time that happens, start
               or increase throttling and restart the count */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        /* Must run before the period counters below are reset */
        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    /* Report the pass number, whether or not a period just ended */
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1901
1902 static void migration_bitmap_sync_precopy(RAMState *rs)
1903 {
1904     Error *local_err = NULL;
1905
1906     /*
1907      * The current notifier usage is just an optimization to migration, so we
1908      * don't stop the normal migration process in the error case.
1909      */
1910     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1911         error_report_err(local_err);
1912     }
1913
1914     migration_bitmap_sync(rs);
1915
1916     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1917         error_report_err(local_err);
1918     }
1919 }
1920
1921 /**
1922  * save_zero_page_to_file: send the zero page to the file
1923  *
1924  * Returns the size of data written to the file, 0 means the page is not
1925  * a zero page
1926  *
1927  * @rs: current RAM state
1928  * @file: the file where the data is saved
1929  * @block: block that contains the page we want to send
1930  * @offset: offset inside the block for the page
1931  */
1932 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1933                                   RAMBlock *block, ram_addr_t offset)
1934 {
1935     uint8_t *p = block->host + offset;
1936     int len = 0;
1937
1938     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1939         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1940         qemu_put_byte(file, 0);
1941         len += 1;
1942     }
1943     return len;
1944 }
1945
1946 /**
1947  * save_zero_page: send the zero page to the stream
1948  *
1949  * Returns the number of pages written.
1950  *
1951  * @rs: current RAM state
1952  * @block: block that contains the page we want to send
1953  * @offset: offset inside the block for the page
1954  */
1955 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1956 {
1957     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1958
1959     if (len) {
1960         ram_counters.duplicate++;
1961         ram_counters.transferred += len;
1962         return 1;
1963     }
1964     return -1;
1965 }
1966
1967 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1968 {
1969     if (!migrate_release_ram() || !migration_in_postcopy()) {
1970         return;
1971     }
1972
1973     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1974 }
1975
1976 /*
1977  * @pages: the number of pages written by the control path,
1978  *        < 0 - error
1979  *        > 0 - number of pages written
1980  *
1981  * Return true if the pages has been saved, otherwise false is returned.
1982  */
1983 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1984                               int *pages)
1985 {
1986     uint64_t bytes_xmit = 0;
1987     int ret;
1988
1989     *pages = -1;
1990     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1991                                 &bytes_xmit);
1992     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1993         return false;
1994     }
1995
1996     if (bytes_xmit) {
1997         ram_counters.transferred += bytes_xmit;
1998         *pages = 1;
1999     }
2000
2001     if (ret == RAM_SAVE_CONTROL_DELAYED) {
2002         return true;
2003     }
2004
2005     if (bytes_xmit > 0) {
2006         ram_counters.normal++;
2007     } else if (bytes_xmit == 0) {
2008         ram_counters.duplicate++;
2009     }
2010
2011     return true;
2012 }
2013
2014 /*
2015  * directly send the page to the stream
2016  *
2017  * Returns the number of pages written.
2018  *
2019  * @rs: current RAM state
2020  * @block: block that contains the page we want to send
2021  * @offset: offset inside the block for the page
2022  * @buf: the page to be sent
2023  * @async: send to page asyncly
2024  */
2025 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2026                             uint8_t *buf, bool async)
2027 {
2028     ram_counters.transferred += save_page_header(rs, rs->f, block,
2029                                                  offset | RAM_SAVE_FLAG_PAGE);
2030     if (async) {
2031         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2032                               migrate_release_ram() &
2033                               migration_in_postcopy());
2034     } else {
2035         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2036     }
2037     ram_counters.transferred += TARGET_PAGE_SIZE;
2038     ram_counters.normal++;
2039     return 1;
2040 }
2041
2042 /**
2043  * ram_save_page: send the given page to the stream
2044  *
2045  * Returns the number of pages written.
2046  *          < 0 - error
2047  *          >=0 - Number of pages written - this might legally be 0
2048  *                if xbzrle noticed the page was the same.
2049  *
2050  * @rs: current RAM state
2051  * @block: block that contains the page we want to send
2052  * @offset: offset inside the block for the page
2053  * @last_stage: if we are at the completion stage
2054  */
2055 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2056 {
2057     int pages = -1;
2058     uint8_t *p;
2059     bool send_async = true;
2060     RAMBlock *block = pss->block;
2061     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2062     ram_addr_t current_addr = block->offset + offset;
2063
2064     p = block->host + offset;
2065     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2066
2067     XBZRLE_cache_lock();
2068     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2069         migrate_use_xbzrle()) {
2070         pages = save_xbzrle_page(rs, &p, current_addr, block,
2071                                  offset, last_stage);
2072         if (!last_stage) {
2073             /* Can't send this cached data async, since the cache page
2074              * might get updated before it gets to the wire
2075              */
2076             send_async = false;
2077         }
2078     }
2079
2080     /* XBZRLE overflow or normal page */
2081     if (pages == -1) {
2082         pages = save_normal_page(rs, block, offset, p, send_async);
2083     }
2084
2085     XBZRLE_cache_unlock();
2086
2087     return pages;
2088 }
2089
2090 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2091                                  ram_addr_t offset)
2092 {
2093     if (multifd_queue_page(rs, block, offset) < 0) {
2094         return -1;
2095     }
2096     ram_counters.normal++;
2097
2098     return 1;
2099 }
2100
2101 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2102                                  ram_addr_t offset, uint8_t *source_buf)
2103 {
2104     RAMState *rs = ram_state;
2105     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2106     bool zero_page = false;
2107     int ret;
2108
2109     if (save_zero_page_to_file(rs, f, block, offset)) {
2110         zero_page = true;
2111         goto exit;
2112     }
2113
2114     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2115
2116     /*
2117      * copy it to a internal buffer to avoid it being modified by VM
2118      * so that we can catch up the error during compression and
2119      * decompression
2120      */
2121     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2122     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2123     if (ret < 0) {
2124         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2125         error_report("compressed data failed!");
2126         return false;
2127     }
2128
2129 exit:
2130     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2131     return zero_page;
2132 }
2133
2134 static void
2135 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2136 {
2137     ram_counters.transferred += bytes_xmit;
2138
2139     if (param->zero_page) {
2140         ram_counters.duplicate++;
2141         return;
2142     }
2143
2144     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2145     compression_counters.compressed_size += bytes_xmit - 8;
2146     compression_counters.pages++;
2147 }
2148
/*
 * Forward declaration: flush_compressed_data() calls this function, whose
 * definition only appears further down in the file.
 */
static bool save_page_use_compression(RAMState *rs);
2150
2151 static void flush_compressed_data(RAMState *rs)
2152 {
2153     int idx, len, thread_count;
2154
2155     if (!save_page_use_compression(rs)) {
2156         return;
2157     }
2158     thread_count = migrate_compress_threads();
2159
2160     qemu_mutex_lock(&comp_done_lock);
2161     for (idx = 0; idx < thread_count; idx++) {
2162         while (!comp_param[idx].done) {
2163             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2164         }
2165     }
2166     qemu_mutex_unlock(&comp_done_lock);
2167
2168     for (idx = 0; idx < thread_count; idx++) {
2169         qemu_mutex_lock(&comp_param[idx].mutex);
2170         if (!comp_param[idx].quit) {
2171             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2172             /*
2173              * it's safe to fetch zero_page without holding comp_done_lock
2174              * as there is no further request submitted to the thread,
2175              * i.e, the thread should be waiting for a request at this point.
2176              */
2177             update_compress_thread_counts(&comp_param[idx], len);
2178         }
2179         qemu_mutex_unlock(&comp_param[idx].mutex);
2180     }
2181 }
2182
2183 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2184                                        ram_addr_t offset)
2185 {
2186     param->block = block;
2187     param->offset = offset;
2188 }
2189
/*
 * compress_page_with_multi_thread: hand the page to an idle compression
 * thread
 *
 * Scans comp_param[] under comp_done_lock for a thread whose 'done' flag is
 * set.  The data buffered in that thread's file is moved into rs->f, the new
 * (block, offset) request is stored under the thread's own mutex and the
 * thread is signalled.
 *
 * Returns 1 when a thread accepted the page, -1 when no thread was free and
 * 'compress-wait-thread' is not set.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            /* Claim the thread before handing it the new request */
            comp_param[idx].done = false;
            /*
             * NOTE(review): presumably this is the output of the thread's
             * previous request - confirm against the worker thread code.
             */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        /* comp_done_lock is still held here, as qemu_cond_wait() requires */
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
2225
2226 /**
2227  * find_dirty_block: find the next dirty page and update any state
2228  * associated with the search process.
2229  *
2230  * Returns true if a page is found
2231  *
2232  * @rs: current RAM state
2233  * @pss: data about the state of the current dirty page scan
2234  * @again: set to false if the search has scanned the whole of RAM
2235  */
2236 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2237 {
2238     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2239     if (pss->complete_round && pss->block == rs->last_seen_block &&
2240         pss->page >= rs->last_page) {
2241         /*
2242          * We've been once around the RAM and haven't found anything.
2243          * Give up.
2244          */
2245         *again = false;
2246         return false;
2247     }
2248     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2249         /* Didn't find anything in this RAM Block */
2250         pss->page = 0;
2251         pss->block = QLIST_NEXT_RCU(pss->block, next);
2252         if (!pss->block) {
2253             /*
2254              * If memory migration starts over, we will meet a dirtied page
2255              * which may still exists in compression threads's ring, so we
2256              * should flush the compressed data to make sure the new page
2257              * is not overwritten by the old one in the destination.
2258              *
2259              * Also If xbzrle is on, stop using the data compression at this
2260              * point. In theory, xbzrle can do better than compression.
2261              */
2262             flush_compressed_data(rs);
2263
2264             /* Hit the end of the list */
2265             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2266             /* Flag that we've looped */
2267             pss->complete_round = true;
2268             rs->ram_bulk_stage = false;
2269         }
2270         /* Didn't find anything this time, but try again on the new block */
2271         *again = true;
2272         return false;
2273     } else {
2274         /* Can go around again, but... */
2275         *again = true;
2276         /* We've found something so probably don't need to */
2277         return true;
2278     }
2279 }
2280
2281 /**
2282  * unqueue_page: gets a page of the queue
2283  *
2284  * Helper for 'get_queued_page' - gets a page off the queue
2285  *
2286  * Returns the block of the page (or NULL if none available)
2287  *
2288  * @rs: current RAM state
2289  * @offset: used to return the offset within the RAMBlock
2290  */
2291 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2292 {
2293     RAMBlock *block = NULL;
2294
2295     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2296         return NULL;
2297     }
2298
2299     qemu_mutex_lock(&rs->src_page_req_mutex);
2300     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2301         struct RAMSrcPageRequest *entry =
2302                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2303         block = entry->rb;
2304         *offset = entry->offset;
2305
2306         if (entry->len > TARGET_PAGE_SIZE) {
2307             entry->len -= TARGET_PAGE_SIZE;
2308             entry->offset += TARGET_PAGE_SIZE;
2309         } else {
2310             memory_region_unref(block->mr);
2311             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2312             g_free(entry);
2313             migration_consume_urgent_request();
2314         }
2315     }
2316     qemu_mutex_unlock(&rs->src_page_req_mutex);
2317
2318     return block;
2319 }
2320
2321 /**
2322  * get_queued_page: unqueue a page from the postcopy requests
2323  *
2324  * Skips pages that are already sent (!dirty)
2325  *
2326  * Returns true if a queued page is found
2327  *
2328  * @rs: current RAM state
2329  * @pss: data about the state of the current dirty page scan
2330  */
2331 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2332 {
2333     RAMBlock  *block;
2334     ram_addr_t offset;
2335     bool dirty;
2336
2337     do {
2338         block = unqueue_page(rs, &offset);
2339         /*
2340          * We're sending this page, and since it's postcopy nothing else
2341          * will dirty it, and we must make sure it doesn't get sent again
2342          * even if this queue request was received after the background
2343          * search already sent it.
2344          */
2345         if (block) {
2346             unsigned long page;
2347
2348             page = offset >> TARGET_PAGE_BITS;
2349             dirty = test_bit(page, block->bmap);
2350             if (!dirty) {
2351                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2352                        page, test_bit(page, block->unsentmap));
2353             } else {
2354                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2355             }
2356         }
2357
2358     } while (block && !dirty);
2359
2360     if (block) {
2361         /*
2362          * As soon as we start servicing pages out of order, then we have
2363          * to kill the bulk stage, since the bulk stage assumes
2364          * in (migration_bitmap_find_and_reset_dirty) that every page is
2365          * dirty, that's no longer true.
2366          */
2367         rs->ram_bulk_stage = false;
2368
2369         /*
2370          * We want the background search to continue from the queued page
2371          * since the guest is likely to want other pages near to the page
2372          * it just requested.
2373          */
2374         pss->block = block;
2375         pss->page = offset >> TARGET_PAGE_BITS;
2376
2377         /*
2378          * This unqueued page would break the "one round" check, even is
2379          * really rare.
2380          */
2381         pss->complete_round = false;
2382     }
2383
2384     return !!block;
2385 }
2386
2387 /**
2388  * migration_page_queue_free: drop any remaining pages in the ram
2389  * request queue
2390  *
2391  * It should be empty at the end anyway, but in error cases there may
2392  * be some left.  in case that there is any page left, we drop it.
2393  *
2394  */
2395 static void migration_page_queue_free(RAMState *rs)
2396 {
2397     struct RAMSrcPageRequest *mspr, *next_mspr;
2398     /* This queue generally should be empty - but in the case of a failed
2399      * migration might have some droppings in.
2400      */
2401     rcu_read_lock();
2402     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2403         memory_region_unref(mspr->rb->mr);
2404         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2405         g_free(mspr);
2406     }
2407     rcu_read_unlock();
2408 }
2409
2410 /**
2411  * ram_save_queue_pages: queue the page for transmission
2412  *
2413  * A request from postcopy destination for example.
2414  *
2415  * Returns zero on success or negative on error
2416  *
2417  * @rbname: Name of the RAMBLock of the request. NULL means the
2418  *          same that last one.
2419  * @start: starting address from the start of the RAMBlock
2420  * @len: length (in bytes) to send
2421  */
2422 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2423 {
2424     RAMBlock *ramblock;
2425     RAMState *rs = ram_state;
2426
2427     ram_counters.postcopy_requests++;
2428     rcu_read_lock();
2429     if (!rbname) {
2430         /* Reuse last RAMBlock */
2431         ramblock = rs->last_req_rb;
2432
2433         if (!ramblock) {
2434             /*
2435              * Shouldn't happen, we can't reuse the last RAMBlock if
2436              * it's the 1st request.
2437              */
2438             error_report("ram_save_queue_pages no previous block");
2439             goto err;
2440         }
2441     } else {
2442         ramblock = qemu_ram_block_by_name(rbname);
2443
2444         if (!ramblock) {
2445             /* We shouldn't be asked for a non-existent RAMBlock */
2446             error_report("ram_save_queue_pages no block '%s'", rbname);
2447             goto err;
2448         }
2449         rs->last_req_rb = ramblock;
2450     }
2451     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2452     if (start+len > ramblock->used_length) {
2453         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2454                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2455                      __func__, start, len, ramblock->used_length);
2456         goto err;
2457     }
2458
2459     struct RAMSrcPageRequest *new_entry =
2460         g_malloc0(sizeof(struct RAMSrcPageRequest));
2461     new_entry->rb = ramblock;
2462     new_entry->offset = start;
2463     new_entry->len = len;
2464
2465     memory_region_ref(ramblock->mr);
2466     qemu_mutex_lock(&rs->src_page_req_mutex);
2467     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2468     migration_make_urgent_request();
2469     qemu_mutex_unlock(&rs->src_page_req_mutex);
2470     rcu_read_unlock();
2471
2472     return 0;
2473
2474 err:
2475     rcu_read_unlock();
2476     return -1;
2477 }
2478
2479 static bool save_page_use_compression(RAMState *rs)
2480 {
2481     if (!migrate_use_compression()) {
2482         return false;
2483     }
2484
2485     /*
2486      * If xbzrle is on, stop using the data compression after first
2487      * round of migration even if compression is enabled. In theory,
2488      * xbzrle can do better than compression.
2489      */
2490     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2491         return true;
2492     }
2493
2494     return false;
2495 }
2496
2497 /*
2498  * try to compress the page before posting it out, return true if the page
2499  * has been properly handled by compression, otherwise needs other
2500  * paths to handle it
2501  */
2502 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2503 {
2504     if (!save_page_use_compression(rs)) {
2505         return false;
2506     }
2507
2508     /*
2509      * When starting the process of a new block, the first page of
2510      * the block should be sent out before other pages in the same
2511      * block, and all the pages in last block should have been sent
2512      * out, keeping this order is important, because the 'cont' flag
2513      * is used to avoid resending the block name.
2514      *
2515      * We post the fist page as normal page as compression will take
2516      * much CPU resource.
2517      */
2518     if (block != rs->last_sent_block) {
2519         flush_compressed_data(rs);
2520         return false;
2521     }
2522
2523     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2524         return true;
2525     }
2526
2527     compression_counters.busy++;
2528     return false;
2529 }
2530
2531 /**
2532  * ram_save_target_page: save one target page
2533  *
2534  * Returns the number of pages written
2535  *
2536  * @rs: current RAM state
2537  * @pss: data about the page we want to send
2538  * @last_stage: if we are at the completion stage
2539  */
2540 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2541                                 bool last_stage)
2542 {
2543     RAMBlock *block = pss->block;
2544     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2545     int res;
2546
2547     if (control_save_page(rs, block, offset, &res)) {
2548         return res;
2549     }
2550
2551     if (save_compress_page(rs, block, offset)) {
2552         return 1;
2553     }
2554
2555     res = save_zero_page(rs, block, offset);
2556     if (res > 0) {
2557         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2558          * page would be stale
2559          */
2560         if (!save_page_use_compression(rs)) {
2561             XBZRLE_cache_lock();
2562             xbzrle_cache_zero_page(rs, block->offset + offset);
2563             XBZRLE_cache_unlock();
2564         }
2565         ram_release_pages(block->idstr, offset, res);
2566         return res;
2567     }
2568
2569     /*
2570      * do not use multifd for compression as the first page in the new
2571      * block should be posted out before sending the compressed page
2572      */
2573     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2574         return ram_save_multifd_page(rs, block, offset);
2575     }
2576
2577     return ram_save_page(rs, pss, last_stage);
2578 }
2579
2580 /**
2581  * ram_save_host_page: save a whole host page
2582  *
2583  * Starting at *offset send pages up to the end of the current host
2584  * page. It's valid for the initial offset to point into the middle of
2585  * a host page in which case the remainder of the hostpage is sent.
2586  * Only dirty target pages are sent. Note that the host page size may
2587  * be a huge page for this block.
2588  * The saving stops at the boundary of the used_length of the block
2589  * if the RAMBlock isn't a multiple of the host page size.
2590  *
2591  * Returns the number of pages written or negative on error
2592  *
2593  * @rs: current RAM state
2594  * @ms: current migration state
2595  * @pss: data about the page we want to send
2596  * @last_stage: if we are at the completion stage
2597  */
2598 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2599                               bool last_stage)
2600 {
2601     int tmppages, pages = 0;
2602     size_t pagesize_bits =
2603         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2604
2605     if (ramblock_is_ignored(pss->block)) {
2606         error_report("block %s should not be migrated !", pss->block->idstr);
2607         return 0;
2608     }
2609
2610     do {
2611         /* Check the pages is dirty and if it is send it */
2612         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2613             pss->page++;
2614             continue;
2615         }
2616
2617         tmppages = ram_save_target_page(rs, pss, last_stage);
2618         if (tmppages < 0) {
2619             return tmppages;
2620         }
2621
2622         pages += tmppages;
2623         if (pss->block->unsentmap) {
2624             clear_bit(pss->page, pss->block->unsentmap);
2625         }
2626
2627         pss->page++;
2628     } while ((pss->page & (pagesize_bits - 1)) &&
2629              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2630
2631     /* The offset we leave with is the last one we looked at */
2632     pss->page--;
2633     return pages;
2634 }
2635
2636 /**
2637  * ram_find_and_save_block: finds a dirty page and sends it to f
2638  *
2639  * Called within an RCU critical section.
2640  *
2641  * Returns the number of pages written where zero means no dirty pages,
2642  * or negative on error
2643  *
2644  * @rs: current RAM state
2645  * @last_stage: if we are at the completion stage
2646  *
2647  * On systems where host-page-size > target-page-size it will send all the
2648  * pages in a host page that are dirty.
2649  */
2650
2651 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2652 {
2653     PageSearchStatus pss;
2654     int pages = 0;
2655     bool again, found;
2656
2657     /* No dirty page as there is zero RAM */
2658     if (!ram_bytes_total()) {
2659         return pages;
2660     }
2661
2662     pss.block = rs->last_seen_block;
2663     pss.page = rs->last_page;
2664     pss.complete_round = false;
2665
2666     if (!pss.block) {
2667         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2668     }
2669
2670     do {
2671         again = true;
2672         found = get_queued_page(rs, &pss);
2673
2674         if (!found) {
2675             /* priority queue empty, so just search for something dirty */
2676             found = find_dirty_block(rs, &pss, &again);
2677         }
2678
2679         if (found) {
2680             pages = ram_save_host_page(rs, &pss, last_stage);
2681         }
2682     } while (!pages && again);
2683
2684     rs->last_seen_block = pss.block;
2685     rs->last_page = pss.page;
2686
2687     return pages;
2688 }
2689
2690 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2691 {
2692     uint64_t pages = size / TARGET_PAGE_SIZE;
2693
2694     if (zero) {
2695         ram_counters.duplicate += pages;
2696     } else {
2697         ram_counters.normal += pages;
2698         ram_counters.transferred += size;
2699         qemu_update_position(f, size);
2700     }
2701 }
2702
2703 static uint64_t ram_bytes_total_common(bool count_ignored)
2704 {
2705     RAMBlock *block;
2706     uint64_t total = 0;
2707
2708     rcu_read_lock();
2709     if (count_ignored) {
2710         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2711             total += block->used_length;
2712         }
2713     } else {
2714         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2715             total += block->used_length;
2716         }
2717     }
2718     rcu_read_unlock();
2719     return total;
2720 }
2721
2722 uint64_t ram_bytes_total(void)
2723 {
2724     return ram_bytes_total_common(false);
2725 }
2726
2727 static void xbzrle_load_setup(void)
2728 {
2729     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2730 }
2731
2732 static void xbzrle_load_cleanup(void)
2733 {
2734     g_free(XBZRLE.decoded_buf);
2735     XBZRLE.decoded_buf = NULL;
2736 }
2737
2738 static void ram_state_cleanup(RAMState **rsp)
2739 {
2740     if (*rsp) {
2741         migration_page_queue_free(*rsp);
2742         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2743         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2744         g_free(*rsp);
2745         *rsp = NULL;
2746     }
2747 }
2748
2749 static void xbzrle_cleanup(void)
2750 {
2751     XBZRLE_cache_lock();
2752     if (XBZRLE.cache) {
2753         cache_fini(XBZRLE.cache);
2754         g_free(XBZRLE.encoded_buf);
2755         g_free(XBZRLE.current_buf);
2756         g_free(XBZRLE.zero_target_page);
2757         XBZRLE.cache = NULL;
2758         XBZRLE.encoded_buf = NULL;
2759         XBZRLE.current_buf = NULL;
2760         XBZRLE.zero_target_page = NULL;
2761     }
2762     XBZRLE_cache_unlock();
2763 }
2764
2765 static void ram_save_cleanup(void *opaque)
2766 {
2767     RAMState **rsp = opaque;
2768     RAMBlock *block;
2769
2770     /* caller have hold iothread lock or is in a bh, so there is
2771      * no writing race against the migration bitmap
2772      */
2773     memory_global_dirty_log_stop();
2774
2775     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2776         g_free(block->clear_bmap);
2777         block->clear_bmap = NULL;
2778         g_free(block->bmap);
2779         block->bmap = NULL;
2780         g_free(block->unsentmap);
2781         block->unsentmap = NULL;
2782     }
2783
2784     xbzrle_cleanup();
2785     compress_threads_save_cleanup();
2786     ram_state_cleanup(rsp);
2787 }
2788
2789 static void ram_state_reset(RAMState *rs)
2790 {
2791     rs->last_seen_block = NULL;
2792     rs->last_sent_block = NULL;
2793     rs->last_page = 0;
2794     rs->last_version = ram_list.version;
2795     rs->ram_bulk_stage = true;
2796     rs->fpo_enabled = false;
2797 }
2798
/*
 * Time budget, in milliseconds, that ram_save_iterate() compares its
 * elapsed time against before leaving its page-sending loop.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
2800
/*
 * Print a bitmap to stderr, up to 128 bits per line, as a string of
 * '1' (bit set) and '.' (bit clear) prefixed by the index of the first
 * bit on that line.
 *
 * @todump: bitmap to print.  The migration bitmaps live in the individual
 *          RAMBlocks, so there is no default bitmap to fall back on:
 *          a NULL pointer prints nothing.
 * @expected: the value the bitmap is expected to be mostly full of;
 *            lines consisting only of this value are not printed.
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    uint64_t cur;
    uint64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; test_bit() must not be handed a NULL map. */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        uint64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits. */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2834
2835 /* **** functions for postcopy ***** */
2836
2837 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2838 {
2839     struct RAMBlock *block;
2840
2841     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2842         unsigned long *bitmap = block->bmap;
2843         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2844         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2845
2846         while (run_start < range) {
2847             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2848             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2849                               (run_end - run_start) << TARGET_PAGE_BITS);
2850             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2851         }
2852     }
2853 }
2854
2855 /**
2856  * postcopy_send_discard_bm_ram: discard a RAMBlock
2857  *
2858  * Returns zero on success
2859  *
2860  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2861  * Note: At this point the 'unsentmap' is the processed bitmap combined
2862  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2863  *
2864  * @ms: current migration state
2865  * @block: RAMBlock to discard
2866  */
2867 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2868 {
2869     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2870     unsigned long current;
2871     unsigned long *unsentmap = block->unsentmap;
2872
2873     for (current = 0; current < end; ) {
2874         unsigned long one = find_next_bit(unsentmap, end, current);
2875         unsigned long zero, discard_length;
2876
2877         if (one >= end) {
2878             break;
2879         }
2880
2881         zero = find_next_zero_bit(unsentmap, end, one + 1);
2882
2883         if (zero >= end) {
2884             discard_length = end - one;
2885         } else {
2886             discard_length = zero - one;
2887         }
2888         postcopy_discard_send_range(ms, one, discard_length);
2889         current = one + discard_length;
2890     }
2891
2892     return 0;
2893 }
2894
2895 /**
2896  * postcopy_each_ram_send_discard: discard all RAMBlocks
2897  *
2898  * Returns 0 for success or negative for error
2899  *
2900  * Utility for the outgoing postcopy code.
2901  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2902  *   passing it bitmap indexes and name.
2903  * (qemu_ram_foreach_block ends up passing unscaled lengths
2904  *  which would mean postcopy code would have to deal with target page)
2905  *
2906  * @ms: current migration state
2907  */
2908 static int postcopy_each_ram_send_discard(MigrationState *ms)
2909 {
2910     struct RAMBlock *block;
2911     int ret;
2912
2913     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2914         postcopy_discard_send_init(ms, block->idstr);
2915
2916         /*
2917          * Postcopy sends chunks of bitmap over the wire, but it
2918          * just needs indexes at this point, avoids it having
2919          * target page specific code.
2920          */
2921         ret = postcopy_send_discard_bm_ram(ms, block);
2922         postcopy_discard_send_finish(ms);
2923         if (ret) {
2924             return ret;
2925         }
2926     }
2927
2928     return 0;
2929 }
2930
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * Every host page that contains the unaligned start or end of a run is
 * marked entirely unsent and entirely dirty, and (subject to the
 * condition documented in the body) a discard for that host page is
 * sent to the destination.  rs->migration_dirty_pages is increased by
 * the number of pages that become newly dirty.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages per host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {

        /*
         * If the run starts on a host page boundary its start needs no
         * fixup, so move run_start to the end of the run; the alignment
         * test below then looks at the end of the run instead.
         * If the run starts in the middle of a host page, run_start is
         * left alone and that host page is fixed up below.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            if (unsent_pass) {
                run_start = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            /* First target page of the host page that needs fixing */
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Continue the search from the following host page */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, fixup_start_addr, host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
3030
3031 /**
3032  * postcopy_chunk_hostpages: discard any partially sent host page
3033  *
3034  * Utility for the outgoing postcopy code.
3035  *
3036  * Discard any partially sent host-page size chunks, mark any partially
3037  * dirty host-page size chunks as all dirty.  In this case the host-page
3038  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3039  *
3040  * Returns zero on success
3041  *
3042  * @ms: current migration state
3043  * @block: block we want to work with
3044  */
3045 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3046 {
3047     postcopy_discard_send_init(ms, block->idstr);
3048
3049     /* First pass: Discard all partially sent host pages */
3050     postcopy_chunk_hostpages_pass(ms, true, block);
3051     /*
3052      * Second pass: Ensure that all partially dirty host pages are made
3053      * fully dirty.
3054      */
3055     postcopy_chunk_hostpages_pass(ms, false, block);
3056
3057     postcopy_discard_send_finish(ms);
3058     return 0;
3059 }
3060
3061 /**
3062  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3063  *
3064  * Returns zero on success
3065  *
3066  * Transmit the set of pages to be discarded after precopy to the target
3067  * these are pages that:
3068  *     a) Have been previously transmitted but are now dirty again
3069  *     b) Pages that have never been transmitted, this ensures that
3070  *        any pages on the destination that have been mapped by background
3071  *        tasks get discarded (transparent huge pages is the specific concern)
3072  * Hopefully this is pretty sparse
3073  *
3074  * @ms: current migration state
3075  */
3076 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3077 {
3078     RAMState *rs = ram_state;
3079     RAMBlock *block;
3080     int ret;
3081
3082     rcu_read_lock();
3083
3084     /* This should be our last sync, the src is now paused */
3085     migration_bitmap_sync(rs);
3086
3087     /* Easiest way to make sure we don't resume in the middle of a host-page */
3088     rs->last_seen_block = NULL;
3089     rs->last_sent_block = NULL;
3090     rs->last_page = 0;
3091
3092     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3093         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3094         unsigned long *bitmap = block->bmap;
3095         unsigned long *unsentmap = block->unsentmap;
3096
3097         if (!unsentmap) {
3098             /* We don't have a safe way to resize the sentmap, so
3099              * if the bitmap was resized it will be NULL at this
3100              * point.
3101              */
3102             error_report("migration ram resized during precopy phase");
3103             rcu_read_unlock();
3104             return -EINVAL;
3105         }
3106         /* Deal with TPS != HPS and huge pages */
3107         ret = postcopy_chunk_hostpages(ms, block);
3108         if (ret) {
3109             rcu_read_unlock();
3110             return ret;
3111         }
3112
3113         /*
3114          * Update the unsentmap to be unsentmap = unsentmap | dirty
3115          */
3116         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3117 #ifdef DEBUG_POSTCOPY
3118         ram_debug_dump_bitmap(unsentmap, true, pages);
3119 #endif
3120     }
3121     trace_ram_postcopy_send_discard_bitmap();
3122
3123     ret = postcopy_each_ram_send_discard(ms);
3124     rcu_read_unlock();
3125
3126     return ret;
3127 }
3128
3129 /**
3130  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3131  *
3132  * Returns zero on success
3133  *
3134  * @rbname: name of the RAMBlock of the request. NULL means the
3135  *          same that last one.
3136  * @start: RAMBlock starting page
3137  * @length: RAMBlock size
3138  */
3139 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3140 {
3141     int ret = -1;
3142
3143     trace_ram_discard_range(rbname, start, length);
3144
3145     rcu_read_lock();
3146     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3147
3148     if (!rb) {
3149         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3150         goto err;
3151     }
3152
3153     /*
3154      * On source VM, we don't need to update the received bitmap since
3155      * we don't even have one.
3156      */
3157     if (rb->receivedmap) {
3158         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3159                      length >> qemu_target_page_bits());
3160     }
3161
3162     ret = ram_block_discard_range(rb, start, length);
3163
3164 err:
3165     rcu_read_unlock();
3166
3167     return ret;
3168 }
3169
3170 /*
3171  * For every allocation, we will try not to crash the VM if the
3172  * allocation failed.
3173  */
3174 static int xbzrle_init(void)
3175 {
3176     Error *local_err = NULL;
3177
3178     if (!migrate_use_xbzrle()) {
3179         return 0;
3180     }
3181
3182     XBZRLE_cache_lock();
3183
3184     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3185     if (!XBZRLE.zero_target_page) {
3186         error_report("%s: Error allocating zero page", __func__);
3187         goto err_out;
3188     }
3189
3190     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3191                               TARGET_PAGE_SIZE, &local_err);
3192     if (!XBZRLE.cache) {
3193         error_report_err(local_err);
3194         goto free_zero_page;
3195     }
3196
3197     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3198     if (!XBZRLE.encoded_buf) {
3199         error_report("%s: Error allocating encoded_buf", __func__);
3200         goto free_cache;
3201     }
3202
3203     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3204     if (!XBZRLE.current_buf) {
3205         error_report("%s: Error allocating current_buf", __func__);
3206         goto free_encoded_buf;
3207     }
3208
3209     /* We are all good */
3210     XBZRLE_cache_unlock();
3211     return 0;
3212
3213 free_encoded_buf:
3214     g_free(XBZRLE.encoded_buf);
3215     XBZRLE.encoded_buf = NULL;
3216 free_cache:
3217     cache_fini(XBZRLE.cache);
3218     XBZRLE.cache = NULL;
3219 free_zero_page:
3220     g_free(XBZRLE.zero_target_page);
3221     XBZRLE.zero_target_page = NULL;
3222 err_out:
3223     XBZRLE_cache_unlock();
3224     return -ENOMEM;
3225 }
3226
3227 static int ram_state_init(RAMState **rsp)
3228 {
3229     *rsp = g_try_new0(RAMState, 1);
3230
3231     if (!*rsp) {
3232         error_report("%s: Init ramstate fail", __func__);
3233         return -1;
3234     }
3235
3236     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3237     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3238     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3239
3240     /*
3241      * Count the total number of pages used by ram blocks not including any
3242      * gaps due to alignment or unplugs.
3243      * This must match with the initial values of dirty bitmap.
3244      */
3245     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3246     ram_state_reset(*rsp);
3247
3248     return 0;
3249 }
3250
3251 static void ram_list_init_bitmaps(void)
3252 {
3253     MigrationState *ms = migrate_get_current();
3254     RAMBlock *block;
3255     unsigned long pages;
3256     uint8_t shift;
3257
3258     /* Skip setting bitmap if there is no RAM */
3259     if (ram_bytes_total()) {
3260         shift = ms->clear_bitmap_shift;
3261         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3262             error_report("clear_bitmap_shift (%u) too big, using "
3263                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3264             shift = CLEAR_BITMAP_SHIFT_MAX;
3265         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3266             error_report("clear_bitmap_shift (%u) too small, using "
3267                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3268             shift = CLEAR_BITMAP_SHIFT_MIN;
3269         }
3270
3271         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3272             pages = block->max_length >> TARGET_PAGE_BITS;
3273             /*
3274              * The initial dirty bitmap for migration must be set with all
3275              * ones to make sure we'll migrate every guest RAM page to
3276              * destination.
3277              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3278              * new migration after a failed migration, ram_list.
3279              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3280              * guest memory.
3281              */
3282             block->bmap = bitmap_new(pages);
3283             bitmap_set(block->bmap, 0, pages);
3284             block->clear_bmap_shift = shift;
3285             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3286             if (migrate_postcopy_ram()) {
3287                 block->unsentmap = bitmap_new(pages);
3288                 bitmap_set(block->unsentmap, 0, pages);
3289             }
3290         }
3291     }
3292 }
3293
/*
 * Allocate the per-RAMBlock migration bitmaps, start global dirty
 * logging and run the first precopy bitmap sync for @rs.
 *
 * The iothread lock, the ramlist lock and the RCU read lock are taken
 * in that order around the three calls and released in reverse order.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3309
3310 static int ram_init_all(RAMState **rsp)
3311 {
3312     if (ram_state_init(rsp)) {
3313         return -1;
3314     }
3315
3316     if (xbzrle_init()) {
3317         ram_state_cleanup(rsp);
3318         return -1;
3319     }
3320
3321     ram_init_bitmaps(*rsp);
3322
3323     return 0;
3324 }
3325
3326 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3327 {
3328     RAMBlock *block;
3329     uint64_t pages = 0;
3330
3331     /*
3332      * Postcopy is not using xbzrle/compression, so no need for that.
3333      * Also, since source are already halted, we don't need to care
3334      * about dirty page logging as well.
3335      */
3336
3337     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3338         pages += bitmap_count_one(block->bmap,
3339                                   block->used_length >> TARGET_PAGE_BITS);
3340     }
3341
3342     /* This may not be aligned with current bitmaps. Recalculate. */
3343     rs->migration_dirty_pages = pages;
3344
3345     rs->last_seen_block = NULL;
3346     rs->last_sent_block = NULL;
3347     rs->last_page = 0;
3348     rs->last_version = ram_list.version;
3349     /*
3350      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3351      * matter what we have sent.
3352      */
3353     rs->ram_bulk_stage = false;
3354
3355     /* Update RAMState cache of output QEMUFile */
3356     rs->f = out;
3357
3358     trace_ram_state_resume_prepare(pages);
3359 }
3360
3361 /*
3362  * This function clears bits of the free pages reported by the caller from the
3363  * migration dirty bitmap. @addr is the host address corresponding to the
3364  * start of the continuous guest free pages, and @len is the total bytes of
3365  * those pages.
3366  */
3367 void qemu_guest_free_page_hint(void *addr, size_t len)
3368 {
3369     RAMBlock *block;
3370     ram_addr_t offset;
3371     size_t used_len, start, npages;
3372     MigrationState *s = migrate_get_current();
3373
3374     /* This function is currently expected to be used during live migration */
3375     if (!migration_is_setup_or_active(s->state)) {
3376         return;
3377     }
3378
3379     for (; len > 0; len -= used_len, addr += used_len) {
3380         block = qemu_ram_block_from_host(addr, false, &offset);
3381         if (unlikely(!block || offset >= block->used_length)) {
3382             /*
3383              * The implementation might not support RAMBlock resize during
3384              * live migration, but it could happen in theory with future
3385              * updates. So we add a check here to capture that case.
3386              */
3387             error_report_once("%s unexpected error", __func__);
3388             return;
3389         }
3390
3391         if (len <= block->used_length - offset) {
3392             used_len = len;
3393         } else {
3394             used_len = block->used_length - offset;
3395         }
3396
3397         start = offset >> TARGET_PAGE_BITS;
3398         npages = used_len >> TARGET_PAGE_BITS;
3399
3400         qemu_mutex_lock(&ram_state->bitmap_mutex);
3401         ram_state->migration_dirty_pages -=
3402                       bitmap_count_one_with_offset(block->bmap, start, npages);
3403         bitmap_clear(block->bmap, start, npages);
3404         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3405     }
3406 }
3407
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  If RCU reclamations in the code
 * become numerous, it will be necessary to reduce the granularity of
 * these critical sections.
 */
3414
3415 /**
3416  * ram_save_setup: Setup RAM for migration
3417  *
3418  * Returns zero to indicate success and negative for error
3419  *
3420  * @f: QEMUFile where to send the data
3421  * @opaque: RAMState pointer
3422  */
3423 static int ram_save_setup(QEMUFile *f, void *opaque)
3424 {
3425     RAMState **rsp = opaque;
3426     RAMBlock *block;
3427
3428     if (compress_threads_save_setup()) {
3429         return -1;
3430     }
3431
3432     /* migration has already setup the bitmap, reuse it. */
3433     if (!migration_in_colo_state()) {
3434         if (ram_init_all(rsp) != 0) {
3435             compress_threads_save_cleanup();
3436             return -1;
3437         }
3438     }
3439     (*rsp)->f = f;
3440
3441     rcu_read_lock();
3442
3443     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3444
3445     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3446         qemu_put_byte(f, strlen(block->idstr));
3447         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3448         qemu_put_be64(f, block->used_length);
3449         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3450             qemu_put_be64(f, block->page_size);
3451         }
3452         if (migrate_ignore_shared()) {
3453             qemu_put_be64(f, block->mr->addr);
3454         }
3455     }
3456
3457     rcu_read_unlock();
3458
3459     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3460     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3461
3462     multifd_send_sync_main(*rsp);
3463     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3464     qemu_fflush(f);
3465
3466     return 0;
3467 }
3468
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns a negative value if the QEMUFile is in error state at the end;
 * otherwise 1 if ram_find_and_save_block() reported that no pages are
 * left to send, and 0 if the loop stopped for any other reason (rate
 * limit, time budget, or skipped because of block-migration bulk phase)
 *
 * An EOS marker is always written, even when no RAM was sent.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    /* The block list changed since we last looked: restart the search. */
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    /*
     * Keep sending while the rate limit allows it; queued page requests
     * keep the loop going even once the rate limit has been reached.
     */
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }

        if (pages < 0) {
            /* Record the failure on the stream; it is returned below. */
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /*
         * Check the elapsed time on the very first iteration, in case it
         * was the first time and we had to sync the dirty bitmap.
         * qemu_clock_get_ns() is a bit expensive, so after that it is only
         * checked once every 64 iterations.
         */
        if ((i & 63) == 0) {
            /* Elapsed time since t0, converted from ns to ms */
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    multifd_send_sync_main(rs);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8-byte EOS marker just written. */
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3562
3563 /**
3564  * ram_save_complete: function called to send the remaining amount of ram
3565  *
3566  * Returns zero to indicate success or negative on error
3567  *
3568  * Called with iothread lock
3569  *
3570  * @f: QEMUFile where to send the data
3571  * @opaque: RAMState pointer
3572  */
3573 static int ram_save_complete(QEMUFile *f, void *opaque)
3574 {
3575     RAMState **temp = opaque;
3576     RAMState *rs = *temp;
3577     int ret = 0;
3578
3579     rcu_read_lock();
3580
3581     if (!migration_in_postcopy()) {
3582         migration_bitmap_sync_precopy(rs);
3583     }
3584
3585     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3586
3587     /* try transferring iterative blocks of memory */
3588
3589     /* flush all remaining blocks regardless of rate limiting */
3590     while (true) {
3591         int pages;
3592
3593         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3594         /* no more blocks to sent */
3595         if (pages == 0) {
3596             break;
3597         }
3598         if (pages < 0) {
3599             ret = pages;
3600             break;
3601         }
3602     }
3603
3604     flush_compressed_data(rs);
3605     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3606
3607     rcu_read_unlock();
3608
3609     multifd_send_sync_main(rs);
3610     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3611     qemu_fflush(f);
3612
3613     return ret;
3614 }
3615
3616 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3617                              uint64_t *res_precopy_only,
3618                              uint64_t *res_compatible,
3619                              uint64_t *res_postcopy_only)
3620 {
3621     RAMState **temp = opaque;
3622     RAMState *rs = *temp;
3623     uint64_t remaining_size;
3624
3625     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3626
3627     if (!migration_in_postcopy() &&
3628         remaining_size < max_size) {
3629         qemu_mutex_lock_iothread();
3630         rcu_read_lock();
3631         migration_bitmap_sync_precopy(rs);
3632         rcu_read_unlock();
3633         qemu_mutex_unlock_iothread();
3634         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3635     }
3636
3637     if (migrate_postcopy_ram()) {
3638         /* We can do postcopy, and all the data is postcopiable */
3639         *res_compatible += remaining_size;
3640     } else {
3641         *res_precopy_only += remaining_size;
3642     }
3643 }
3644
3645 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3646 {
3647     unsigned int xh_len;
3648     int xh_flags;
3649     uint8_t *loaded_data;
3650
3651     /* extract RLE header */
3652     xh_flags = qemu_get_byte(f);
3653     xh_len = qemu_get_be16(f);
3654
3655     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3656         error_report("Failed to load XBZRLE page - wrong compression!");
3657         return -1;
3658     }
3659
3660     if (xh_len > TARGET_PAGE_SIZE) {
3661         error_report("Failed to load XBZRLE page - len overflow!");
3662         return -1;
3663     }
3664     loaded_data = XBZRLE.decoded_buf;
3665     /* load data and decode */
3666     /* it can change loaded_data to point to an internal buffer */
3667     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3668
3669     /* decode RLE */
3670     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3671                              TARGET_PAGE_SIZE) == -1) {
3672         error_report("Failed to load XBZRLE page - decode error!");
3673         return -1;
3674     }
3675
3676     return 0;
3677 }
3678
3679 /**
3680  * ram_block_from_stream: read a RAMBlock id from the migration stream
3681  *
3682  * Must be called from within a rcu critical section.
3683  *
3684  * Returns a pointer from within the RCU-protected ram_list.
3685  *
3686  * @f: QEMUFile where to read the data from
3687  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3688  */
3689 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3690 {
3691     static RAMBlock *block = NULL;
3692     char id[256];
3693     uint8_t len;
3694
3695     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3696         if (!block) {
3697             error_report("Ack, bad migration stream!");
3698             return NULL;
3699         }
3700         return block;
3701     }
3702
3703     len = qemu_get_byte(f);
3704     qemu_get_buffer(f, (uint8_t *)id, len);
3705     id[len] = 0;
3706
3707     block = qemu_ram_block_by_name(id);
3708     if (!block) {
3709         error_report("Can't find block %s", id);
3710         return NULL;
3711     }
3712
3713     if (ramblock_is_ignored(block)) {
3714         error_report("block %s should not be migrated !", id);
3715         return NULL;
3716     }
3717
3718     return block;
3719 }
3720
3721 static inline void *host_from_ram_block_offset(RAMBlock *block,
3722                                                ram_addr_t offset)
3723 {
3724     if (!offset_in_ramblock(block, offset)) {
3725         return NULL;
3726     }
3727
3728     return block->host + offset;
3729 }
3730
3731 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3732                                                  ram_addr_t offset)
3733 {
3734     if (!offset_in_ramblock(block, offset)) {
3735         return NULL;
3736     }
3737     if (!block->colo_cache) {
3738         error_report("%s: colo_cache is NULL in block :%s",
3739                      __func__, block->idstr);
3740         return NULL;
3741     }
3742
3743     /*
3744     * During colo checkpoint, we need bitmap of these migrated pages.
3745     * It help us to decide which pages in ram cache should be flushed
3746     * into VM's RAM later.
3747     */
3748     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3749         ram_state->migration_dirty_pages++;
3750     }
3751     return block->colo_cache + offset;
3752 }
3753
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with a single byte value.  When
 * the value is zero and the range already reads as zero, the memory is
 * left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write when zeroing a range that is already zero */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3770
3771 /* return the size after decompression, or negative value on error */
3772 static int
3773 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3774                      const uint8_t *source, size_t source_len)
3775 {
3776     int err;
3777
3778     err = inflateReset(stream);
3779     if (err != Z_OK) {
3780         return -1;
3781     }
3782
3783     stream->avail_in = source_len;
3784     stream->next_in = (uint8_t *)source;
3785     stream->avail_out = dest_len;
3786     stream->next_out = dest;
3787
3788     err = inflate(stream, Z_NO_FLUSH);
3789     if (err != Z_STREAM_END) {
3790         return -1;
3791     }
3792
3793     return stream->total_out;
3794 }
3795
3796 static void *do_data_decompress(void *opaque)
3797 {
3798     DecompressParam *param = opaque;
3799     unsigned long pagesize;
3800     uint8_t *des;
3801     int len, ret;
3802
3803     qemu_mutex_lock(&param->mutex);
3804     while (!param->quit) {
3805         if (param->des) {
3806             des = param->des;
3807             len = param->len;
3808             param->des = 0;
3809             qemu_mutex_unlock(&param->mutex);
3810
3811             pagesize = TARGET_PAGE_SIZE;
3812
3813             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3814                                        param->compbuf, len);
3815             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3816                 error_report("decompress data failed");
3817                 qemu_file_set_error(decomp_file, ret);
3818             }
3819
3820             qemu_mutex_lock(&decomp_done_lock);
3821             param->done = true;
3822             qemu_cond_signal(&decomp_done_cond);
3823             qemu_mutex_unlock(&decomp_done_lock);
3824
3825             qemu_mutex_lock(&param->mutex);
3826         } else {
3827             qemu_cond_wait(&param->cond, &param->mutex);
3828         }
3829     }
3830     qemu_mutex_unlock(&param->mutex);
3831
3832     return NULL;
3833 }
3834
3835 static int wait_for_decompress_done(void)
3836 {
3837     int idx, thread_count;
3838
3839     if (!migrate_use_compression()) {
3840         return 0;
3841     }
3842
3843     thread_count = migrate_decompress_threads();
3844     qemu_mutex_lock(&decomp_done_lock);
3845     for (idx = 0; idx < thread_count; idx++) {
3846         while (!decomp_param[idx].done) {
3847             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3848         }
3849     }
3850     qemu_mutex_unlock(&decomp_done_lock);
3851     return qemu_file_get_error(decomp_file);
3852 }
3853
3854 static void compress_threads_load_cleanup(void)
3855 {
3856     int i, thread_count;
3857
3858     if (!migrate_use_compression()) {
3859         return;
3860     }
3861     thread_count = migrate_decompress_threads();
3862     for (i = 0; i < thread_count; i++) {
3863         /*
3864          * we use it as a indicator which shows if the thread is
3865          * properly init'd or not
3866          */
3867         if (!decomp_param[i].compbuf) {
3868             break;
3869         }
3870
3871         qemu_mutex_lock(&decomp_param[i].mutex);
3872         decomp_param[i].quit = true;
3873         qemu_cond_signal(&decomp_param[i].cond);
3874         qemu_mutex_unlock(&decomp_param[i].mutex);
3875     }
3876     for (i = 0; i < thread_count; i++) {
3877         if (!decomp_param[i].compbuf) {
3878             break;
3879         }
3880
3881         qemu_thread_join(decompress_threads + i);
3882         qemu_mutex_destroy(&decomp_param[i].mutex);
3883         qemu_cond_destroy(&decomp_param[i].cond);
3884         inflateEnd(&decomp_param[i].stream);
3885         g_free(decomp_param[i].compbuf);
3886         decomp_param[i].compbuf = NULL;
3887     }
3888     g_free(decompress_threads);
3889     g_free(decomp_param);
3890     decompress_threads = NULL;
3891     decomp_param = NULL;
3892     decomp_file = NULL;
3893 }
3894
3895 static int compress_threads_load_setup(QEMUFile *f)
3896 {
3897     int i, thread_count;
3898
3899     if (!migrate_use_compression()) {
3900         return 0;
3901     }
3902
3903     thread_count = migrate_decompress_threads();
3904     decompress_threads = g_new0(QemuThread, thread_count);
3905     decomp_param = g_new0(DecompressParam, thread_count);
3906     qemu_mutex_init(&decomp_done_lock);
3907     qemu_cond_init(&decomp_done_cond);
3908     decomp_file = f;
3909     for (i = 0; i < thread_count; i++) {
3910         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3911             goto exit;
3912         }
3913
3914         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3915         qemu_mutex_init(&decomp_param[i].mutex);
3916         qemu_cond_init(&decomp_param[i].cond);
3917         decomp_param[i].done = true;
3918         decomp_param[i].quit = false;
3919         qemu_thread_create(decompress_threads + i, "decompress",
3920                            do_data_decompress, decomp_param + i,
3921                            QEMU_THREAD_JOINABLE);
3922     }
3923     return 0;
3924 exit:
3925     compress_threads_load_cleanup();
3926     return -1;
3927 }
3928
3929 static void decompress_data_with_multi_threads(QEMUFile *f,
3930                                                void *host, int len)
3931 {
3932     int idx, thread_count;
3933
3934     thread_count = migrate_decompress_threads();
3935     qemu_mutex_lock(&decomp_done_lock);
3936     while (true) {
3937         for (idx = 0; idx < thread_count; idx++) {
3938             if (decomp_param[idx].done) {
3939                 decomp_param[idx].done = false;
3940                 qemu_mutex_lock(&decomp_param[idx].mutex);
3941                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3942                 decomp_param[idx].des = host;
3943                 decomp_param[idx].len = len;
3944                 qemu_cond_signal(&decomp_param[idx].cond);
3945                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3946                 break;
3947             }
3948         }
3949         if (idx < thread_count) {
3950             break;
3951         } else {
3952             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3953         }
3954     }
3955     qemu_mutex_unlock(&decomp_done_lock);
3956 }
3957
3958 /*
3959  * colo cache: this is for secondary VM, we cache the whole
3960  * memory of the secondary VM, it is need to hold the global lock
3961  * to call this helper.
3962  */
3963 int colo_init_ram_cache(void)
3964 {
3965     RAMBlock *block;
3966
3967     rcu_read_lock();
3968     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3969         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3970                                                 NULL,
3971                                                 false);
3972         if (!block->colo_cache) {
3973             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3974                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3975                          block->used_length);
3976             goto out_locked;
3977         }
3978         memcpy(block->colo_cache, block->host, block->used_length);
3979     }
3980     rcu_read_unlock();
3981     /*
3982     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3983     * with to decide which page in cache should be flushed into SVM's RAM. Here
3984     * we use the same name 'ram_bitmap' as for migration.
3985     */
3986     if (ram_bytes_total()) {
3987         RAMBlock *block;
3988
3989         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3990             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3991
3992             block->bmap = bitmap_new(pages);
3993             bitmap_set(block->bmap, 0, pages);
3994         }
3995     }
3996     ram_state = g_new0(RAMState, 1);
3997     ram_state->migration_dirty_pages = 0;
3998     qemu_mutex_init(&ram_state->bitmap_mutex);
3999     memory_global_dirty_log_start();
4000
4001     return 0;
4002
4003 out_locked:
4004
4005     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4006         if (block->colo_cache) {
4007             qemu_anon_ram_free(block->colo_cache, block->used_length);
4008             block->colo_cache = NULL;
4009         }
4010     }
4011
4012     rcu_read_unlock();
4013     return -errno;
4014 }
4015
4016 /* It is need to hold the global lock to call this helper */
4017 void colo_release_ram_cache(void)
4018 {
4019     RAMBlock *block;
4020
4021     memory_global_dirty_log_stop();
4022     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4023         g_free(block->bmap);
4024         block->bmap = NULL;
4025     }
4026
4027     rcu_read_lock();
4028
4029     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4030         if (block->colo_cache) {
4031             qemu_anon_ram_free(block->colo_cache, block->used_length);
4032             block->colo_cache = NULL;
4033         }
4034     }
4035
4036     rcu_read_unlock();
4037     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4038     g_free(ram_state);
4039     ram_state = NULL;
4040 }
4041
4042 /**
4043  * ram_load_setup: Setup RAM for migration incoming side
4044  *
4045  * Returns zero to indicate success and negative for error
4046  *
4047  * @f: QEMUFile where to receive the data
4048  * @opaque: RAMState pointer
4049  */
4050 static int ram_load_setup(QEMUFile *f, void *opaque)
4051 {
4052     if (compress_threads_load_setup(f)) {
4053         return -1;
4054     }
4055
4056     xbzrle_load_setup();
4057     ramblock_recv_map_init();
4058
4059     return 0;
4060 }
4061
4062 static int ram_load_cleanup(void *opaque)
4063 {
4064     RAMBlock *rb;
4065
4066     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4067         if (ramblock_is_pmem(rb)) {
4068             pmem_persist(rb->host, rb->used_length);
4069         }
4070     }
4071
4072     xbzrle_load_cleanup();
4073     compress_threads_load_cleanup();
4074
4075     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4076         g_free(rb->receivedmap);
4077         rb->receivedmap = NULL;
4078     }
4079
4080     return 0;
4081 }
4082
4083 /**
4084  * ram_postcopy_incoming_init: allocate postcopy data structures
4085  *
4086  * Returns 0 for success and negative if there was one error
4087  *
4088  * @mis: current migration incoming state
4089  *
4090  * Allocate data structures etc needed by incoming migration with
4091  * postcopy-ram. postcopy-ram's similarly names
4092  * postcopy_ram_incoming_init does the work.
4093  */
4094 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4095 {
4096     return postcopy_ram_incoming_init(mis);
4097 }
4098
4099 /**
4100  * ram_load_postcopy: load a page in postcopy case
4101  *
4102  * Returns 0 for success or -errno in case of error
4103  *
4104  * Called in postcopy mode by ram_load().
4105  * rcu_read_lock is taken prior to this being called.
4106  *
4107  * @f: QEMUFile where to send the data
4108  */
4109 static int ram_load_postcopy(QEMUFile *f)
4110 {
4111     int flags = 0, ret = 0;
4112     bool place_needed = false;
4113     bool matches_target_page_size = false;
4114     MigrationIncomingState *mis = migration_incoming_get_current();
4115     /* Temporary page that is later 'placed' */
4116     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4117     void *last_host = NULL;
4118     bool all_zero = false;
4119
4120     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4121         ram_addr_t addr;
4122         void *host = NULL;
4123         void *page_buffer = NULL;
4124         void *place_source = NULL;
4125         RAMBlock *block = NULL;
4126         uint8_t ch;
4127
4128         addr = qemu_get_be64(f);
4129
4130         /*
4131          * If qemu file error, we should stop here, and then "addr"
4132          * may be invalid
4133          */
4134         ret = qemu_file_get_error(f);
4135         if (ret) {
4136             break;
4137         }
4138
4139         flags = addr & ~TARGET_PAGE_MASK;
4140         addr &= TARGET_PAGE_MASK;
4141
4142         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4143         place_needed = false;
4144         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4145             block = ram_block_from_stream(f, flags);
4146
4147             host = host_from_ram_block_offset(block, addr);
4148             if (!host) {
4149                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4150                 ret = -EINVAL;
4151                 break;
4152             }
4153             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4154             /*
4155              * Postcopy requires that we place whole host pages atomically;
4156              * these may be huge pages for RAMBlocks that are backed by
4157              * hugetlbfs.
4158              * To make it atomic, the data is read into a temporary page
4159              * that's moved into place later.
4160              * The migration protocol uses,  possibly smaller, target-pages
4161              * however the source ensures it always sends all the components
4162              * of a host page in order.
4163              */
4164             page_buffer = postcopy_host_page +
4165                           ((uintptr_t)host & (block->page_size - 1));
4166             /* If all TP are zero then we can optimise the place */
4167             if (!((uintptr_t)host & (block->page_size - 1))) {
4168                 all_zero = true;
4169             } else {
4170                 /* not the 1st TP within the HP */
4171                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4172                     error_report("Non-sequential target page %p/%p",
4173                                   host, last_host);
4174                     ret = -EINVAL;
4175                     break;
4176                 }
4177             }
4178
4179
4180             /*
4181              * If it's the last part of a host page then we place the host
4182              * page
4183              */
4184             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4185                                      (block->page_size - 1)) == 0;
4186             place_source = postcopy_host_page;
4187         }
4188         last_host = host;
4189
4190         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4191         case RAM_SAVE_FLAG_ZERO:
4192             ch = qemu_get_byte(f);
4193             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4194             if (ch) {
4195                 all_zero = false;
4196             }
4197             break;
4198
4199         case RAM_SAVE_FLAG_PAGE:
4200             all_zero = false;
4201             if (!matches_target_page_size) {
4202                 /* For huge pages, we always use temporary buffer */
4203                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4204             } else {
4205                 /*
4206                  * For small pages that matches target page size, we
4207                  * avoid the qemu_file copy.  Instead we directly use
4208                  * the buffer of QEMUFile to place the page.  Note: we
4209                  * cannot do any QEMUFile operation before using that
4210                  * buffer to make sure the buffer is valid when
4211                  * placing the page.
4212                  */
4213                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4214                                          TARGET_PAGE_SIZE);
4215             }
4216             break;
4217         case RAM_SAVE_FLAG_EOS:
4218             /* normal exit */
4219             multifd_recv_sync_main();
4220             break;
4221         default:
4222             error_report("Unknown combination of migration flags: %#x"
4223                          " (postcopy mode)", flags);
4224             ret = -EINVAL;
4225             break;
4226         }
4227
4228         /* Detect for any possible file errors */
4229         if (!ret && qemu_file_get_error(f)) {
4230             ret = qemu_file_get_error(f);
4231         }
4232
4233         if (!ret && place_needed) {
4234             /* This gets called at the last target page in the host page */
4235             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4236
4237             if (all_zero) {
4238                 ret = postcopy_place_page_zero(mis, place_dest,
4239                                                block);
4240             } else {
4241                 ret = postcopy_place_page(mis, place_dest,
4242                                           place_source, block);
4243             }
4244         }
4245     }
4246
4247     return ret;
4248 }
4249
4250 static bool postcopy_is_advised(void)
4251 {
4252     PostcopyState ps = postcopy_state_get();
4253     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4254 }
4255
4256 static bool postcopy_is_running(void)
4257 {
4258     PostcopyState ps = postcopy_state_get();
4259     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4260 }
4261
4262 /*
4263  * Flush content of RAM cache into SVM's memory.
4264  * Only flush the pages that be dirtied by PVM or SVM or both.
4265  */
4266 static void colo_flush_ram_cache(void)
4267 {
4268     RAMBlock *block = NULL;
4269     void *dst_host;
4270     void *src_host;
4271     unsigned long offset = 0;
4272
4273     memory_global_dirty_log_sync();
4274     rcu_read_lock();
4275     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4276         ramblock_sync_dirty_bitmap(ram_state, block);
4277     }
4278     rcu_read_unlock();
4279
4280     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4281     rcu_read_lock();
4282     block = QLIST_FIRST_RCU(&ram_list.blocks);
4283
4284     while (block) {
4285         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4286
4287         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4288             offset = 0;
4289             block = QLIST_NEXT_RCU(block, next);
4290         } else {
4291             migration_bitmap_clear_dirty(ram_state, block, offset);
4292             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4293             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4294             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4295         }
4296     }
4297
4298     rcu_read_unlock();
4299     trace_colo_flush_ram_cache_end();
4300 }
4301
4302 /**
4303  * ram_load_precopy: load pages in precopy case
4304  *
4305  * Returns 0 for success or -errno in case of error
4306  *
4307  * Called in precopy mode by ram_load().
4308  * rcu_read_lock is taken prior to this being called.
4309  *
4310  * @f: QEMUFile where to send the data
4311  */
4312 static int ram_load_precopy(QEMUFile *f)
4313 {
4314     int flags = 0, ret = 0, invalid_flags = 0, len = 0;
4315     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4316     bool postcopy_advised = postcopy_is_advised();
4317     if (!migrate_use_compression()) {
4318         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4319     }
4320
4321     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4322         ram_addr_t addr, total_ram_bytes;
4323         void *host = NULL;
4324         uint8_t ch;
4325
4326         addr = qemu_get_be64(f);
4327         flags = addr & ~TARGET_PAGE_MASK;
4328         addr &= TARGET_PAGE_MASK;
4329
4330         if (flags & invalid_flags) {
4331             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4332                 error_report("Received an unexpected compressed page");
4333             }
4334
4335             ret = -EINVAL;
4336             break;
4337         }
4338
4339         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4340                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4341             RAMBlock *block = ram_block_from_stream(f, flags);
4342
4343             /*
4344              * After going into COLO, we should load the Page into colo_cache.
4345              */
4346             if (migration_incoming_in_colo_state()) {
4347                 host = colo_cache_from_block_offset(block, addr);
4348             } else {
4349                 host = host_from_ram_block_offset(block, addr);
4350             }
4351             if (!host) {
4352                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4353                 ret = -EINVAL;
4354                 break;
4355             }
4356
4357             if (!migration_incoming_in_colo_state()) {
4358                 ramblock_recv_bitmap_set(block, host);
4359             }
4360
4361             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4362         }
4363
4364         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4365         case RAM_SAVE_FLAG_MEM_SIZE:
4366             /* Synchronize RAM block list */
4367             total_ram_bytes = addr;
4368             while (!ret && total_ram_bytes) {
4369                 RAMBlock *block;
4370                 char id[256];
4371                 ram_addr_t length;
4372
4373                 len = qemu_get_byte(f);
4374                 qemu_get_buffer(f, (uint8_t *)id, len);
4375                 id[len] = 0;
4376                 length = qemu_get_be64(f);
4377
4378                 block = qemu_ram_block_by_name(id);
4379                 if (block && !qemu_ram_is_migratable(block)) {
4380                     error_report("block %s should not be migrated !", id);
4381                     ret = -EINVAL;
4382                 } else if (block) {
4383                     if (length != block->used_length) {
4384                         Error *local_err = NULL;
4385
4386                         ret = qemu_ram_resize(block, length,
4387                                               &local_err);
4388                         if (local_err) {
4389                             error_report_err(local_err);
4390                         }
4391                     }
4392                     /* For postcopy we need to check hugepage sizes match */
4393                     if (postcopy_advised &&
4394                         block->page_size != qemu_host_page_size) {
4395                         uint64_t remote_page_size = qemu_get_be64(f);
4396                         if (remote_page_size != block->page_size) {
4397                             error_report("Mismatched RAM page size %s "
4398                                          "(local) %zd != %" PRId64,
4399                                          id, block->page_size,
4400                                          remote_page_size);
4401                             ret = -EINVAL;
4402                         }
4403                     }
4404                     if (migrate_ignore_shared()) {
4405                         hwaddr addr = qemu_get_be64(f);
4406                         if (ramblock_is_ignored(block) &&
4407                             block->mr->addr != addr) {
4408                             error_report("Mismatched GPAs for block %s "
4409                                          "%" PRId64 "!= %" PRId64,
4410                                          id, (uint64_t)addr,
4411                                          (uint64_t)block->mr->addr);
4412                             ret = -EINVAL;
4413                         }
4414                     }
4415                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4416                                           block->idstr);
4417                 } else {
4418                     error_report("Unknown ramblock \"%s\", cannot "
4419                                  "accept migration", id);
4420                     ret = -EINVAL;
4421                 }
4422
4423                 total_ram_bytes -= length;
4424             }
4425             break;
4426
4427         case RAM_SAVE_FLAG_ZERO:
4428             ch = qemu_get_byte(f);
4429             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4430             break;
4431
4432         case RAM_SAVE_FLAG_PAGE:
4433             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4434             break;
4435
4436         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4437             len = qemu_get_be32(f);
4438             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4439                 error_report("Invalid compressed data length: %d", len);
4440                 ret = -EINVAL;
4441                 break;
4442             }
4443             decompress_data_with_multi_threads(f, host, len);
4444             break;
4445
4446         case RAM_SAVE_FLAG_XBZRLE:
4447             if (load_xbzrle(f, addr, host) < 0) {
4448                 error_report("Failed to decompress XBZRLE page at "
4449                              RAM_ADDR_FMT, addr);
4450                 ret = -EINVAL;
4451                 break;
4452             }
4453             break;
4454         case RAM_SAVE_FLAG_EOS:
4455             /* normal exit */
4456             multifd_recv_sync_main();
4457             break;
4458         default:
4459             if (flags & RAM_SAVE_FLAG_HOOK) {
4460                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4461             } else {
4462                 error_report("Unknown combination of migration flags: %#x",
4463                              flags);
4464                 ret = -EINVAL;
4465             }
4466         }
4467         if (!ret) {
4468             ret = qemu_file_get_error(f);
4469         }
4470     }
4471
4472     return ret;
4473 }
4474
4475 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4476 {
4477     int ret = 0;
4478     static uint64_t seq_iter;
4479     /*
4480      * If system is running in postcopy mode, page inserts to host memory must
4481      * be atomic
4482      */
4483     bool postcopy_running = postcopy_is_running();
4484
4485     seq_iter++;
4486
4487     if (version_id != 4) {
4488         return -EINVAL;
4489     }
4490
4491     /*
4492      * This RCU critical section can be very long running.
4493      * When RCU reclaims in the code start to become numerous,
4494      * it will be necessary to reduce the granularity of this
4495      * critical section.
4496      */
4497     rcu_read_lock();
4498
4499     if (postcopy_running) {
4500         ret = ram_load_postcopy(f);
4501     } else {
4502         ret = ram_load_precopy(f);
4503     }
4504
4505     ret |= wait_for_decompress_done();
4506     rcu_read_unlock();
4507     trace_ram_load_complete(ret, seq_iter);
4508
4509     if (!ret  && migration_incoming_in_colo_state()) {
4510         colo_flush_ram_cache();
4511     }
4512     return ret;
4513 }
4514
4515 static bool ram_has_postcopy(void *opaque)
4516 {
4517     RAMBlock *rb;
4518     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4519         if (ramblock_is_pmem(rb)) {
4520             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4521                          "is not supported now!", rb->idstr, rb->host);
4522             return false;
4523         }
4524     }
4525
4526     return migrate_postcopy_ram();
4527 }
4528
4529 /* Sync all the dirty bitmap with destination VM.  */
4530 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4531 {
4532     RAMBlock *block;
4533     QEMUFile *file = s->to_dst_file;
4534     int ramblock_count = 0;
4535
4536     trace_ram_dirty_bitmap_sync_start();
4537
4538     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4539         qemu_savevm_send_recv_bitmap(file, block->idstr);
4540         trace_ram_dirty_bitmap_request(block->idstr);
4541         ramblock_count++;
4542     }
4543
4544     trace_ram_dirty_bitmap_sync_wait();
4545
4546     /* Wait until all the ramblocks' dirty bitmap synced */
4547     while (ramblock_count--) {
4548         qemu_sem_wait(&s->rp_state.rp_sem);
4549     }
4550
4551     trace_ram_dirty_bitmap_sync_complete();
4552
4553     return 0;
4554 }
4555
4556 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4557 {
4558     qemu_sem_post(&s->rp_state.rp_sem);
4559 }
4560
4561 /*
4562  * Read the received bitmap, revert it as the initial dirty bitmap.
4563  * This is only used when the postcopy migration is paused but wants
4564  * to resume from a middle point.
4565  */
4566 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4567 {
4568     int ret = -EINVAL;
4569     QEMUFile *file = s->rp_state.from_dst_file;
4570     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4571     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4572     uint64_t size, end_mark;
4573
4574     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4575
4576     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4577         error_report("%s: incorrect state %s", __func__,
4578                      MigrationStatus_str(s->state));
4579         return -EINVAL;
4580     }
4581
4582     /*
4583      * Note: see comments in ramblock_recv_bitmap_send() on why we
4584      * need the endianess convertion, and the paddings.
4585      */
4586     local_size = ROUND_UP(local_size, 8);
4587
4588     /* Add paddings */
4589     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4590
4591     size = qemu_get_be64(file);
4592
4593     /* The size of the bitmap should match with our ramblock */
4594     if (size != local_size) {
4595         error_report("%s: ramblock '%s' bitmap size mismatch "
4596                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4597                      block->idstr, size, local_size);
4598         ret = -EINVAL;
4599         goto out;
4600     }
4601
4602     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4603     end_mark = qemu_get_be64(file);
4604
4605     ret = qemu_file_get_error(file);
4606     if (ret || size != local_size) {
4607         error_report("%s: read bitmap failed for ramblock '%s': %d"
4608                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4609                      __func__, block->idstr, ret, local_size, size);
4610         ret = -EIO;
4611         goto out;
4612     }
4613
4614     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4615         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4616                      __func__, block->idstr, end_mark);
4617         ret = -EINVAL;
4618         goto out;
4619     }
4620
4621     /*
4622      * Endianess convertion. We are during postcopy (though paused).
4623      * The dirty bitmap won't change. We can directly modify it.
4624      */
4625     bitmap_from_le(block->bmap, le_bitmap, nbits);
4626
4627     /*
4628      * What we received is "received bitmap". Revert it as the initial
4629      * dirty bitmap for this ramblock.
4630      */
4631     bitmap_complement(block->bmap, block->bmap, nbits);
4632
4633     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4634
4635     /*
4636      * We succeeded to sync bitmap for current ramblock. If this is
4637      * the last one to sync, we need to notify the main send thread.
4638      */
4639     ram_dirty_bitmap_reload_notify(s);
4640
4641     ret = 0;
4642 out:
4643     g_free(le_bitmap);
4644     return ret;
4645 }
4646
4647 static int ram_resume_prepare(MigrationState *s, void *opaque)
4648 {
4649     RAMState *rs = *(RAMState **)opaque;
4650     int ret;
4651
4652     ret = ram_dirty_bitmap_sync_all(s, rs);
4653     if (ret) {
4654         return ret;
4655     }
4656
4657     ram_state_resume_prepare(rs, s->to_dst_file);
4658
4659     return 0;
4660 }
4661
4662 static SaveVMHandlers savevm_ram_handlers = {
4663     .save_setup = ram_save_setup,
4664     .save_live_iterate = ram_save_iterate,
4665     .save_live_complete_postcopy = ram_save_complete,
4666     .save_live_complete_precopy = ram_save_complete,
4667     .has_postcopy = ram_has_postcopy,
4668     .save_live_pending = ram_save_pending,
4669     .load_state = ram_load,
4670     .save_cleanup = ram_save_cleanup,
4671     .load_setup = ram_load_setup,
4672     .load_cleanup = ram_load_cleanup,
4673     .resume_prepare = ram_resume_prepare,
4674 };
4675
4676 void ram_mig_init(void)
4677 {
4678     qemu_mutex_init(&XBZRLE.lock);
4679     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4680 }