migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */

/* Flag bits used by the RAM save/restore code in this file. */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; the next free value is 0x100 */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
/* XBZRLE statistics counters (non-static, so visible outside this file). */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache plus the scratch buffers
 * used while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken by XBZRLE_cache_lock() / released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
 161 static bool ramblock_is_ignored(RAMBlock *block)
 162 {
 163     return !qemu_ram_is_migratable(block) ||
 164            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 165 }
 166
/*
 * Iterate over RAM blocks, skipping those for which
 * ramblock_is_ignored() is true.
 *
 * The "if (...) {} else" tail makes the statement that follows the
 * macro the else-branch, so it only runs for blocks that pass the filter.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Same idiom, but the only filter is qemu_ram_is_migratable(). */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/* Remove the unfiltered iterator so code below must use a filtered one. */
#undef RAMBLOCK_FOREACH
 177
 178 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 179 {
 180     RAMBlock *block;
 181     int ret = 0;
 182
 183     RCU_READ_LOCK_GUARD();
 184
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     return ret;
 192 }
 193
 194 static void ramblock_recv_map_init(void)
 195 {
 196     RAMBlock *rb;
 197
 198     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 199         assert(!rb->receivedmap);
 200         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 201     }
 202 }
 203
 204 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 205 {
 206     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 207                     rb->receivedmap);
 208 }
 209
 210 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 211 {
 212     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 213 }
 214
 215 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 216 {
 217     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 218 }
 219
 220 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 221                                     size_t nr)
 222 {
 223     bitmap_set_atomic(rb->receivedmap,
 224                       ramblock_recv_bitmap_offset(host_addr, rb),
 225                       nr);
 226 }
 227
 228 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 229
 230 /*
 231  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 232  *
 233  * Returns >0 if success with sent bytes, or <0 if error.
 234  */
 235 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 236                                   const char *block_name)
 237 {
 238     RAMBlock *block = qemu_ram_block_by_name(block_name);
 239     unsigned long *le_bitmap, nbits;
 240     uint64_t size;
 241
 242     if (!block) {
 243         error_report("%s: invalid block name: %s", __func__, block_name);
 244         return -1;
 245     }
 246
 247     nbits = block->used_length >> TARGET_PAGE_BITS;
 248
 249     /*
 250      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 251      * machines we may need 4 more bytes for padding (see below
 252      * comment). So extend it a bit before hand.
 253      */
 254     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 255
 256     /*
 257      * Always use little endian when sending the bitmap. This is
 258      * required that when source and destination VMs are not using the
 259      * same endianess. (Note: big endian won't work.)
 260      */
 261     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 262
 263     /* Size of the bitmap, in bytes */
 264     size = DIV_ROUND_UP(nbits, 8);
 265
 266     /*
 267      * size is always aligned to 8 bytes for 64bit machines, but it
 268      * may not be true for 32bit machines. We need this padding to
 269      * make sure the migration can survive even between 32bit and
 270      * 64bit machines.
 271      */
 272     size = ROUND_UP(size, 8);
 273
 274     qemu_put_be64(file, size);
 275     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 276     /*
 277      * Mark as an end, in case the middle part is screwed up due to
 278      * some "misterious" reason.
 279      */
 280     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 281     qemu_fflush(file);
 282
 283     g_free(le_bitmap);
 284
 285     if (qemu_file_get_error(file)) {
 286         return qemu_file_get_error(file);
 287     }
 288
 289     return size + sizeof(size);
 290 }
 291
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Instances are linked through next_req on
 * RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* block the request refers to */
    RAMBlock *rb;
    /* start of the requested range (presumably relative to rb - confirm) */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 303
/*
 * State of RAM for migration.
 *
 * A single instance is reachable through the file-local ram_state
 * pointer; helpers in this file treat a NULL pointer as "no state".
 */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 355
/* The RAM migration state; readers in this file check it for NULL. */
static RAMState *ram_state;

/* Notifiers registered through precopy_add_notifier(). */
static NotifierWithReturnList precopy_notifier_list;
 359
/* Initialise the (empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 364
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 369
/* Unregister @n; the list itself is not needed for removal. */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 void precopy_enable_free_page_optimization(void)
 385 {
 386     if (!ram_state) {
 387         return;
 388     }
 389
 390     ram_state->fpo_enabled = true;
 391 }
 392
 393 uint64_t ram_bytes_remaining(void)
 394 {
 395     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 396                        0;
 397 }
 398
/* Global RAM migration counters (non-static, so visible outside this file). */
MigrationStats ram_counters;

/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 411
/* Global compression counters (non-static, so visible outside this file). */
CompressionStats compression_counters;

/* Per-thread state of one compression worker (see do_data_compress()). */
struct CompressParam {
    /* set by the worker, under comp_done_lock, when a page is finished */
    bool done;
    /* set under mutex to ask the worker to exit */
    bool quit;
    /* result of the last do_compress_ram_page() call */
    bool zero_page;
    /* dummy QEMUFile the compressed data is written into */
    QEMUFile *file;
    /* protects block/offset/quit; cond is waited on with it held */
    QemuMutex mutex;
    QemuCond cond;
    /* pending job: non-NULL block means there is a page to compress */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 429
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using these fields is outside this chunk; the
 * field notes below are inferred from names and the parallel
 * CompressParam layout - confirm against the decompress thread.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination for the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 441
/* Arrays with one entry per compression thread. */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above. */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in the file; its bool result is stored as zero_page. */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 461
/*
 * Thread function of a compression worker.
 *
 * @opaque is the worker's CompressParam.  The worker loops until
 * param->quit is set: when param->block is non-NULL it takes the job,
 * compresses the page and reports completion; otherwise it sleeps on
 * param->cond.  Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Copy the job out and clear the slot while holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result and wake whoever waits on comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 495
/*
 * Stop and join the compression threads and release everything
 * compress_threads_save_setup() allocated.
 *
 * Does nothing when compression is not in use or nothing was set up.
 * Also used as the error path of setup, so it copes with a partially
 * initialised comp_param array.
 */
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as a indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        /* Ask the worker to leave its loop and wake it if it is waiting */
        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        /* Only tear down per-thread resources once the thread is gone */
        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    /* Reset the globals so a second call returns early */
    compress_threads = NULL;
    comp_param = NULL;
}
 534
/*
 * Allocate per-thread compression state and start the compression
 * threads.
 *
 * Returns 0 on success (including when compression is not in use) and
 * -1 on failure, after undoing any partial setup through
 * compress_threads_save_cleanup().
 */
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    /* Zeroed arrays: a NULL .file marks an entry as not yet initialised */
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            /*
             * Entry i has no .file yet, so cleanup stops before it;
             * its buffer therefore has to be released here.
             */
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}
 577
/* Multiple fd's */

/* Magic and version carried by every multifd message and checked on receive */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Bit in the packet flags field */
#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 587
/*
 * First message exchanged on a multifd channel.  magic and version are
 * sent big endian.  The struct is packed and written to the channel as
 * raw bytes.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;             /* channel number */
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 596
/*
 * Header describing a batch of pages.  Integer fields are converted to
 * big endian by multifd_send_fill_packet() and back by
 * multifd_recv_unfill_packet().
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of entries of offset[] that are in use */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* name (idstr) of the RAM block the offsets refer to */
    char ramblock[256];
    /* one offset per page, relative to the start of the RAM block */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 611
/*
 * A batch of pages belonging to one RAM block.  offset[] and iov[] are
 * parallel arrays of 'allocated' entries, of which the first 'used'
 * are valid.
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* block all pages of the batch belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 625
/* Per-channel state on the sending side. */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 666
/* Per-channel state on the receiving side. */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* packets received through this channel (NOTE(review): confirm) */
    uint64_t num_packets;
    /* pages received through this channel (NOTE(review): confirm) */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 703
 704 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 705 {
 706     MultiFDInit_t msg;
 707     int ret;
 708
 709     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 710     msg.version = cpu_to_be32(MULTIFD_VERSION);
 711     msg.id = p->id;
 712     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 713
 714     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 715     if (ret != 0) {
 716         return -1;
 717     }
 718     return 0;
 719 }
 720
 721 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 722 {
 723     MultiFDInit_t msg;
 724     int ret;
 725
 726     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 727     if (ret != 0) {
 728         return -1;
 729     }
 730
 731     msg.magic = be32_to_cpu(msg.magic);
 732     msg.version = be32_to_cpu(msg.version);
 733
 734     if (msg.magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet magic %x "
 736                    "expected %x", msg.magic, MULTIFD_MAGIC);
 737         return -1;
 738     }
 739
 740     if (msg.version != MULTIFD_VERSION) {
 741         error_setg(errp, "multifd: received packet version %d "
 742                    "expected %d", msg.version, MULTIFD_VERSION);
 743         return -1;
 744     }
 745
 746     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 747         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 748         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 749
 750         error_setg(errp, "multifd: received uuid '%s' and expected "
 751                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 752         g_free(uuid);
 753         g_free(msg_uuid);
 754         return -1;
 755     }
 756
 757     if (msg.id > migrate_multifd_channels()) {
 758         error_setg(errp, "multifd: received channel version %d "
 759                    "expected %d", msg.version, MULTIFD_VERSION);
 760         return -1;
 761     }
 762
 763     return msg.id;
 764 }
 765
 766 static MultiFDPages_t *multifd_pages_init(size_t size)
 767 {
 768     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 769
 770     pages->allocated = size;
 771     pages->iov = g_new0(struct iovec, size);
 772     pages->offset = g_new0(ram_addr_t, size);
 773
 774     return pages;
 775 }
 776
 777 static void multifd_pages_clear(MultiFDPages_t *pages)
 778 {
 779     pages->used = 0;
 780     pages->allocated = 0;
 781     pages->packet_num = 0;
 782     pages->block = NULL;
 783     g_free(pages->iov);
 784     pages->iov = NULL;
 785     g_free(pages->offset);
 786     pages->offset = NULL;
 787     g_free(pages);
 788 }
 789
 790 static void multifd_send_fill_packet(MultiFDSendParams *p)
 791 {
 792     MultiFDPacket_t *packet = p->packet;
 793     int i;
 794
 795     packet->flags = cpu_to_be32(p->flags);
 796     packet->pages_alloc = cpu_to_be32(p->pages->allocated);
 797     packet->pages_used = cpu_to_be32(p->pages->used);
 798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 799     packet->packet_num = cpu_to_be64(p->packet_num);
 800
 801     if (p->pages->block) {
 802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 803     }
 804
 805     for (i = 0; i < p->pages->used; i++) {
 806         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 807     }
 808 }
 809
 810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 811 {
 812     MultiFDPacket_t *packet = p->packet;
 813     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 814     RAMBlock *block;
 815     int i;
 816
 817     packet->magic = be32_to_cpu(packet->magic);
 818     if (packet->magic != MULTIFD_MAGIC) {
 819         error_setg(errp, "multifd: received packet "
 820                    "magic %x and expected magic %x",
 821                    packet->magic, MULTIFD_MAGIC);
 822         return -1;
 823     }
 824
 825     packet->version = be32_to_cpu(packet->version);
 826     if (packet->version != MULTIFD_VERSION) {
 827         error_setg(errp, "multifd: received packet "
 828                    "version %d and expected version %d",
 829                    packet->version, MULTIFD_VERSION);
 830         return -1;
 831     }
 832
 833     p->flags = be32_to_cpu(packet->flags);
 834
 835     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 836     /*
 837      * If we received a packet that is 100 times bigger than expected
 838      * just stop migration.  It is a magic number.
 839      */
 840     if (packet->pages_alloc > pages_max * 100) {
 841         error_setg(errp, "multifd: received packet "
 842                    "with size %d and expected a maximum size of %d",
 843                    packet->pages_alloc, pages_max * 100) ;
 844         return -1;
 845     }
 846     /*
 847      * We received a packet that is bigger than expected but inside
 848      * reasonable limits (see previous comment).  Just reallocate.
 849      */
 850     if (packet->pages_alloc > p->pages->allocated) {
 851         multifd_pages_clear(p->pages);
 852         p->pages = multifd_pages_init(packet->pages_alloc);
 853     }
 854
 855     p->pages->used = be32_to_cpu(packet->pages_used);
 856     if (p->pages->used > packet->pages_alloc) {
 857         error_setg(errp, "multifd: received packet "
 858                    "with %d pages and expected maximum pages are %d",
 859                    p->pages->used, packet->pages_alloc) ;
 860         return -1;
 861     }
 862
 863     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 864     p->packet_num = be64_to_cpu(packet->packet_num);
 865
 866     if (p->pages->used) {
 867         /* make sure that ramblock is 0 terminated */
 868         packet->ramblock[255] = 0;
 869         block = qemu_ram_block_by_name(packet->ramblock);
 870         if (!block) {
 871             error_setg(errp, "multifd: unknown ram block %s",
 872                        packet->ramblock);
 873             return -1;
 874         }
 875     }
 876
 877     for (i = 0; i < p->pages->used; i++) {
 878         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 879
 880         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 881             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 882                        " (max " RAM_ADDR_FMT ")",
 883                        offset, block->max_length);
 884             return -1;
 885         }
 886         p->pages->iov[i].iov_base = block->host + offset;
 887         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 888     }
 889
 890     return 0;
 891 }
 892
/* Global state of the multifd sending side. */
struct {
    /* per-channel parameters, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 902
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create one "pages" struct for each channel, plus a main one.  Each
 * time we need to send a batch of pages we exchange the main one with
 * the one of the channel that is going to send it.  There are two
 * reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs either to the migration
 * thread or to the channel thread.  Switching is safe because the
 * migration thread holds the channel mutex when changing it, and the
 * channel must have finished with its own struct, otherwise
 * pending_job can't be false.
 */
 920
 921 static int multifd_send_pages(RAMState *rs)
 922 {
 923     int i;
 924     static int next_channel;
 925     MultiFDSendParams *p = NULL; /* make happy gcc */
 926     MultiFDPages_t *pages = multifd_send_state->pages;
 927     uint64_t transferred;
 928
 929     qemu_sem_wait(&multifd_send_state->channels_ready);
 930     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 931         p = &multifd_send_state->params[i];
 932
 933         qemu_mutex_lock(&p->mutex);
 934         if (p->quit) {
 935             error_report("%s: channel %d has already quit!", __func__, i);
 936             qemu_mutex_unlock(&p->mutex);
 937             return -1;
 938         }
 939         if (!p->pending_job) {
 940             p->pending_job++;
 941             next_channel = (i + 1) % migrate_multifd_channels();
 942             break;
 943         }
 944         qemu_mutex_unlock(&p->mutex);
 945     }
 946     p->pages->used = 0;
 947
 948     p->packet_num = multifd_send_state->packet_num++;
 949     p->pages->block = NULL;
 950     multifd_send_state->pages = p->pages;
 951     p->pages = pages;
 952     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 953     qemu_file_update_transfer(rs->f, transferred);
 954     ram_counters.multifd_bytes += transferred;
 955     ram_counters.transferred += transferred;;
 956     qemu_mutex_unlock(&p->mutex);
 957     qemu_sem_post(&p->sem);
 958
 959     return 1;
 960 }
 961
 962 static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 963 {
 964     MultiFDPages_t *pages = multifd_send_state->pages;
 965
 966     if (!pages->block) {
 967         pages->block = block;
 968     }
 969
 970     if (pages->block == block) {
 971         pages->offset[pages->used] = offset;
 972         pages->iov[pages->used].iov_base = block->host + offset;
 973         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 974         pages->used++;
 975
 976         if (pages->used < pages->allocated) {
 977             return 1;
 978         }
 979     }
 980
 981     if (multifd_send_pages(rs) < 0) {
 982         return -1;
 983     }
 984
 985     if (pages->block != block) {
 986         return  multifd_queue_page(rs, block, offset);
 987     }
 988
 989     return 1;
 990 }
 991
 992 static void multifd_send_terminate_threads(Error *err)
 993 {
 994     int i;
 995
 996     trace_multifd_send_terminate_threads(err != NULL);
 997
 998     if (err) {
 999         MigrationState *s = migrate_get_current();
1000         migrate_set_error(s, err);
1001         if (s->state == MIGRATION_STATUS_SETUP ||
1002             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1003             s->state == MIGRATION_STATUS_DEVICE ||
1004             s->state == MIGRATION_STATUS_ACTIVE) {
1005             migrate_set_state(&s->state, s->state,
1006                               MIGRATION_STATUS_FAILED);
1007         }
1008     }
1009
1010     for (i = 0; i < migrate_multifd_channels(); i++) {
1011         MultiFDSendParams *p = &multifd_send_state->params[i];
1012
1013         qemu_mutex_lock(&p->mutex);
1014         p->quit = true;
1015         qemu_sem_post(&p->sem);
1016         qemu_mutex_unlock(&p->mutex);
1017     }
1018 }
1019
1020 void multifd_save_cleanup(void)
1021 {
1022     int i;
1023
1024     if (!migrate_use_multifd()) {
1025         return;
1026     }
1027     multifd_send_terminate_threads(NULL);
1028     for (i = 0; i < migrate_multifd_channels(); i++) {
1029         MultiFDSendParams *p = &multifd_send_state->params[i];
1030
1031         if (p->running) {
1032             qemu_thread_join(&p->thread);
1033         }
1034         socket_send_channel_destroy(p->c);
1035         p->c = NULL;
1036         qemu_mutex_destroy(&p->mutex);
1037         qemu_sem_destroy(&p->sem);
1038         qemu_sem_destroy(&p->sem_sync);
1039         g_free(p->name);
1040         p->name = NULL;
1041         multifd_pages_clear(p->pages);
1042         p->pages = NULL;
1043         p->packet_len = 0;
1044         g_free(p->packet);
1045         p->packet = NULL;
1046     }
1047     qemu_sem_destroy(&multifd_send_state->channels_ready);
1048     g_free(multifd_send_state->params);
1049     multifd_send_state->params = NULL;
1050     multifd_pages_clear(multifd_send_state->pages);
1051     multifd_send_state->pages = NULL;
1052     g_free(multifd_send_state);
1053     multifd_send_state = NULL;
1054 }
1055
1056 static void multifd_send_sync_main(RAMState *rs)
1057 {
1058     int i;
1059
1060     if (!migrate_use_multifd()) {
1061         return;
1062     }
1063     if (multifd_send_state->pages->used) {
1064         if (multifd_send_pages(rs) < 0) {
1065             error_report("%s: multifd_send_pages fail", __func__);
1066             return;
1067         }
1068     }
1069     for (i = 0; i < migrate_multifd_channels(); i++) {
1070         MultiFDSendParams *p = &multifd_send_state->params[i];
1071
1072         trace_multifd_send_sync_main_signal(p->id);
1073
1074         qemu_mutex_lock(&p->mutex);
1075
1076         if (p->quit) {
1077             error_report("%s: channel %d has already quit", __func__, i);
1078             qemu_mutex_unlock(&p->mutex);
1079             return;
1080         }
1081
1082         p->packet_num = multifd_send_state->packet_num++;
1083         p->flags |= MULTIFD_FLAG_SYNC;
1084         p->pending_job++;
1085         qemu_file_update_transfer(rs->f, p->packet_len);
1086         ram_counters.multifd_bytes += p->packet_len;
1087         ram_counters.transferred += p->packet_len;
1088         qemu_mutex_unlock(&p->mutex);
1089         qemu_sem_post(&p->sem);
1090     }
1091     for (i = 0; i < migrate_multifd_channels(); i++) {
1092         MultiFDSendParams *p = &multifd_send_state->params[i];
1093
1094         trace_multifd_send_sync_main_wait(p->id);
1095         qemu_sem_wait(&p->sem_sync);
1096     }
1097     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1098 }
1099
1100 static void *multifd_send_thread(void *opaque)
1101 {
1102     MultiFDSendParams *p = opaque;
1103     Error *local_err = NULL;
1104     int ret = 0;
1105     uint32_t flags = 0;
1106
1107     trace_multifd_send_thread_start(p->id);
1108     rcu_register_thread();
1109
1110     if (multifd_send_initial_packet(p, &local_err) < 0) {
1111         ret = -1;
1112         goto out;
1113     }
1114     /* initial packet */
1115     p->num_packets = 1;
1116
1117     while (true) {
1118         qemu_sem_wait(&p->sem);
1119         qemu_mutex_lock(&p->mutex);
1120
1121         if (p->pending_job) {
1122             uint32_t used = p->pages->used;
1123             uint64_t packet_num = p->packet_num;
1124             flags = p->flags;
1125
1126             p->next_packet_size = used * qemu_target_page_size();
1127             multifd_send_fill_packet(p);
1128             p->flags = 0;
1129             p->num_packets++;
1130             p->num_pages += used;
1131             qemu_mutex_unlock(&p->mutex);
1132
1133             trace_multifd_send(p->id, packet_num, used, flags,
1134                                p->next_packet_size);
1135
1136             ret = qio_channel_write_all(p->c, (void *)p->packet,
1137                                         p->packet_len, &local_err);
1138             if (ret != 0) {
1139                 break;
1140             }
1141
1142             if (used) {
1143                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1144                                              used, &local_err);
1145                 if (ret != 0) {
1146                     break;
1147                 }
1148             }
1149
1150             qemu_mutex_lock(&p->mutex);
1151             p->pending_job--;
1152             qemu_mutex_unlock(&p->mutex);
1153
1154             if (flags & MULTIFD_FLAG_SYNC) {
1155                 qemu_sem_post(&p->sem_sync);
1156             }
1157             qemu_sem_post(&multifd_send_state->channels_ready);
1158         } else if (p->quit) {
1159             qemu_mutex_unlock(&p->mutex);
1160             break;
1161         } else {
1162             qemu_mutex_unlock(&p->mutex);
1163             /* sometimes there are spurious wakeups */
1164         }
1165     }
1166
1167 out:
1168     if (local_err) {
1169         trace_multifd_send_error(p->id);
1170         multifd_send_terminate_threads(local_err);
1171     }
1172
1173     /*
1174      * Error happen, I will exit, but I can't just leave, tell
1175      * who pay attention to me.
1176      */
1177     if (ret != 0) {
1178         qemu_sem_post(&p->sem_sync);
1179         qemu_sem_post(&multifd_send_state->channels_ready);
1180     }
1181
1182     qemu_mutex_lock(&p->mutex);
1183     p->running = false;
1184     qemu_mutex_unlock(&p->mutex);
1185
1186     rcu_unregister_thread();
1187     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1188
1189     return NULL;
1190 }
1191
1192 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1193 {
1194     MultiFDSendParams *p = opaque;
1195     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1196     Error *local_err = NULL;
1197
1198     trace_multifd_new_send_channel_async(p->id);
1199     if (qio_task_propagate_error(task, &local_err)) {
1200         migrate_set_error(migrate_get_current(), local_err);
1201         multifd_save_cleanup();
1202     } else {
1203         p->c = QIO_CHANNEL(sioc);
1204         qio_channel_set_delay(p->c, false);
1205         p->running = true;
1206         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1207                            QEMU_THREAD_JOINABLE);
1208     }
1209 }
1210
1211 int multifd_save_setup(void)
1212 {
1213     int thread_count;
1214     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1215     uint8_t i;
1216
1217     if (!migrate_use_multifd()) {
1218         return 0;
1219     }
1220     thread_count = migrate_multifd_channels();
1221     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1222     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1223     multifd_send_state->pages = multifd_pages_init(page_count);
1224     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1225
1226     for (i = 0; i < thread_count; i++) {
1227         MultiFDSendParams *p = &multifd_send_state->params[i];
1228
1229         qemu_mutex_init(&p->mutex);
1230         qemu_sem_init(&p->sem, 0);
1231         qemu_sem_init(&p->sem_sync, 0);
1232         p->quit = false;
1233         p->pending_job = 0;
1234         p->id = i;
1235         p->pages = multifd_pages_init(page_count);
1236         p->packet_len = sizeof(MultiFDPacket_t)
1237                       + sizeof(ram_addr_t) * page_count;
1238         p->packet = g_malloc0(p->packet_len);
1239         p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1240         p->packet->version = cpu_to_be32(MULTIFD_VERSION);
1241         p->name = g_strdup_printf("multifdsend_%d", i);
1242         socket_send_channel_create(multifd_new_send_channel_async, p);
1243     }
1244     return 0;
1245 }
1246
/*
 * Global state of the multifd receive side.  The single instance is
 * allocated in multifd_load_setup() and released in
 * multifd_load_cleanup().
 */
struct {
    /* per-channel parameters, indexed by channel id */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1256
1257 static void multifd_recv_terminate_threads(Error *err)
1258 {
1259     int i;
1260
1261     trace_multifd_recv_terminate_threads(err != NULL);
1262
1263     if (err) {
1264         MigrationState *s = migrate_get_current();
1265         migrate_set_error(s, err);
1266         if (s->state == MIGRATION_STATUS_SETUP ||
1267             s->state == MIGRATION_STATUS_ACTIVE) {
1268             migrate_set_state(&s->state, s->state,
1269                               MIGRATION_STATUS_FAILED);
1270         }
1271     }
1272
1273     for (i = 0; i < migrate_multifd_channels(); i++) {
1274         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1275
1276         qemu_mutex_lock(&p->mutex);
1277         p->quit = true;
1278         /* We could arrive here for two reasons:
1279            - normal quit, i.e. everything went fine, just finished
1280            - error quit: We close the channels so the channel threads
1281              finish the qio_channel_read_all_eof() */
1282         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1283         qemu_mutex_unlock(&p->mutex);
1284     }
1285 }
1286
1287 int multifd_load_cleanup(Error **errp)
1288 {
1289     int i;
1290     int ret = 0;
1291
1292     if (!migrate_use_multifd()) {
1293         return 0;
1294     }
1295     multifd_recv_terminate_threads(NULL);
1296     for (i = 0; i < migrate_multifd_channels(); i++) {
1297         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1298
1299         if (p->running) {
1300             p->quit = true;
1301             /*
1302              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1303              * however try to wakeup it without harm in cleanup phase.
1304              */
1305             qemu_sem_post(&p->sem_sync);
1306             qemu_thread_join(&p->thread);
1307         }
1308         object_unref(OBJECT(p->c));
1309         p->c = NULL;
1310         qemu_mutex_destroy(&p->mutex);
1311         qemu_sem_destroy(&p->sem_sync);
1312         g_free(p->name);
1313         p->name = NULL;
1314         multifd_pages_clear(p->pages);
1315         p->pages = NULL;
1316         p->packet_len = 0;
1317         g_free(p->packet);
1318         p->packet = NULL;
1319     }
1320     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1321     g_free(multifd_recv_state->params);
1322     multifd_recv_state->params = NULL;
1323     g_free(multifd_recv_state);
1324     multifd_recv_state = NULL;
1325
1326     return ret;
1327 }
1328
1329 static void multifd_recv_sync_main(void)
1330 {
1331     int i;
1332
1333     if (!migrate_use_multifd()) {
1334         return;
1335     }
1336     for (i = 0; i < migrate_multifd_channels(); i++) {
1337         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1338
1339         trace_multifd_recv_sync_main_wait(p->id);
1340         qemu_sem_wait(&multifd_recv_state->sem_sync);
1341     }
1342     for (i = 0; i < migrate_multifd_channels(); i++) {
1343         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1344
1345         qemu_mutex_lock(&p->mutex);
1346         if (multifd_recv_state->packet_num < p->packet_num) {
1347             multifd_recv_state->packet_num = p->packet_num;
1348         }
1349         qemu_mutex_unlock(&p->mutex);
1350         trace_multifd_recv_sync_main_signal(p->id);
1351         qemu_sem_post(&p->sem_sync);
1352     }
1353     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1354 }
1355
1356 static void *multifd_recv_thread(void *opaque)
1357 {
1358     MultiFDRecvParams *p = opaque;
1359     Error *local_err = NULL;
1360     int ret;
1361
1362     trace_multifd_recv_thread_start(p->id);
1363     rcu_register_thread();
1364
1365     while (true) {
1366         uint32_t used;
1367         uint32_t flags;
1368
1369         if (p->quit) {
1370             break;
1371         }
1372
1373         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1374                                        p->packet_len, &local_err);
1375         if (ret == 0) {   /* EOF */
1376             break;
1377         }
1378         if (ret == -1) {   /* Error */
1379             break;
1380         }
1381
1382         qemu_mutex_lock(&p->mutex);
1383         ret = multifd_recv_unfill_packet(p, &local_err);
1384         if (ret) {
1385             qemu_mutex_unlock(&p->mutex);
1386             break;
1387         }
1388
1389         used = p->pages->used;
1390         flags = p->flags;
1391         trace_multifd_recv(p->id, p->packet_num, used, flags,
1392                            p->next_packet_size);
1393         p->num_packets++;
1394         p->num_pages += used;
1395         qemu_mutex_unlock(&p->mutex);
1396
1397         if (used) {
1398             ret = qio_channel_readv_all(p->c, p->pages->iov,
1399                                         used, &local_err);
1400             if (ret != 0) {
1401                 break;
1402             }
1403         }
1404
1405         if (flags & MULTIFD_FLAG_SYNC) {
1406             qemu_sem_post(&multifd_recv_state->sem_sync);
1407             qemu_sem_wait(&p->sem_sync);
1408         }
1409     }
1410
1411     if (local_err) {
1412         multifd_recv_terminate_threads(local_err);
1413     }
1414     qemu_mutex_lock(&p->mutex);
1415     p->running = false;
1416     qemu_mutex_unlock(&p->mutex);
1417
1418     rcu_unregister_thread();
1419     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1420
1421     return NULL;
1422 }
1423
1424 int multifd_load_setup(void)
1425 {
1426     int thread_count;
1427     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1428     uint8_t i;
1429
1430     if (!migrate_use_multifd()) {
1431         return 0;
1432     }
1433     thread_count = migrate_multifd_channels();
1434     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1435     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1436     atomic_set(&multifd_recv_state->count, 0);
1437     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1438
1439     for (i = 0; i < thread_count; i++) {
1440         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1441
1442         qemu_mutex_init(&p->mutex);
1443         qemu_sem_init(&p->sem_sync, 0);
1444         p->quit = false;
1445         p->id = i;
1446         p->pages = multifd_pages_init(page_count);
1447         p->packet_len = sizeof(MultiFDPacket_t)
1448                       + sizeof(ram_addr_t) * page_count;
1449         p->packet = g_malloc0(p->packet_len);
1450         p->name = g_strdup_printf("multifdrecv_%d", i);
1451     }
1452     return 0;
1453 }
1454
1455 bool multifd_recv_all_channels_created(void)
1456 {
1457     int thread_count = migrate_multifd_channels();
1458
1459     if (!migrate_use_multifd()) {
1460         return true;
1461     }
1462
1463     return thread_count == atomic_read(&multifd_recv_state->count);
1464 }
1465
1466 /*
1467  * Try to receive all multifd channels to get ready for the migration.
1468  * - Return true and do not set @errp when correctly receving all channels;
1469  * - Return false and do not set @errp when correctly receiving the current one;
1470  * - Return false and set @errp when failing to receive the current channel.
1471  */
1472 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1473 {
1474     MultiFDRecvParams *p;
1475     Error *local_err = NULL;
1476     int id;
1477
1478     id = multifd_recv_initial_packet(ioc, &local_err);
1479     if (id < 0) {
1480         multifd_recv_terminate_threads(local_err);
1481         error_propagate_prepend(errp, local_err,
1482                                 "failed to receive packet"
1483                                 " via multifd channel %d: ",
1484                                 atomic_read(&multifd_recv_state->count));
1485         return false;
1486     }
1487     trace_multifd_recv_new_channel(id);
1488
1489     p = &multifd_recv_state->params[id];
1490     if (p->c != NULL) {
1491         error_setg(&local_err, "multifd: received id '%d' already setup'",
1492                    id);
1493         multifd_recv_terminate_threads(local_err);
1494         error_propagate(errp, local_err);
1495         return false;
1496     }
1497     p->c = ioc;
1498     object_ref(OBJECT(ioc));
1499     /* initial packet */
1500     p->num_packets = 1;
1501
1502     p->running = true;
1503     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1504                        QEMU_THREAD_JOINABLE);
1505     atomic_inc(&multifd_recv_state->count);
1506     return atomic_read(&multifd_recv_state->count) ==
1507            migrate_multifd_channels();
1508 }
1509
1510 /**
1511  * save_page_header: write page header to wire
1512  *
1513  * If this is the 1st block, it also writes the block identification
1514  *
1515  * Returns the number of bytes written
1516  *
1517  * @f: QEMUFile where to send the data
1518  * @block: block that contains the page we want to send
1519  * @offset: offset inside the block for the page
1520  *          in the lower bits, it contains flags
1521  */
1522 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1523                                ram_addr_t offset)
1524 {
1525     size_t size, len;
1526
1527     if (block == rs->last_sent_block) {
1528         offset |= RAM_SAVE_FLAG_CONTINUE;
1529     }
1530     qemu_put_be64(f, offset);
1531     size = 8;
1532
1533     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1534         len = strlen(block->idstr);
1535         qemu_put_byte(f, len);
1536         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1537         size += 1 + len;
1538         rs->last_sent_block = block;
1539     }
1540     return size;
1541 }
1542
1543 /**
1544  * mig_throttle_guest_down: throotle down the guest
1545  *
1546  * Reduce amount of guest cpu execution to hopefully slow down memory
1547  * writes. If guest dirty memory rate is reduced below the rate at
1548  * which we can transfer pages to the destination then we should be
1549  * able to complete migration. Some workloads dirty memory way too
1550  * fast and will not effectively converge, even with auto-converge.
1551  */
1552 static void mig_throttle_guest_down(void)
1553 {
1554     MigrationState *s = migrate_get_current();
1555     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1556     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1557     int pct_max = s->parameters.max_cpu_throttle;
1558
1559     /* We have not started throttling yet. Let's start it. */
1560     if (!cpu_throttle_active()) {
1561         cpu_throttle_set(pct_initial);
1562     } else {
1563         /* Throttling already on, just increase the rate */
1564         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1565                          pct_max));
1566     }
1567 }
1568
1569 /**
1570  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1571  *
1572  * @rs: current RAM state
1573  * @current_addr: address for the zero page
1574  *
1575  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1576  * The important thing is that a stale (not-yet-0'd) page be replaced
1577  * by the new data.
1578  * As a bonus, if the page wasn't in the cache it gets added so that
1579  * when a small write is made into the 0'd page it gets XBZRLE sent.
1580  */
1581 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1582 {
1583     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1584         return;
1585     }
1586
1587     /* We don't care if this fails to allocate a new cache page
1588      * as long as it updated an old one */
1589     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1590                  ram_counters.dirty_sync_count);
1591 }
1592
1593 #define ENCODING_FLAG_XBZRLE 0x1
1594
1595 /**
1596  * save_xbzrle_page: compress and send current page
1597  *
1598  * Returns: 1 means that we wrote the page
1599  *          0 means that page is identical to the one already sent
1600  *          -1 means that xbzrle would be longer than normal
1601  *
1602  * @rs: current RAM state
1603  * @current_data: pointer to the address of the page contents
1604  * @current_addr: addr of the page
1605  * @block: block that contains the page we want to send
1606  * @offset: offset inside the block for the page
1607  * @last_stage: if we are at the completion stage
1608  */
1609 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1610                             ram_addr_t current_addr, RAMBlock *block,
1611                             ram_addr_t offset, bool last_stage)
1612 {
1613     int encoded_len = 0, bytes_xbzrle;
1614     uint8_t *prev_cached_page;
1615
1616     if (!cache_is_cached(XBZRLE.cache, current_addr,
1617                          ram_counters.dirty_sync_count)) {
1618         xbzrle_counters.cache_miss++;
1619         if (!last_stage) {
1620             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1621                              ram_counters.dirty_sync_count) == -1) {
1622                 return -1;
1623             } else {
1624                 /* update *current_data when the page has been
1625                    inserted into cache */
1626                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1627             }
1628         }
1629         return -1;
1630     }
1631
1632     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1633
1634     /* save current buffer into memory */
1635     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1636
1637     /* XBZRLE encoding (if there is no overflow) */
1638     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1639                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1640                                        TARGET_PAGE_SIZE);
1641
1642     /*
1643      * Update the cache contents, so that it corresponds to the data
1644      * sent, in all cases except where we skip the page.
1645      */
1646     if (!last_stage && encoded_len != 0) {
1647         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1648         /*
1649          * In the case where we couldn't compress, ensure that the caller
1650          * sends the data from the cache, since the guest might have
1651          * changed the RAM since we copied it.
1652          */
1653         *current_data = prev_cached_page;
1654     }
1655
1656     if (encoded_len == 0) {
1657         trace_save_xbzrle_page_skipping();
1658         return 0;
1659     } else if (encoded_len == -1) {
1660         trace_save_xbzrle_page_overflow();
1661         xbzrle_counters.overflow++;
1662         return -1;
1663     }
1664
1665     /* Send XBZRLE based compressed page */
1666     bytes_xbzrle = save_page_header(rs, rs->f, block,
1667                                     offset | RAM_SAVE_FLAG_XBZRLE);
1668     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1669     qemu_put_be16(rs->f, encoded_len);
1670     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1671     bytes_xbzrle += encoded_len + 1 + 2;
1672     xbzrle_counters.pages++;
1673     xbzrle_counters.bytes += bytes_xbzrle;
1674     ram_counters.transferred += bytes_xbzrle;
1675
1676     return 1;
1677 }
1678
1679 /**
1680  * migration_bitmap_find_dirty: find the next dirty page from start
1681  *
1682  * Returns the page offset within memory region of the start of a dirty page
1683  *
1684  * @rs: current RAM state
1685  * @rb: RAMBlock where to search for dirty pages
1686  * @start: page where we start the search
1687  */
1688 static inline
1689 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1690                                           unsigned long start)
1691 {
1692     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1693     unsigned long *bitmap = rb->bmap;
1694     unsigned long next;
1695
1696     if (ramblock_is_ignored(rb)) {
1697         return size;
1698     }
1699
1700     /*
1701      * When the free page optimization is enabled, we need to check the bitmap
1702      * to send the non-free pages rather than all the pages in the bulk stage.
1703      */
1704     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1705         next = start + 1;
1706     } else {
1707         next = find_next_bit(bitmap, size, start);
1708     }
1709
1710     return next;
1711 }
1712
1713 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1714                                                 RAMBlock *rb,
1715                                                 unsigned long page)
1716 {
1717     bool ret;
1718
1719     qemu_mutex_lock(&rs->bitmap_mutex);
1720
1721     /*
1722      * Clear dirty bitmap if needed.  This _must_ be called before we
1723      * send any of the page in the chunk because we need to make sure
1724      * we can capture further page content changes when we sync dirty
1725      * log the next time.  So as long as we are going to send any of
1726      * the page in the chunk we clear the remote dirty bitmap for all.
1727      * Clearing it earlier won't be a problem, but too late will.
1728      */
1729     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1730         uint8_t shift = rb->clear_bmap_shift;
1731         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1732         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1733
1734         /*
1735          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1736          * can make things easier sometimes since then start address
1737          * of the small chunk will always be 64 pages aligned so the
1738          * bitmap will always be aligned to unsigned long.  We should
1739          * even be able to remove this restriction but I'm simply
1740          * keeping it.
1741          */
1742         assert(shift >= 6);
1743         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1744         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1745     }
1746
1747     ret = test_and_clear_bit(page, rb->bmap);
1748
1749     if (ret) {
1750         rs->migration_dirty_pages--;
1751     }
1752     qemu_mutex_unlock(&rs->bitmap_mutex);
1753
1754     return ret;
1755 }
1756
1757 /* Called with RCU critical section */
1758 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1759 {
1760     rs->migration_dirty_pages +=
1761         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1762                                               &rs->num_dirty_pages_period);
1763 }
1764
1765 /**
1766  * ram_pagesize_summary: calculate all the pagesizes of a VM
1767  *
1768  * Returns a summary bitmap of the page sizes of all RAMBlocks
1769  *
1770  * For VMs with just normal pages this is equivalent to the host page
1771  * size. If it's got some huge pages then it's the OR of all the
1772  * different page sizes.
1773  */
1774 uint64_t ram_pagesize_summary(void)
1775 {
1776     RAMBlock *block;
1777     uint64_t summary = 0;
1778
1779     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1780         summary |= block->page_size;
1781     }
1782
1783     return summary;
1784 }
1785
1786 uint64_t ram_get_total_transferred_pages(void)
1787 {
1788     return  ram_counters.normal + ram_counters.duplicate +
1789                 compression_counters.pages + xbzrle_counters.pages;
1790 }
1791
1792 static void migration_update_rates(RAMState *rs, int64_t end_time)
1793 {
1794     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1795     double compressed_size;
1796
1797     /* calculate period counters */
1798     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1799                 / (end_time - rs->time_last_bitmap_sync);
1800
1801     if (!page_count) {
1802         return;
1803     }
1804
1805     if (migrate_use_xbzrle()) {
1806         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1807             rs->xbzrle_cache_miss_prev) / page_count;
1808         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1809     }
1810
1811     if (migrate_use_compression()) {
1812         compression_counters.busy_rate = (double)(compression_counters.busy -
1813             rs->compress_thread_busy_prev) / page_count;
1814         rs->compress_thread_busy_prev = compression_counters.busy;
1815
1816         compressed_size = compression_counters.compressed_size -
1817                           rs->compressed_size_prev;
1818         if (compressed_size) {
1819             double uncompressed_size = (compression_counters.pages -
1820                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1821
1822             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1823             compression_counters.compression_rate =
1824                                         uncompressed_size / compressed_size;
1825
1826             rs->compress_pages_prev = compression_counters.pages;
1827             rs->compressed_size_prev = compression_counters.compressed_size;
1828         }
1829     }
1830 }
1831
/*
 * Synchronize the dirty bitmaps of all non-ignored RAMBlocks and update
 * the period statistics.
 *
 * Bumps ram_counters.dirty_sync_count on every call.  Once more than one
 * second has elapsed since the period started, the auto-converge check is
 * run, the rates are refreshed and the period counters are reset.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* first sync: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* per-block bitmap sync runs under bitmap_mutex and the RCU read lock */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the bytes dirtied in this period exceed half
               of the approx. amount of bytes that got transferred since the
               last time we were in this routine. If that happens twice,
               start or increase throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1897
1898 static void migration_bitmap_sync_precopy(RAMState *rs)
1899 {
1900     Error *local_err = NULL;
1901
1902     /*
1903      * The current notifier usage is just an optimization to migration, so we
1904      * don't stop the normal migration process in the error case.
1905      */
1906     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1907         error_report_err(local_err);
1908     }
1909
1910     migration_bitmap_sync(rs);
1911
1912     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1913         error_report_err(local_err);
1914     }
1915 }
1916
1917 /**
1918  * save_zero_page_to_file: send the zero page to the file
1919  *
1920  * Returns the size of data written to the file, 0 means the page is not
1921  * a zero page
1922  *
1923  * @rs: current RAM state
1924  * @file: the file where the data is saved
1925  * @block: block that contains the page we want to send
1926  * @offset: offset inside the block for the page
1927  */
1928 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1929                                   RAMBlock *block, ram_addr_t offset)
1930 {
1931     uint8_t *p = block->host + offset;
1932     int len = 0;
1933
1934     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1935         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1936         qemu_put_byte(file, 0);
1937         len += 1;
1938     }
1939     return len;
1940 }
1941
1942 /**
1943  * save_zero_page: send the zero page to the stream
1944  *
1945  * Returns the number of pages written.
1946  *
1947  * @rs: current RAM state
1948  * @block: block that contains the page we want to send
1949  * @offset: offset inside the block for the page
1950  */
1951 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1952 {
1953     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1954
1955     if (len) {
1956         ram_counters.duplicate++;
1957         ram_counters.transferred += len;
1958         return 1;
1959     }
1960     return -1;
1961 }
1962
1963 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1964 {
1965     if (!migrate_release_ram() || !migration_in_postcopy()) {
1966         return;
1967     }
1968
1969     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1970 }
1971
1972 /*
1973  * @pages: the number of pages written by the control path,
1974  *        < 0 - error
1975  *        > 0 - number of pages written
1976  *
1977  * Return true if the pages has been saved, otherwise false is returned.
1978  */
1979 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1980                               int *pages)
1981 {
1982     uint64_t bytes_xmit = 0;
1983     int ret;
1984
1985     *pages = -1;
1986     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1987                                 &bytes_xmit);
1988     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1989         return false;
1990     }
1991
1992     if (bytes_xmit) {
1993         ram_counters.transferred += bytes_xmit;
1994         *pages = 1;
1995     }
1996
1997     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1998         return true;
1999     }
2000
2001     if (bytes_xmit > 0) {
2002         ram_counters.normal++;
2003     } else if (bytes_xmit == 0) {
2004         ram_counters.duplicate++;
2005     }
2006
2007     return true;
2008 }
2009
2010 /*
2011  * directly send the page to the stream
2012  *
2013  * Returns the number of pages written.
2014  *
2015  * @rs: current RAM state
2016  * @block: block that contains the page we want to send
2017  * @offset: offset inside the block for the page
2018  * @buf: the page to be sent
2019  * @async: send to page asyncly
2020  */
2021 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2022                             uint8_t *buf, bool async)
2023 {
2024     ram_counters.transferred += save_page_header(rs, rs->f, block,
2025                                                  offset | RAM_SAVE_FLAG_PAGE);
2026     if (async) {
2027         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2028                               migrate_release_ram() &
2029                               migration_in_postcopy());
2030     } else {
2031         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2032     }
2033     ram_counters.transferred += TARGET_PAGE_SIZE;
2034     ram_counters.normal++;
2035     return 1;
2036 }
2037
2038 /**
2039  * ram_save_page: send the given page to the stream
2040  *
2041  * Returns the number of pages written.
2042  *          < 0 - error
2043  *          >=0 - Number of pages written - this might legally be 0
2044  *                if xbzrle noticed the page was the same.
2045  *
2046  * @rs: current RAM state
2047  * @block: block that contains the page we want to send
2048  * @offset: offset inside the block for the page
2049  * @last_stage: if we are at the completion stage
2050  */
2051 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2052 {
2053     int pages = -1;
2054     uint8_t *p;
2055     bool send_async = true;
2056     RAMBlock *block = pss->block;
2057     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2058     ram_addr_t current_addr = block->offset + offset;
2059
2060     p = block->host + offset;
2061     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2062
2063     XBZRLE_cache_lock();
2064     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2065         migrate_use_xbzrle()) {
2066         pages = save_xbzrle_page(rs, &p, current_addr, block,
2067                                  offset, last_stage);
2068         if (!last_stage) {
2069             /* Can't send this cached data async, since the cache page
2070              * might get updated before it gets to the wire
2071              */
2072             send_async = false;
2073         }
2074     }
2075
2076     /* XBZRLE overflow or normal page */
2077     if (pages == -1) {
2078         pages = save_normal_page(rs, block, offset, p, send_async);
2079     }
2080
2081     XBZRLE_cache_unlock();
2082
2083     return pages;
2084 }
2085
2086 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2087                                  ram_addr_t offset)
2088 {
2089     if (multifd_queue_page(rs, block, offset) < 0) {
2090         return -1;
2091     }
2092     ram_counters.normal++;
2093
2094     return 1;
2095 }
2096
2097 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2098                                  ram_addr_t offset, uint8_t *source_buf)
2099 {
2100     RAMState *rs = ram_state;
2101     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2102     bool zero_page = false;
2103     int ret;
2104
2105     if (save_zero_page_to_file(rs, f, block, offset)) {
2106         zero_page = true;
2107         goto exit;
2108     }
2109
2110     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2111
2112     /*
2113      * copy it to a internal buffer to avoid it being modified by VM
2114      * so that we can catch up the error during compression and
2115      * decompression
2116      */
2117     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2118     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2119     if (ret < 0) {
2120         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2121         error_report("compressed data failed!");
2122         return false;
2123     }
2124
2125 exit:
2126     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2127     return zero_page;
2128 }
2129
2130 static void
2131 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2132 {
2133     ram_counters.transferred += bytes_xmit;
2134
2135     if (param->zero_page) {
2136         ram_counters.duplicate++;
2137         return;
2138     }
2139
2140     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2141     compression_counters.compressed_size += bytes_xmit - 8;
2142     compression_counters.pages++;
2143 }
2144
/* defined further down; needed by flush_compressed_data() */
static bool save_page_use_compression(RAMState *rs);

/*
 * Wait for every compression thread to finish its current request and
 * move whatever each thread has buffered into the migration stream.
 *
 * Does nothing when save_page_use_compression() reports that compression
 * is not in use.
 *
 * @rs: current RAM state
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* first pass: block until every thread has flagged itself done */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* second pass: drain each thread's buffer, skipping quitting threads */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
2178
2179 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2180                                        ram_addr_t offset)
2181 {
2182     param->block = block;
2183     param->offset = offset;
2184 }
2185
/*
 * Hand one page over to an idle compression thread.
 *
 * The first thread found with its 'done' flag set gets the request: its
 * buffered output is first moved into rs->f and accounted, then it is
 * given the new block/offset and signalled.
 *
 * Returns 1 if the page was handed to a thread, -1 if no thread was idle
 * and migrate_compress_wait_thread() is false.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to compress
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* flush the output of the thread's previous request */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
2221
2222 /**
2223  * find_dirty_block: find the next dirty page and update any state
2224  * associated with the search process.
2225  *
2226  * Returns true if a page is found
2227  *
2228  * @rs: current RAM state
2229  * @pss: data about the state of the current dirty page scan
2230  * @again: set to false if the search has scanned the whole of RAM
2231  */
2232 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2233 {
2234     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2235     if (pss->complete_round && pss->block == rs->last_seen_block &&
2236         pss->page >= rs->last_page) {
2237         /*
2238          * We've been once around the RAM and haven't found anything.
2239          * Give up.
2240          */
2241         *again = false;
2242         return false;
2243     }
2244     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2245         /* Didn't find anything in this RAM Block */
2246         pss->page = 0;
2247         pss->block = QLIST_NEXT_RCU(pss->block, next);
2248         if (!pss->block) {
2249             /*
2250              * If memory migration starts over, we will meet a dirtied page
2251              * which may still exists in compression threads's ring, so we
2252              * should flush the compressed data to make sure the new page
2253              * is not overwritten by the old one in the destination.
2254              *
2255              * Also If xbzrle is on, stop using the data compression at this
2256              * point. In theory, xbzrle can do better than compression.
2257              */
2258             flush_compressed_data(rs);
2259
2260             /* Hit the end of the list */
2261             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2262             /* Flag that we've looped */
2263             pss->complete_round = true;
2264             rs->ram_bulk_stage = false;
2265         }
2266         /* Didn't find anything this time, but try again on the new block */
2267         *again = true;
2268         return false;
2269     } else {
2270         /* Can go around again, but... */
2271         *again = true;
2272         /* We've found something so probably don't need to */
2273         return true;
2274     }
2275 }
2276
2277 /**
2278  * unqueue_page: gets a page of the queue
2279  *
2280  * Helper for 'get_queued_page' - gets a page off the queue
2281  *
2282  * Returns the block of the page (or NULL if none available)
2283  *
2284  * @rs: current RAM state
2285  * @offset: used to return the offset within the RAMBlock
2286  */
2287 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2288 {
2289     RAMBlock *block = NULL;
2290
2291     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2292         return NULL;
2293     }
2294
2295     qemu_mutex_lock(&rs->src_page_req_mutex);
2296     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2297         struct RAMSrcPageRequest *entry =
2298                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2299         block = entry->rb;
2300         *offset = entry->offset;
2301
2302         if (entry->len > TARGET_PAGE_SIZE) {
2303             entry->len -= TARGET_PAGE_SIZE;
2304             entry->offset += TARGET_PAGE_SIZE;
2305         } else {
2306             memory_region_unref(block->mr);
2307             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2308             g_free(entry);
2309             migration_consume_urgent_request();
2310         }
2311     }
2312     qemu_mutex_unlock(&rs->src_page_req_mutex);
2313
2314     return block;
2315 }
2316
2317 /**
2318  * get_queued_page: unqueue a page from the postcopy requests
2319  *
2320  * Skips pages that are already sent (!dirty)
2321  *
2322  * Returns true if a queued page is found
2323  *
2324  * @rs: current RAM state
2325  * @pss: data about the state of the current dirty page scan
2326  */
2327 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2328 {
2329     RAMBlock  *block;
2330     ram_addr_t offset;
2331     bool dirty;
2332
2333     do {
2334         block = unqueue_page(rs, &offset);
2335         /*
2336          * We're sending this page, and since it's postcopy nothing else
2337          * will dirty it, and we must make sure it doesn't get sent again
2338          * even if this queue request was received after the background
2339          * search already sent it.
2340          */
2341         if (block) {
2342             unsigned long page;
2343
2344             page = offset >> TARGET_PAGE_BITS;
2345             dirty = test_bit(page, block->bmap);
2346             if (!dirty) {
2347                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2348                                                 page);
2349             } else {
2350                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2351             }
2352         }
2353
2354     } while (block && !dirty);
2355
2356     if (block) {
2357         /*
2358          * As soon as we start servicing pages out of order, then we have
2359          * to kill the bulk stage, since the bulk stage assumes
2360          * in (migration_bitmap_find_and_reset_dirty) that every page is
2361          * dirty, that's no longer true.
2362          */
2363         rs->ram_bulk_stage = false;
2364
2365         /*
2366          * We want the background search to continue from the queued page
2367          * since the guest is likely to want other pages near to the page
2368          * it just requested.
2369          */
2370         pss->block = block;
2371         pss->page = offset >> TARGET_PAGE_BITS;
2372
2373         /*
2374          * This unqueued page would break the "one round" check, even is
2375          * really rare.
2376          */
2377         pss->complete_round = false;
2378     }
2379
2380     return !!block;
2381 }
2382
2383 /**
2384  * migration_page_queue_free: drop any remaining pages in the ram
2385  * request queue
2386  *
2387  * It should be empty at the end anyway, but in error cases there may
2388  * be some left.  in case that there is any page left, we drop it.
2389  *
2390  */
2391 static void migration_page_queue_free(RAMState *rs)
2392 {
2393     struct RAMSrcPageRequest *mspr, *next_mspr;
2394     /* This queue generally should be empty - but in the case of a failed
2395      * migration might have some droppings in.
2396      */
2397     RCU_READ_LOCK_GUARD();
2398     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2399         memory_region_unref(mspr->rb->mr);
2400         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2401         g_free(mspr);
2402     }
2403 }
2404
2405 /**
2406  * ram_save_queue_pages: queue the page for transmission
2407  *
2408  * A request from postcopy destination for example.
2409  *
2410  * Returns zero on success or negative on error
2411  *
2412  * @rbname: Name of the RAMBLock of the request. NULL means the
2413  *          same that last one.
2414  * @start: starting address from the start of the RAMBlock
2415  * @len: length (in bytes) to send
2416  */
2417 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2418 {
2419     RAMBlock *ramblock;
2420     RAMState *rs = ram_state;
2421
2422     ram_counters.postcopy_requests++;
2423     RCU_READ_LOCK_GUARD();
2424
2425     if (!rbname) {
2426         /* Reuse last RAMBlock */
2427         ramblock = rs->last_req_rb;
2428
2429         if (!ramblock) {
2430             /*
2431              * Shouldn't happen, we can't reuse the last RAMBlock if
2432              * it's the 1st request.
2433              */
2434             error_report("ram_save_queue_pages no previous block");
2435             goto err;
2436         }
2437     } else {
2438         ramblock = qemu_ram_block_by_name(rbname);
2439
2440         if (!ramblock) {
2441             /* We shouldn't be asked for a non-existent RAMBlock */
2442             error_report("ram_save_queue_pages no block '%s'", rbname);
2443             goto err;
2444         }
2445         rs->last_req_rb = ramblock;
2446     }
2447     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2448     if (start+len > ramblock->used_length) {
2449         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2450                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2451                      __func__, start, len, ramblock->used_length);
2452         goto err;
2453     }
2454
2455     struct RAMSrcPageRequest *new_entry =
2456         g_malloc0(sizeof(struct RAMSrcPageRequest));
2457     new_entry->rb = ramblock;
2458     new_entry->offset = start;
2459     new_entry->len = len;
2460
2461     memory_region_ref(ramblock->mr);
2462     qemu_mutex_lock(&rs->src_page_req_mutex);
2463     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2464     migration_make_urgent_request();
2465     qemu_mutex_unlock(&rs->src_page_req_mutex);
2466
2467     return 0;
2468
2469 err:
2470     return -1;
2471 }
2472
2473 static bool save_page_use_compression(RAMState *rs)
2474 {
2475     if (!migrate_use_compression()) {
2476         return false;
2477     }
2478
2479     /*
2480      * If xbzrle is on, stop using the data compression after first
2481      * round of migration even if compression is enabled. In theory,
2482      * xbzrle can do better than compression.
2483      */
2484     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2485         return true;
2486     }
2487
2488     return false;
2489 }
2490
2491 /*
2492  * try to compress the page before posting it out, return true if the page
2493  * has been properly handled by compression, otherwise needs other
2494  * paths to handle it
2495  */
2496 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2497 {
2498     if (!save_page_use_compression(rs)) {
2499         return false;
2500     }
2501
2502     /*
2503      * When starting the process of a new block, the first page of
2504      * the block should be sent out before other pages in the same
2505      * block, and all the pages in last block should have been sent
2506      * out, keeping this order is important, because the 'cont' flag
2507      * is used to avoid resending the block name.
2508      *
2509      * We post the fist page as normal page as compression will take
2510      * much CPU resource.
2511      */
2512     if (block != rs->last_sent_block) {
2513         flush_compressed_data(rs);
2514         return false;
2515     }
2516
2517     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2518         return true;
2519     }
2520
2521     compression_counters.busy++;
2522     return false;
2523 }
2524
2525 /**
2526  * ram_save_target_page: save one target page
2527  *
2528  * Returns the number of pages written
2529  *
2530  * @rs: current RAM state
2531  * @pss: data about the page we want to send
2532  * @last_stage: if we are at the completion stage
2533  */
2534 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2535                                 bool last_stage)
2536 {
2537     RAMBlock *block = pss->block;
2538     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2539     int res;
2540
2541     if (control_save_page(rs, block, offset, &res)) {
2542         return res;
2543     }
2544
2545     if (save_compress_page(rs, block, offset)) {
2546         return 1;
2547     }
2548
2549     res = save_zero_page(rs, block, offset);
2550     if (res > 0) {
2551         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2552          * page would be stale
2553          */
2554         if (!save_page_use_compression(rs)) {
2555             XBZRLE_cache_lock();
2556             xbzrle_cache_zero_page(rs, block->offset + offset);
2557             XBZRLE_cache_unlock();
2558         }
2559         ram_release_pages(block->idstr, offset, res);
2560         return res;
2561     }
2562
2563     /*
2564      * do not use multifd for compression as the first page in the new
2565      * block should be posted out before sending the compressed page
2566      */
2567     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2568         return ram_save_multifd_page(rs, block, offset);
2569     }
2570
2571     return ram_save_page(rs, pss, last_stage);
2572 }
2573
2574 /**
2575  * ram_save_host_page: save a whole host page
2576  *
2577  * Starting at *offset send pages up to the end of the current host
2578  * page. It's valid for the initial offset to point into the middle of
2579  * a host page in which case the remainder of the hostpage is sent.
2580  * Only dirty target pages are sent. Note that the host page size may
2581  * be a huge page for this block.
2582  * The saving stops at the boundary of the used_length of the block
2583  * if the RAMBlock isn't a multiple of the host page size.
2584  *
2585  * Returns the number of pages written or negative on error
2586  *
2587  * @rs: current RAM state
2588  * @ms: current migration state
2589  * @pss: data about the page we want to send
2590  * @last_stage: if we are at the completion stage
2591  */
2592 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2593                               bool last_stage)
2594 {
2595     int tmppages, pages = 0;
2596     size_t pagesize_bits =
2597         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2598
2599     if (ramblock_is_ignored(pss->block)) {
2600         error_report("block %s should not be migrated !", pss->block->idstr);
2601         return 0;
2602     }
2603
2604     do {
2605         /* Check the pages is dirty and if it is send it */
2606         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2607             pss->page++;
2608             continue;
2609         }
2610
2611         tmppages = ram_save_target_page(rs, pss, last_stage);
2612         if (tmppages < 0) {
2613             return tmppages;
2614         }
2615
2616         pages += tmppages;
2617         pss->page++;
2618     } while ((pss->page & (pagesize_bits - 1)) &&
2619              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2620
2621     /* The offset we leave with is the last one we looked at */
2622     pss->page--;
2623     return pages;
2624 }
2625
2626 /**
2627  * ram_find_and_save_block: finds a dirty page and sends it to f
2628  *
2629  * Called within an RCU critical section.
2630  *
2631  * Returns the number of pages written where zero means no dirty pages,
2632  * or negative on error
2633  *
2634  * @rs: current RAM state
2635  * @last_stage: if we are at the completion stage
2636  *
2637  * On systems where host-page-size > target-page-size it will send all the
2638  * pages in a host page that are dirty.
2639  */
2640
2641 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2642 {
2643     PageSearchStatus pss;
2644     int pages = 0;
2645     bool again, found;
2646
2647     /* No dirty page as there is zero RAM */
2648     if (!ram_bytes_total()) {
2649         return pages;
2650     }
2651
2652     pss.block = rs->last_seen_block;
2653     pss.page = rs->last_page;
2654     pss.complete_round = false;
2655
2656     if (!pss.block) {
2657         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2658     }
2659
2660     do {
2661         again = true;
2662         found = get_queued_page(rs, &pss);
2663
2664         if (!found) {
2665             /* priority queue empty, so just search for something dirty */
2666             found = find_dirty_block(rs, &pss, &again);
2667         }
2668
2669         if (found) {
2670             pages = ram_save_host_page(rs, &pss, last_stage);
2671         }
2672     } while (!pages && again);
2673
2674     rs->last_seen_block = pss.block;
2675     rs->last_page = pss.page;
2676
2677     return pages;
2678 }
2679
2680 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2681 {
2682     uint64_t pages = size / TARGET_PAGE_SIZE;
2683
2684     if (zero) {
2685         ram_counters.duplicate += pages;
2686     } else {
2687         ram_counters.normal += pages;
2688         ram_counters.transferred += size;
2689         qemu_update_position(f, size);
2690     }
2691 }
2692
2693 static uint64_t ram_bytes_total_common(bool count_ignored)
2694 {
2695     RAMBlock *block;
2696     uint64_t total = 0;
2697
2698     RCU_READ_LOCK_GUARD();
2699
2700     if (count_ignored) {
2701         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2702             total += block->used_length;
2703         }
2704     } else {
2705         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2706             total += block->used_length;
2707         }
2708     }
2709     return total;
2710 }
2711
/* Total used length, in bytes, of all RAMBlocks that are not ignored. */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2716
2717 static void xbzrle_load_setup(void)
2718 {
2719     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2720 }
2721
2722 static void xbzrle_load_cleanup(void)
2723 {
2724     g_free(XBZRLE.decoded_buf);
2725     XBZRLE.decoded_buf = NULL;
2726 }
2727
2728 static void ram_state_cleanup(RAMState **rsp)
2729 {
2730     if (*rsp) {
2731         migration_page_queue_free(*rsp);
2732         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2733         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2734         g_free(*rsp);
2735         *rsp = NULL;
2736     }
2737 }
2738
2739 static void xbzrle_cleanup(void)
2740 {
2741     XBZRLE_cache_lock();
2742     if (XBZRLE.cache) {
2743         cache_fini(XBZRLE.cache);
2744         g_free(XBZRLE.encoded_buf);
2745         g_free(XBZRLE.current_buf);
2746         g_free(XBZRLE.zero_target_page);
2747         XBZRLE.cache = NULL;
2748         XBZRLE.encoded_buf = NULL;
2749         XBZRLE.current_buf = NULL;
2750         XBZRLE.zero_target_page = NULL;
2751     }
2752     XBZRLE_cache_unlock();
2753 }
2754
2755 static void ram_save_cleanup(void *opaque)
2756 {
2757     RAMState **rsp = opaque;
2758     RAMBlock *block;
2759
2760     /* caller have hold iothread lock or is in a bh, so there is
2761      * no writing race against the migration bitmap
2762      */
2763     memory_global_dirty_log_stop();
2764
2765     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2766         g_free(block->clear_bmap);
2767         block->clear_bmap = NULL;
2768         g_free(block->bmap);
2769         block->bmap = NULL;
2770     }
2771
2772     xbzrle_cleanup();
2773     compress_threads_save_cleanup();
2774     ram_state_cleanup(rsp);
2775 }
2776
2777 static void ram_state_reset(RAMState *rs)
2778 {
2779     rs->last_seen_block = NULL;
2780     rs->last_sent_block = NULL;
2781     rs->last_page = 0;
2782     rs->last_version = ram_list.version;
2783     rs->ram_bulk_stage = true;
2784     rs->fpo_enabled = false;
2785 }
2786
/*
 * Time budget in ms (half the buffered_file limit).  ram_save_iterate()
 * leaves its page-sending loop once the elapsed time exceeds this value.
 */
#define MAX_WAIT 50
2788
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr for debugging
 *
 * The bitmap is printed in lines of up to 128 bits, '1' for a set bit
 * and '.' for a clear one, each line prefixed with the index of its
 * first bit.  Lines whose bits all equal @expected are not printed.
 *
 * @todump: bitmap to dump; nothing is printed when it is NULL
 * @expected: the value you expect the bitmap mostly to be full of
 * @pages: number of bits of @todump to look at
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    char linebuf[129];
    /* One character per bit plus the terminating NUL */
    const uint64_t max_linelen = sizeof(linebuf) - 1;
    uint64_t cur;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += max_linelen) {
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        uint64_t remaining = pages - cur;
        uint64_t linelen = remaining < max_linelen ? remaining : max_linelen;
        uint64_t curb;
        bool found = false;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2822
2823 /* **** functions for postcopy ***** */
2824
2825 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2826 {
2827     struct RAMBlock *block;
2828
2829     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2830         unsigned long *bitmap = block->bmap;
2831         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2832         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2833
2834         while (run_start < range) {
2835             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2836             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2837                               (run_end - run_start) << TARGET_PAGE_BITS);
2838             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2839         }
2840     }
2841 }
2842
2843 /**
2844  * postcopy_send_discard_bm_ram: discard a RAMBlock
2845  *
2846  * Returns zero on success
2847  *
2848  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2849  *
2850  * @ms: current migration state
2851  * @block: RAMBlock to discard
2852  */
2853 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2854 {
2855     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2856     unsigned long current;
2857     unsigned long *bitmap = block->bmap;
2858
2859     for (current = 0; current < end; ) {
2860         unsigned long one = find_next_bit(bitmap, end, current);
2861         unsigned long zero, discard_length;
2862
2863         if (one >= end) {
2864             break;
2865         }
2866
2867         zero = find_next_zero_bit(bitmap, end, one + 1);
2868
2869         if (zero >= end) {
2870             discard_length = end - one;
2871         } else {
2872             discard_length = zero - one;
2873         }
2874         postcopy_discard_send_range(ms, one, discard_length);
2875         current = one + discard_length;
2876     }
2877
2878     return 0;
2879 }
2880
2881 /**
2882  * postcopy_each_ram_send_discard: discard all RAMBlocks
2883  *
2884  * Returns 0 for success or negative for error
2885  *
2886  * Utility for the outgoing postcopy code.
2887  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2888  *   passing it bitmap indexes and name.
2889  * (qemu_ram_foreach_block ends up passing unscaled lengths
2890  *  which would mean postcopy code would have to deal with target page)
2891  *
2892  * @ms: current migration state
2893  */
2894 static int postcopy_each_ram_send_discard(MigrationState *ms)
2895 {
2896     struct RAMBlock *block;
2897     int ret;
2898
2899     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2900         postcopy_discard_send_init(ms, block->idstr);
2901
2902         /*
2903          * Postcopy sends chunks of bitmap over the wire, but it
2904          * just needs indexes at this point, avoids it having
2905          * target page specific code.
2906          */
2907         ret = postcopy_send_discard_bm_ram(ms, block);
2908         postcopy_discard_send_finish(ms);
2909         if (ret) {
2910             return ret;
2911         }
2912     }
2913
2914     return 0;
2915 }
2916
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the block's bmap: when
 * a run of dirty pages starts or ends in the middle of a host page, every
 * target page of that host page is marked dirty, and each page that was
 * not dirty before is added to ram_state->migration_dirty_pages.
 *
 * @ms: current migration state (not referenced in the body)
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Number of target pages in one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    /* Number of target pages (bitmap bits) covered by used_length */
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * The dirty run starts on a host page boundary, so its start
         * needs no fixup; move on to look at where the run ends.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run (first clean page, or 'pages') */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and the check below fixes up that host page.
             */
        }

        /*
         * run_start is unaligned either because the run began in the
         * middle of a host page or because it ended in the middle of one.
         */
        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            /* First target page of the host page containing run_start */
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Continue the scan from the following host page */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
2983
2984 /**
2985  * postcopy_chunk_hostpages: discard any partially sent host page
2986  *
2987  * Utility for the outgoing postcopy code.
2988  *
2989  * Discard any partially sent host-page size chunks, mark any partially
2990  * dirty host-page size chunks as all dirty.  In this case the host-page
2991  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2992  *
2993  * Returns zero on success
2994  *
2995  * @ms: current migration state
2996  * @block: block we want to work with
2997  */
2998 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2999 {
3000     postcopy_discard_send_init(ms, block->idstr);
3001
3002     /*
3003      * Ensure that all partially dirty host pages are made fully dirty.
3004      */
3005     postcopy_chunk_hostpages_pass(ms, block);
3006
3007     postcopy_discard_send_finish(ms);
3008     return 0;
3009 }
3010
3011 /**
3012  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3013  *
3014  * Returns zero on success
3015  *
3016  * Transmit the set of pages to be discarded after precopy to the target
3017  * these are pages that:
3018  *     a) Have been previously transmitted but are now dirty again
3019  *     b) Pages that have never been transmitted, this ensures that
3020  *        any pages on the destination that have been mapped by background
3021  *        tasks get discarded (transparent huge pages is the specific concern)
3022  * Hopefully this is pretty sparse
3023  *
3024  * @ms: current migration state
3025  */
3026 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3027 {
3028     RAMState *rs = ram_state;
3029     RAMBlock *block;
3030     int ret;
3031
3032     RCU_READ_LOCK_GUARD();
3033
3034     /* This should be our last sync, the src is now paused */
3035     migration_bitmap_sync(rs);
3036
3037     /* Easiest way to make sure we don't resume in the middle of a host-page */
3038     rs->last_seen_block = NULL;
3039     rs->last_sent_block = NULL;
3040     rs->last_page = 0;
3041
3042     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3043         /* Deal with TPS != HPS and huge pages */
3044         ret = postcopy_chunk_hostpages(ms, block);
3045         if (ret) {
3046             return ret;
3047         }
3048
3049 #ifdef DEBUG_POSTCOPY
3050         ram_debug_dump_bitmap(block->bmap, true,
3051                               block->used_length >> TARGET_PAGE_BITS);
3052 #endif
3053     }
3054     trace_ram_postcopy_send_discard_bitmap();
3055
3056     ret = postcopy_each_ram_send_discard(ms);
3057
3058     return ret;
3059 }
3060
3061 /**
3062  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3063  *
3064  * Returns zero on success
3065  *
3066  * @rbname: name of the RAMBlock of the request. NULL means the
3067  *          same that last one.
3068  * @start: RAMBlock starting page
3069  * @length: RAMBlock size
3070  */
3071 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3072 {
3073     int ret = -1;
3074
3075     trace_ram_discard_range(rbname, start, length);
3076
3077     RCU_READ_LOCK_GUARD();
3078     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3079
3080     if (!rb) {
3081         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3082         goto err;
3083     }
3084
3085     /*
3086      * On source VM, we don't need to update the received bitmap since
3087      * we don't even have one.
3088      */
3089     if (rb->receivedmap) {
3090         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3091                      length >> qemu_target_page_bits());
3092     }
3093
3094     ret = ram_block_discard_range(rb, start, length);
3095
3096 err:
3097     return ret;
3098 }
3099
3100 /*
3101  * For every allocation, we will try not to crash the VM if the
3102  * allocation failed.
3103  */
3104 static int xbzrle_init(void)
3105 {
3106     Error *local_err = NULL;
3107
3108     if (!migrate_use_xbzrle()) {
3109         return 0;
3110     }
3111
3112     XBZRLE_cache_lock();
3113
3114     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3115     if (!XBZRLE.zero_target_page) {
3116         error_report("%s: Error allocating zero page", __func__);
3117         goto err_out;
3118     }
3119
3120     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3121                               TARGET_PAGE_SIZE, &local_err);
3122     if (!XBZRLE.cache) {
3123         error_report_err(local_err);
3124         goto free_zero_page;
3125     }
3126
3127     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3128     if (!XBZRLE.encoded_buf) {
3129         error_report("%s: Error allocating encoded_buf", __func__);
3130         goto free_cache;
3131     }
3132
3133     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3134     if (!XBZRLE.current_buf) {
3135         error_report("%s: Error allocating current_buf", __func__);
3136         goto free_encoded_buf;
3137     }
3138
3139     /* We are all good */
3140     XBZRLE_cache_unlock();
3141     return 0;
3142
3143 free_encoded_buf:
3144     g_free(XBZRLE.encoded_buf);
3145     XBZRLE.encoded_buf = NULL;
3146 free_cache:
3147     cache_fini(XBZRLE.cache);
3148     XBZRLE.cache = NULL;
3149 free_zero_page:
3150     g_free(XBZRLE.zero_target_page);
3151     XBZRLE.zero_target_page = NULL;
3152 err_out:
3153     XBZRLE_cache_unlock();
3154     return -ENOMEM;
3155 }
3156
3157 static int ram_state_init(RAMState **rsp)
3158 {
3159     *rsp = g_try_new0(RAMState, 1);
3160
3161     if (!*rsp) {
3162         error_report("%s: Init ramstate fail", __func__);
3163         return -1;
3164     }
3165
3166     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3167     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3168     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3169
3170     /*
3171      * Count the total number of pages used by ram blocks not including any
3172      * gaps due to alignment or unplugs.
3173      * This must match with the initial values of dirty bitmap.
3174      */
3175     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3176     ram_state_reset(*rsp);
3177
3178     return 0;
3179 }
3180
3181 static void ram_list_init_bitmaps(void)
3182 {
3183     MigrationState *ms = migrate_get_current();
3184     RAMBlock *block;
3185     unsigned long pages;
3186     uint8_t shift;
3187
3188     /* Skip setting bitmap if there is no RAM */
3189     if (ram_bytes_total()) {
3190         shift = ms->clear_bitmap_shift;
3191         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3192             error_report("clear_bitmap_shift (%u) too big, using "
3193                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3194             shift = CLEAR_BITMAP_SHIFT_MAX;
3195         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3196             error_report("clear_bitmap_shift (%u) too small, using "
3197                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3198             shift = CLEAR_BITMAP_SHIFT_MIN;
3199         }
3200
3201         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3202             pages = block->max_length >> TARGET_PAGE_BITS;
3203             /*
3204              * The initial dirty bitmap for migration must be set with all
3205              * ones to make sure we'll migrate every guest RAM page to
3206              * destination.
3207              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3208              * new migration after a failed migration, ram_list.
3209              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3210              * guest memory.
3211              */
3212             block->bmap = bitmap_new(pages);
3213             bitmap_set(block->bmap, 0, pages);
3214             block->clear_bmap_shift = shift;
3215             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3216         }
3217     }
3218 }
3219
/*
 * ram_init_bitmaps: set up the migration bitmaps and start dirty logging
 *
 * With the iothread lock and the ramlist lock held, and inside an RCU
 * read-side section, this allocates the per-RAMBlock bitmaps, starts
 * global dirty logging and runs a first precopy bitmap sync.
 *
 * @rs: RAMState handed to migration_bitmap_sync_precopy
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    /* Release the locks in the reverse order of acquisition */
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3234
3235 static int ram_init_all(RAMState **rsp)
3236 {
3237     if (ram_state_init(rsp)) {
3238         return -1;
3239     }
3240
3241     if (xbzrle_init()) {
3242         ram_state_cleanup(rsp);
3243         return -1;
3244     }
3245
3246     ram_init_bitmaps(*rsp);
3247
3248     return 0;
3249 }
3250
3251 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3252 {
3253     RAMBlock *block;
3254     uint64_t pages = 0;
3255
3256     /*
3257      * Postcopy is not using xbzrle/compression, so no need for that.
3258      * Also, since source are already halted, we don't need to care
3259      * about dirty page logging as well.
3260      */
3261
3262     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3263         pages += bitmap_count_one(block->bmap,
3264                                   block->used_length >> TARGET_PAGE_BITS);
3265     }
3266
3267     /* This may not be aligned with current bitmaps. Recalculate. */
3268     rs->migration_dirty_pages = pages;
3269
3270     rs->last_seen_block = NULL;
3271     rs->last_sent_block = NULL;
3272     rs->last_page = 0;
3273     rs->last_version = ram_list.version;
3274     /*
3275      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3276      * matter what we have sent.
3277      */
3278     rs->ram_bulk_stage = false;
3279
3280     /* Update RAMState cache of output QEMUFile */
3281     rs->f = out;
3282
3283     trace_ram_state_resume_prepare(pages);
3284 }
3285
3286 /*
3287  * This function clears bits of the free pages reported by the caller from the
3288  * migration dirty bitmap. @addr is the host address corresponding to the
3289  * start of the continuous guest free pages, and @len is the total bytes of
3290  * those pages.
3291  */
3292 void qemu_guest_free_page_hint(void *addr, size_t len)
3293 {
3294     RAMBlock *block;
3295     ram_addr_t offset;
3296     size_t used_len, start, npages;
3297     MigrationState *s = migrate_get_current();
3298
3299     /* This function is currently expected to be used during live migration */
3300     if (!migration_is_setup_or_active(s->state)) {
3301         return;
3302     }
3303
3304     for (; len > 0; len -= used_len, addr += used_len) {
3305         block = qemu_ram_block_from_host(addr, false, &offset);
3306         if (unlikely(!block || offset >= block->used_length)) {
3307             /*
3308              * The implementation might not support RAMBlock resize during
3309              * live migration, but it could happen in theory with future
3310              * updates. So we add a check here to capture that case.
3311              */
3312             error_report_once("%s unexpected error", __func__);
3313             return;
3314         }
3315
3316         if (len <= block->used_length - offset) {
3317             used_len = len;
3318         } else {
3319             used_len = block->used_length - offset;
3320         }
3321
3322         start = offset >> TARGET_PAGE_BITS;
3323         npages = used_len >> TARGET_PAGE_BITS;
3324
3325         qemu_mutex_lock(&ram_state->bitmap_mutex);
3326         ram_state->migration_dirty_pages -=
3327                       bitmap_count_one_with_offset(block->bmap, start, npages);
3328         bitmap_clear(block->bmap, start, npages);
3329         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3330     }
3331 }
3332
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3339
3340 /**
3341  * ram_save_setup: Setup RAM for migration
3342  *
3343  * Returns zero to indicate success and negative for error
3344  *
3345  * @f: QEMUFile where to send the data
3346  * @opaque: RAMState pointer
3347  */
3348 static int ram_save_setup(QEMUFile *f, void *opaque)
3349 {
3350     RAMState **rsp = opaque;
3351     RAMBlock *block;
3352
3353     if (compress_threads_save_setup()) {
3354         return -1;
3355     }
3356
3357     /* migration has already setup the bitmap, reuse it. */
3358     if (!migration_in_colo_state()) {
3359         if (ram_init_all(rsp) != 0) {
3360             compress_threads_save_cleanup();
3361             return -1;
3362         }
3363     }
3364     (*rsp)->f = f;
3365
3366     WITH_RCU_READ_LOCK_GUARD() {
3367         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3368
3369         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3370             qemu_put_byte(f, strlen(block->idstr));
3371             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3372             qemu_put_be64(f, block->used_length);
3373             if (migrate_postcopy_ram() && block->page_size !=
3374                                           qemu_host_page_size) {
3375                 qemu_put_be64(f, block->page_size);
3376             }
3377             if (migrate_ignore_shared()) {
3378                 qemu_put_be64(f, block->mr->addr);
3379             }
3380         }
3381     }
3382
3383     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3384     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3385
3386     multifd_send_sync_main(*rsp);
3387     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3388     qemu_fflush(f);
3389
3390     return 0;
3391 }
3392
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns a negative value when the QEMUFile is in error; otherwise 1
 * when ram_find_and_save_block found no more pages to send during this
 * call, and 0 when the loop stopped for another reason (rate limit,
 * time budget) or was skipped.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    /* Set to 1 once ram_find_and_save_block reports nothing left to send */
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM list changed since the last call: restart the scan */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep sending while the rate limit allows it, or while there
         * are still queued source page requests.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            /* A negative count is an error: record it on the file */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time since t0, converted from ns to ms */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    multifd_send_sync_main(rs);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8 bytes of the EOS marker written above */
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3488
3489 /**
3490  * ram_save_complete: function called to send the remaining amount of ram
3491  *
3492  * Returns zero to indicate success or negative on error
3493  *
3494  * Called with iothread lock
3495  *
3496  * @f: QEMUFile where to send the data
3497  * @opaque: RAMState pointer
3498  */
3499 static int ram_save_complete(QEMUFile *f, void *opaque)
3500 {
3501     RAMState **temp = opaque;
3502     RAMState *rs = *temp;
3503     int ret = 0;
3504
3505     WITH_RCU_READ_LOCK_GUARD() {
3506         if (!migration_in_postcopy()) {
3507             migration_bitmap_sync_precopy(rs);
3508         }
3509
3510         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3511
3512         /* try transferring iterative blocks of memory */
3513
3514         /* flush all remaining blocks regardless of rate limiting */
3515         while (true) {
3516             int pages;
3517
3518             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3519             /* no more blocks to sent */
3520             if (pages == 0) {
3521                 break;
3522             }
3523             if (pages < 0) {
3524                 ret = pages;
3525                 break;
3526             }
3527         }
3528
3529         flush_compressed_data(rs);
3530         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3531     }
3532
3533     multifd_send_sync_main(rs);
3534     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3535     qemu_fflush(f);
3536
3537     return ret;
3538 }
3539
3540 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3541                              uint64_t *res_precopy_only,
3542                              uint64_t *res_compatible,
3543                              uint64_t *res_postcopy_only)
3544 {
3545     RAMState **temp = opaque;
3546     RAMState *rs = *temp;
3547     uint64_t remaining_size;
3548
3549     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3550
3551     if (!migration_in_postcopy() &&
3552         remaining_size < max_size) {
3553         qemu_mutex_lock_iothread();
3554         WITH_RCU_READ_LOCK_GUARD() {
3555             migration_bitmap_sync_precopy(rs);
3556         }
3557         qemu_mutex_unlock_iothread();
3558         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3559     }
3560
3561     if (migrate_postcopy_ram()) {
3562         /* We can do postcopy, and all the data is postcopiable */
3563         *res_compatible += remaining_size;
3564     } else {
3565         *res_precopy_only += remaining_size;
3566     }
3567 }
3568
3569 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3570 {
3571     unsigned int xh_len;
3572     int xh_flags;
3573     uint8_t *loaded_data;
3574
3575     /* extract RLE header */
3576     xh_flags = qemu_get_byte(f);
3577     xh_len = qemu_get_be16(f);
3578
3579     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3580         error_report("Failed to load XBZRLE page - wrong compression!");
3581         return -1;
3582     }
3583
3584     if (xh_len > TARGET_PAGE_SIZE) {
3585         error_report("Failed to load XBZRLE page - len overflow!");
3586         return -1;
3587     }
3588     loaded_data = XBZRLE.decoded_buf;
3589     /* load data and decode */
3590     /* it can change loaded_data to point to an internal buffer */
3591     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3592
3593     /* decode RLE */
3594     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3595                              TARGET_PAGE_SIZE) == -1) {
3596         error_report("Failed to load XBZRLE page - decode error!");
3597         return -1;
3598     }
3599
3600     return 0;
3601 }
3602
3603 /**
3604  * ram_block_from_stream: read a RAMBlock id from the migration stream
3605  *
3606  * Must be called from within a rcu critical section.
3607  *
3608  * Returns a pointer from within the RCU-protected ram_list.
3609  *
3610  * @f: QEMUFile where to read the data from
3611  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3612  */
3613 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3614 {
3615     static RAMBlock *block = NULL;
3616     char id[256];
3617     uint8_t len;
3618
3619     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3620         if (!block) {
3621             error_report("Ack, bad migration stream!");
3622             return NULL;
3623         }
3624         return block;
3625     }
3626
3627     len = qemu_get_byte(f);
3628     qemu_get_buffer(f, (uint8_t *)id, len);
3629     id[len] = 0;
3630
3631     block = qemu_ram_block_by_name(id);
3632     if (!block) {
3633         error_report("Can't find block %s", id);
3634         return NULL;
3635     }
3636
3637     if (ramblock_is_ignored(block)) {
3638         error_report("block %s should not be migrated !", id);
3639         return NULL;
3640     }
3641
3642     return block;
3643 }
3644
3645 static inline void *host_from_ram_block_offset(RAMBlock *block,
3646                                                ram_addr_t offset)
3647 {
3648     if (!offset_in_ramblock(block, offset)) {
3649         return NULL;
3650     }
3651
3652     return block->host + offset;
3653 }
3654
3655 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3656                                                  ram_addr_t offset)
3657 {
3658     if (!offset_in_ramblock(block, offset)) {
3659         return NULL;
3660     }
3661     if (!block->colo_cache) {
3662         error_report("%s: colo_cache is NULL in block :%s",
3663                      __func__, block->idstr);
3664         return NULL;
3665     }
3666
3667     /*
3668     * During colo checkpoint, we need bitmap of these migrated pages.
3669     * It help us to decide which pages in ram cache should be flushed
3670     * into VM's RAM later.
3671     */
3672     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3673         ram_state->migration_dirty_pages++;
3674     }
3675     return block->colo_cache + offset;
3676 }
3677
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  The memory is left untouched
 * when @ch is zero and the range already reads as all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already zero-filled, nothing to write */
        return;
    }

    memset(host, ch, size);
}
3694
3695 /* return the size after decompression, or negative value on error */
3696 static int
3697 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3698                      const uint8_t *source, size_t source_len)
3699 {
3700     int err;
3701
3702     err = inflateReset(stream);
3703     if (err != Z_OK) {
3704         return -1;
3705     }
3706
3707     stream->avail_in = source_len;
3708     stream->next_in = (uint8_t *)source;
3709     stream->avail_out = dest_len;
3710     stream->next_out = dest;
3711
3712     err = inflate(stream, Z_NO_FLUSH);
3713     if (err != Z_STREAM_END) {
3714         return -1;
3715     }
3716
3717     return stream->total_out;
3718 }
3719
/*
 * do_data_decompress: body of a decompression worker thread
 *
 * Loops until param->quit is set.  Whenever param->des is non-NULL the
 * data in param->compbuf (param->len bytes) is inflated into it as one
 * target page, param->done is set and decomp_done_cond is signalled;
 * otherwise the thread sleeps on param->cond.
 *
 * @opaque: the DecompressParam of this thread
 *
 * Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the request and clear it before dropping the mutex */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* A failure is only reported when the error check is enabled */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Publish completion under decomp_done_lock and signal it */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take the mutex before testing param->quit again */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing to do: wait on param->cond (param->mutex is held) */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3758
3759 static int wait_for_decompress_done(void)
3760 {
3761     int idx, thread_count;
3762
3763     if (!migrate_use_compression()) {
3764         return 0;
3765     }
3766
3767     thread_count = migrate_decompress_threads();
3768     qemu_mutex_lock(&decomp_done_lock);
3769     for (idx = 0; idx < thread_count; idx++) {
3770         while (!decomp_param[idx].done) {
3771             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3772         }
3773     }
3774     qemu_mutex_unlock(&decomp_done_lock);
3775     return qemu_file_get_error(decomp_file);
3776 }
3777
3778 static void compress_threads_load_cleanup(void)
3779 {
3780     int i, thread_count;
3781
3782     if (!migrate_use_compression()) {
3783         return;
3784     }
3785     thread_count = migrate_decompress_threads();
3786     for (i = 0; i < thread_count; i++) {
3787         /*
3788          * we use it as a indicator which shows if the thread is
3789          * properly init'd or not
3790          */
3791         if (!decomp_param[i].compbuf) {
3792             break;
3793         }
3794
3795         qemu_mutex_lock(&decomp_param[i].mutex);
3796         decomp_param[i].quit = true;
3797         qemu_cond_signal(&decomp_param[i].cond);
3798         qemu_mutex_unlock(&decomp_param[i].mutex);
3799     }
3800     for (i = 0; i < thread_count; i++) {
3801         if (!decomp_param[i].compbuf) {
3802             break;
3803         }
3804
3805         qemu_thread_join(decompress_threads + i);
3806         qemu_mutex_destroy(&decomp_param[i].mutex);
3807         qemu_cond_destroy(&decomp_param[i].cond);
3808         inflateEnd(&decomp_param[i].stream);
3809         g_free(decomp_param[i].compbuf);
3810         decomp_param[i].compbuf = NULL;
3811     }
3812     g_free(decompress_threads);
3813     g_free(decomp_param);
3814     decompress_threads = NULL;
3815     decomp_param = NULL;
3816     decomp_file = NULL;
3817 }
3818
3819 static int compress_threads_load_setup(QEMUFile *f)
3820 {
3821     int i, thread_count;
3822
3823     if (!migrate_use_compression()) {
3824         return 0;
3825     }
3826
3827     thread_count = migrate_decompress_threads();
3828     decompress_threads = g_new0(QemuThread, thread_count);
3829     decomp_param = g_new0(DecompressParam, thread_count);
3830     qemu_mutex_init(&decomp_done_lock);
3831     qemu_cond_init(&decomp_done_cond);
3832     decomp_file = f;
3833     for (i = 0; i < thread_count; i++) {
3834         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3835             goto exit;
3836         }
3837
3838         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3839         qemu_mutex_init(&decomp_param[i].mutex);
3840         qemu_cond_init(&decomp_param[i].cond);
3841         decomp_param[i].done = true;
3842         decomp_param[i].quit = false;
3843         qemu_thread_create(decompress_threads + i, "decompress",
3844                            do_data_decompress, decomp_param + i,
3845                            QEMU_THREAD_JOINABLE);
3846     }
3847     return 0;
3848 exit:
3849     compress_threads_load_cleanup();
3850     return -1;
3851 }
3852
3853 static void decompress_data_with_multi_threads(QEMUFile *f,
3854                                                void *host, int len)
3855 {
3856     int idx, thread_count;
3857
3858     thread_count = migrate_decompress_threads();
3859     qemu_mutex_lock(&decomp_done_lock);
3860     while (true) {
3861         for (idx = 0; idx < thread_count; idx++) {
3862             if (decomp_param[idx].done) {
3863                 decomp_param[idx].done = false;
3864                 qemu_mutex_lock(&decomp_param[idx].mutex);
3865                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3866                 decomp_param[idx].des = host;
3867                 decomp_param[idx].len = len;
3868                 qemu_cond_signal(&decomp_param[idx].cond);
3869                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3870                 break;
3871             }
3872         }
3873         if (idx < thread_count) {
3874             break;
3875         } else {
3876             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3877         }
3878     }
3879     qemu_mutex_unlock(&decomp_done_lock);
3880 }
3881
3882 /*
3883  * colo cache: this is for secondary VM, we cache the whole
3884  * memory of the secondary VM, it is need to hold the global lock
3885  * to call this helper.
3886  */
3887 int colo_init_ram_cache(void)
3888 {
3889     RAMBlock *block;
3890
3891     rcu_read_lock();
3892     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3893         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3894                                                 NULL,
3895                                                 false);
3896         if (!block->colo_cache) {
3897             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3898                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3899                          block->used_length);
3900             RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3901                 if (block->colo_cache) {
3902                     qemu_anon_ram_free(block->colo_cache, block->used_length);
3903                     block->colo_cache = NULL;
3904                 }
3905             }
3906             return -errno;
3907         }
3908         memcpy(block->colo_cache, block->host, block->used_length);
3909     }
3910     rcu_read_unlock();
3911     /*
3912     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3913     * with to decide which page in cache should be flushed into SVM's RAM. Here
3914     * we use the same name 'ram_bitmap' as for migration.
3915     */
3916     if (ram_bytes_total()) {
3917         RAMBlock *block;
3918
3919         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3920             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3921
3922             block->bmap = bitmap_new(pages);
3923             bitmap_set(block->bmap, 0, pages);
3924         }
3925     }
3926     ram_state = g_new0(RAMState, 1);
3927     ram_state->migration_dirty_pages = 0;
3928     qemu_mutex_init(&ram_state->bitmap_mutex);
3929     memory_global_dirty_log_start();
3930
3931     return 0;
3932 }
3933
3934 /* It is need to hold the global lock to call this helper */
3935 void colo_release_ram_cache(void)
3936 {
3937     RAMBlock *block;
3938
3939     memory_global_dirty_log_stop();
3940     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3941         g_free(block->bmap);
3942         block->bmap = NULL;
3943     }
3944
3945     WITH_RCU_READ_LOCK_GUARD() {
3946         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3947             if (block->colo_cache) {
3948                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3949                 block->colo_cache = NULL;
3950             }
3951         }
3952     }
3953     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3954     g_free(ram_state);
3955     ram_state = NULL;
3956 }
3957
3958 /**
3959  * ram_load_setup: Setup RAM for migration incoming side
3960  *
3961  * Returns zero to indicate success and negative for error
3962  *
3963  * @f: QEMUFile where to receive the data
3964  * @opaque: RAMState pointer
3965  */
3966 static int ram_load_setup(QEMUFile *f, void *opaque)
3967 {
3968     if (compress_threads_load_setup(f)) {
3969         return -1;
3970     }
3971
3972     xbzrle_load_setup();
3973     ramblock_recv_map_init();
3974
3975     return 0;
3976 }
3977
3978 static int ram_load_cleanup(void *opaque)
3979 {
3980     RAMBlock *rb;
3981
3982     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3983         qemu_ram_block_writeback(rb);
3984     }
3985
3986     xbzrle_load_cleanup();
3987     compress_threads_load_cleanup();
3988
3989     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3990         g_free(rb->receivedmap);
3991         rb->receivedmap = NULL;
3992     }
3993
3994     return 0;
3995 }
3996
3997 /**
3998  * ram_postcopy_incoming_init: allocate postcopy data structures
3999  *
4000  * Returns 0 for success and negative if there was one error
4001  *
4002  * @mis: current migration incoming state
4003  *
4004  * Allocate data structures etc needed by incoming migration with
4005  * postcopy-ram. postcopy-ram's similarly names
4006  * postcopy_ram_incoming_init does the work.
4007  */
4008 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4009 {
4010     return postcopy_ram_incoming_init(mis);
4011 }
4012
4013 /**
4014  * ram_load_postcopy: load a page in postcopy case
4015  *
4016  * Returns 0 for success or -errno in case of error
4017  *
4018  * Called in postcopy mode by ram_load().
4019  * rcu_read_lock is taken prior to this being called.
4020  *
4021  * @f: QEMUFile where to send the data
4022  */
4023 static int ram_load_postcopy(QEMUFile *f)
4024 {
4025     int flags = 0, ret = 0;
4026     bool place_needed = false;
4027     bool matches_target_page_size = false;
4028     MigrationIncomingState *mis = migration_incoming_get_current();
4029     /* Temporary page that is later 'placed' */
4030     void *postcopy_host_page = mis->postcopy_tmp_page;
4031     void *last_host = NULL;
4032     bool all_zero = false;
4033
4034     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4035         ram_addr_t addr;
4036         void *host = NULL;
4037         void *page_buffer = NULL;
4038         void *place_source = NULL;
4039         RAMBlock *block = NULL;
4040         uint8_t ch;
4041
4042         addr = qemu_get_be64(f);
4043
4044         /*
4045          * If qemu file error, we should stop here, and then "addr"
4046          * may be invalid
4047          */
4048         ret = qemu_file_get_error(f);
4049         if (ret) {
4050             break;
4051         }
4052
4053         flags = addr & ~TARGET_PAGE_MASK;
4054         addr &= TARGET_PAGE_MASK;
4055
4056         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4057         place_needed = false;
4058         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4059             block = ram_block_from_stream(f, flags);
4060
4061             host = host_from_ram_block_offset(block, addr);
4062             if (!host) {
4063                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4064                 ret = -EINVAL;
4065                 break;
4066             }
4067             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4068             /*
4069              * Postcopy requires that we place whole host pages atomically;
4070              * these may be huge pages for RAMBlocks that are backed by
4071              * hugetlbfs.
4072              * To make it atomic, the data is read into a temporary page
4073              * that's moved into place later.
4074              * The migration protocol uses,  possibly smaller, target-pages
4075              * however the source ensures it always sends all the components
4076              * of a host page in order.
4077              */
4078             page_buffer = postcopy_host_page +
4079                           ((uintptr_t)host & (block->page_size - 1));
4080             /* If all TP are zero then we can optimise the place */
4081             if (!((uintptr_t)host & (block->page_size - 1))) {
4082                 all_zero = true;
4083             } else {
4084                 /* not the 1st TP within the HP */
4085                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4086                     error_report("Non-sequential target page %p/%p",
4087                                   host, last_host);
4088                     ret = -EINVAL;
4089                     break;
4090                 }
4091             }
4092
4093
4094             /*
4095              * If it's the last part of a host page then we place the host
4096              * page
4097              */
4098             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4099                                      (block->page_size - 1)) == 0;
4100             place_source = postcopy_host_page;
4101         }
4102         last_host = host;
4103
4104         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4105         case RAM_SAVE_FLAG_ZERO:
4106             ch = qemu_get_byte(f);
4107             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4108             if (ch) {
4109                 all_zero = false;
4110             }
4111             break;
4112
4113         case RAM_SAVE_FLAG_PAGE:
4114             all_zero = false;
4115             if (!matches_target_page_size) {
4116                 /* For huge pages, we always use temporary buffer */
4117                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4118             } else {
4119                 /*
4120                  * For small pages that matches target page size, we
4121                  * avoid the qemu_file copy.  Instead we directly use
4122                  * the buffer of QEMUFile to place the page.  Note: we
4123                  * cannot do any QEMUFile operation before using that
4124                  * buffer to make sure the buffer is valid when
4125                  * placing the page.
4126                  */
4127                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4128                                          TARGET_PAGE_SIZE);
4129             }
4130             break;
4131         case RAM_SAVE_FLAG_EOS:
4132             /* normal exit */
4133             multifd_recv_sync_main();
4134             break;
4135         default:
4136             error_report("Unknown combination of migration flags: %#x"
4137                          " (postcopy mode)", flags);
4138             ret = -EINVAL;
4139             break;
4140         }
4141
4142         /* Detect for any possible file errors */
4143         if (!ret && qemu_file_get_error(f)) {
4144             ret = qemu_file_get_error(f);
4145         }
4146
4147         if (!ret && place_needed) {
4148             /* This gets called at the last target page in the host page */
4149             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4150
4151             if (all_zero) {
4152                 ret = postcopy_place_page_zero(mis, place_dest,
4153                                                block);
4154             } else {
4155                 ret = postcopy_place_page(mis, place_dest,
4156                                           place_source, block);
4157             }
4158         }
4159     }
4160
4161     return ret;
4162 }
4163
4164 static bool postcopy_is_advised(void)
4165 {
4166     PostcopyState ps = postcopy_state_get();
4167     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4168 }
4169
4170 static bool postcopy_is_running(void)
4171 {
4172     PostcopyState ps = postcopy_state_get();
4173     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4174 }
4175
4176 /*
4177  * Flush content of RAM cache into SVM's memory.
4178  * Only flush the pages that be dirtied by PVM or SVM or both.
4179  */
4180 static void colo_flush_ram_cache(void)
4181 {
4182     RAMBlock *block = NULL;
4183     void *dst_host;
4184     void *src_host;
4185     unsigned long offset = 0;
4186
4187     memory_global_dirty_log_sync();
4188     WITH_RCU_READ_LOCK_GUARD() {
4189         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4190             ramblock_sync_dirty_bitmap(ram_state, block);
4191         }
4192     }
4193
4194     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4195     WITH_RCU_READ_LOCK_GUARD() {
4196         block = QLIST_FIRST_RCU(&ram_list.blocks);
4197
4198         while (block) {
4199             offset = migration_bitmap_find_dirty(ram_state, block, offset);
4200
4201             if (offset << TARGET_PAGE_BITS >= block->used_length) {
4202                 offset = 0;
4203                 block = QLIST_NEXT_RCU(block, next);
4204             } else {
4205                 migration_bitmap_clear_dirty(ram_state, block, offset);
4206                 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4207                 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4208                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4209             }
4210         }
4211     }
4212     trace_colo_flush_ram_cache_end();
4213 }
4214
4215 /**
4216  * ram_load_precopy: load pages in precopy case
4217  *
4218  * Returns 0 for success or -errno in case of error
4219  *
4220  * Called in precopy mode by ram_load().
4221  * rcu_read_lock is taken prior to this being called.
4222  *
4223  * @f: QEMUFile where to send the data
4224  */
4225 static int ram_load_precopy(QEMUFile *f)
4226 {
4227     int flags = 0, ret = 0, invalid_flags = 0, len = 0;
4228     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4229     bool postcopy_advised = postcopy_is_advised();
4230     if (!migrate_use_compression()) {
4231         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4232     }
4233
4234     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4235         ram_addr_t addr, total_ram_bytes;
4236         void *host = NULL;
4237         uint8_t ch;
4238
4239         addr = qemu_get_be64(f);
4240         flags = addr & ~TARGET_PAGE_MASK;
4241         addr &= TARGET_PAGE_MASK;
4242
4243         if (flags & invalid_flags) {
4244             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4245                 error_report("Received an unexpected compressed page");
4246             }
4247
4248             ret = -EINVAL;
4249             break;
4250         }
4251
4252         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4253                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4254             RAMBlock *block = ram_block_from_stream(f, flags);
4255
4256             /*
4257              * After going into COLO, we should load the Page into colo_cache.
4258              */
4259             if (migration_incoming_in_colo_state()) {
4260                 host = colo_cache_from_block_offset(block, addr);
4261             } else {
4262                 host = host_from_ram_block_offset(block, addr);
4263             }
4264             if (!host) {
4265                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4266                 ret = -EINVAL;
4267                 break;
4268             }
4269
4270             if (!migration_incoming_in_colo_state()) {
4271                 ramblock_recv_bitmap_set(block, host);
4272             }
4273
4274             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4275         }
4276
4277         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4278         case RAM_SAVE_FLAG_MEM_SIZE:
4279             /* Synchronize RAM block list */
4280             total_ram_bytes = addr;
4281             while (!ret && total_ram_bytes) {
4282                 RAMBlock *block;
4283                 char id[256];
4284                 ram_addr_t length;
4285
4286                 len = qemu_get_byte(f);
4287                 qemu_get_buffer(f, (uint8_t *)id, len);
4288                 id[len] = 0;
4289                 length = qemu_get_be64(f);
4290
4291                 block = qemu_ram_block_by_name(id);
4292                 if (block && !qemu_ram_is_migratable(block)) {
4293                     error_report("block %s should not be migrated !", id);
4294                     ret = -EINVAL;
4295                 } else if (block) {
4296                     if (length != block->used_length) {
4297                         Error *local_err = NULL;
4298
4299                         ret = qemu_ram_resize(block, length,
4300                                               &local_err);
4301                         if (local_err) {
4302                             error_report_err(local_err);
4303                         }
4304                     }
4305                     /* For postcopy we need to check hugepage sizes match */
4306                     if (postcopy_advised &&
4307                         block->page_size != qemu_host_page_size) {
4308                         uint64_t remote_page_size = qemu_get_be64(f);
4309                         if (remote_page_size != block->page_size) {
4310                             error_report("Mismatched RAM page size %s "
4311                                          "(local) %zd != %" PRId64,
4312                                          id, block->page_size,
4313                                          remote_page_size);
4314                             ret = -EINVAL;
4315                         }
4316                     }
4317                     if (migrate_ignore_shared()) {
4318                         hwaddr addr = qemu_get_be64(f);
4319                         if (ramblock_is_ignored(block) &&
4320                             block->mr->addr != addr) {
4321                             error_report("Mismatched GPAs for block %s "
4322                                          "%" PRId64 "!= %" PRId64,
4323                                          id, (uint64_t)addr,
4324                                          (uint64_t)block->mr->addr);
4325                             ret = -EINVAL;
4326                         }
4327                     }
4328                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4329                                           block->idstr);
4330                 } else {
4331                     error_report("Unknown ramblock \"%s\", cannot "
4332                                  "accept migration", id);
4333                     ret = -EINVAL;
4334                 }
4335
4336                 total_ram_bytes -= length;
4337             }
4338             break;
4339
4340         case RAM_SAVE_FLAG_ZERO:
4341             ch = qemu_get_byte(f);
4342             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4343             break;
4344
4345         case RAM_SAVE_FLAG_PAGE:
4346             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4347             break;
4348
4349         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4350             len = qemu_get_be32(f);
4351             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4352                 error_report("Invalid compressed data length: %d", len);
4353                 ret = -EINVAL;
4354                 break;
4355             }
4356             decompress_data_with_multi_threads(f, host, len);
4357             break;
4358
4359         case RAM_SAVE_FLAG_XBZRLE:
4360             if (load_xbzrle(f, addr, host) < 0) {
4361                 error_report("Failed to decompress XBZRLE page at "
4362                              RAM_ADDR_FMT, addr);
4363                 ret = -EINVAL;
4364                 break;
4365             }
4366             break;
4367         case RAM_SAVE_FLAG_EOS:
4368             /* normal exit */
4369             multifd_recv_sync_main();
4370             break;
4371         default:
4372             if (flags & RAM_SAVE_FLAG_HOOK) {
4373                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4374             } else {
4375                 error_report("Unknown combination of migration flags: %#x",
4376                              flags);
4377                 ret = -EINVAL;
4378             }
4379         }
4380         if (!ret) {
4381             ret = qemu_file_get_error(f);
4382         }
4383     }
4384
4385     return ret;
4386 }
4387
4388 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4389 {
4390     int ret = 0;
4391     static uint64_t seq_iter;
4392     /*
4393      * If system is running in postcopy mode, page inserts to host memory must
4394      * be atomic
4395      */
4396     bool postcopy_running = postcopy_is_running();
4397
4398     seq_iter++;
4399
4400     if (version_id != 4) {
4401         return -EINVAL;
4402     }
4403
4404     /*
4405      * This RCU critical section can be very long running.
4406      * When RCU reclaims in the code start to become numerous,
4407      * it will be necessary to reduce the granularity of this
4408      * critical section.
4409      */
4410     WITH_RCU_READ_LOCK_GUARD() {
4411         if (postcopy_running) {
4412             ret = ram_load_postcopy(f);
4413         } else {
4414             ret = ram_load_precopy(f);
4415         }
4416
4417         ret |= wait_for_decompress_done();
4418     }
4419     trace_ram_load_complete(ret, seq_iter);
4420
4421     if (!ret  && migration_incoming_in_colo_state()) {
4422         colo_flush_ram_cache();
4423     }
4424     return ret;
4425 }
4426
4427 static bool ram_has_postcopy(void *opaque)
4428 {
4429     RAMBlock *rb;
4430     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4431         if (ramblock_is_pmem(rb)) {
4432             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4433                          "is not supported now!", rb->idstr, rb->host);
4434             return false;
4435         }
4436     }
4437
4438     return migrate_postcopy_ram();
4439 }
4440
4441 /* Sync all the dirty bitmap with destination VM.  */
4442 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4443 {
4444     RAMBlock *block;
4445     QEMUFile *file = s->to_dst_file;
4446     int ramblock_count = 0;
4447
4448     trace_ram_dirty_bitmap_sync_start();
4449
4450     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4451         qemu_savevm_send_recv_bitmap(file, block->idstr);
4452         trace_ram_dirty_bitmap_request(block->idstr);
4453         ramblock_count++;
4454     }
4455
4456     trace_ram_dirty_bitmap_sync_wait();
4457
4458     /* Wait until all the ramblocks' dirty bitmap synced */
4459     while (ramblock_count--) {
4460         qemu_sem_wait(&s->rp_state.rp_sem);
4461     }
4462
4463     trace_ram_dirty_bitmap_sync_complete();
4464
4465     return 0;
4466 }
4467
4468 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4469 {
4470     qemu_sem_post(&s->rp_state.rp_sem);
4471 }
4472
4473 /*
4474  * Read the received bitmap, revert it as the initial dirty bitmap.
4475  * This is only used when the postcopy migration is paused but wants
4476  * to resume from a middle point.
4477  */
4478 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4479 {
4480     int ret = -EINVAL;
4481     QEMUFile *file = s->rp_state.from_dst_file;
4482     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4483     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4484     uint64_t size, end_mark;
4485
4486     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4487
4488     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4489         error_report("%s: incorrect state %s", __func__,
4490                      MigrationStatus_str(s->state));
4491         return -EINVAL;
4492     }
4493
4494     /*
4495      * Note: see comments in ramblock_recv_bitmap_send() on why we
4496      * need the endianess convertion, and the paddings.
4497      */
4498     local_size = ROUND_UP(local_size, 8);
4499
4500     /* Add paddings */
4501     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4502
4503     size = qemu_get_be64(file);
4504
4505     /* The size of the bitmap should match with our ramblock */
4506     if (size != local_size) {
4507         error_report("%s: ramblock '%s' bitmap size mismatch "
4508                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4509                      block->idstr, size, local_size);
4510         ret = -EINVAL;
4511         goto out;
4512     }
4513
4514     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4515     end_mark = qemu_get_be64(file);
4516
4517     ret = qemu_file_get_error(file);
4518     if (ret || size != local_size) {
4519         error_report("%s: read bitmap failed for ramblock '%s': %d"
4520                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4521                      __func__, block->idstr, ret, local_size, size);
4522         ret = -EIO;
4523         goto out;
4524     }
4525
4526     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4527         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4528                      __func__, block->idstr, end_mark);
4529         ret = -EINVAL;
4530         goto out;
4531     }
4532
4533     /*
4534      * Endianess convertion. We are during postcopy (though paused).
4535      * The dirty bitmap won't change. We can directly modify it.
4536      */
4537     bitmap_from_le(block->bmap, le_bitmap, nbits);
4538
4539     /*
4540      * What we received is "received bitmap". Revert it as the initial
4541      * dirty bitmap for this ramblock.
4542      */
4543     bitmap_complement(block->bmap, block->bmap, nbits);
4544
4545     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4546
4547     /*
4548      * We succeeded to sync bitmap for current ramblock. If this is
4549      * the last one to sync, we need to notify the main send thread.
4550      */
4551     ram_dirty_bitmap_reload_notify(s);
4552
4553     ret = 0;
4554 out:
4555     g_free(le_bitmap);
4556     return ret;
4557 }
4558
4559 static int ram_resume_prepare(MigrationState *s, void *opaque)
4560 {
4561     RAMState *rs = *(RAMState **)opaque;
4562     int ret;
4563
4564     ret = ram_dirty_bitmap_sync_all(s, rs);
4565     if (ret) {
4566         return ret;
4567     }
4568
4569     ram_state_resume_prepare(rs, s->to_dst_file);
4570
4571     return 0;
4572 }
4573
4574 static SaveVMHandlers savevm_ram_handlers = {
4575     .save_setup = ram_save_setup,
4576     .save_live_iterate = ram_save_iterate,
4577     .save_live_complete_postcopy = ram_save_complete,
4578     .save_live_complete_precopy = ram_save_complete,
4579     .has_postcopy = ram_has_postcopy,
4580     .save_live_pending = ram_save_pending,
4581     .load_state = ram_load,
4582     .save_cleanup = ram_save_cleanup,
4583     .load_setup = ram_load_setup,
4584     .load_cleanup = ram_load_cleanup,
4585     .resume_prepare = ram_resume_prepare,
4586 };
4587
4588 void ram_mig_init(void)
4589 {
4590     qemu_mutex_init(&XBZRLE.lock);
4591     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4592 }