migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was simply renamed.
 */

/* Flag bits that tag each record of the RAM migration stream. */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* XBZRLE statistics counters (externally visible definition). */
XBZRLECacheStats xbzrle_counters;

/*
 * File-global XBZRLE state: the page cache, the lock guarding it and
 * the scratch buffers used while encoding/decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
/*
 * Iterate over the RAMBlocks for which qemu_ram_is_migratable() is
 * true.  Non-migratable blocks hit the empty "if" branch; the loop body
 * written after the macro attaches to the trailing "else".
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * Remove RAMBLOCK_FOREACH for the rest of this file.
 * NOTE(review): presumably so code below cannot walk non-migratable
 * blocks by accident -- confirm.
 */
#undef RAMBLOCK_FOREACH
 168
 169 static void ramblock_recv_map_init(void)
 170 {
 171     RAMBlock *rb;
 172
 173     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 174         assert(!rb->receivedmap);
 175         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 176     }
 177 }
 178
 179 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 180 {
 181     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 182                     rb->receivedmap);
 183 }
 184
 185 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 186 {
 187     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 188 }
 189
 190 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 191 {
 192     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 193 }
 194
 195 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 196                                     size_t nr)
 197 {
 198     bitmap_set_atomic(rb->receivedmap,
 199                       ramblock_recv_bitmap_offset(host_addr, rb),
 200                       nr);
 201 }
 202
 203 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 204
 205 /*
 206  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 207  *
 208  * Returns >0 if success with sent bytes, or <0 if error.
 209  */
 210 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 211                                   const char *block_name)
 212 {
 213     RAMBlock *block = qemu_ram_block_by_name(block_name);
 214     unsigned long *le_bitmap, nbits;
 215     uint64_t size;
 216
 217     if (!block) {
 218         error_report("%s: invalid block name: %s", __func__, block_name);
 219         return -1;
 220     }
 221
 222     nbits = block->used_length >> TARGET_PAGE_BITS;
 223
 224     /*
 225      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 226      * machines we may need 4 more bytes for padding (see below
 227      * comment). So extend it a bit before hand.
 228      */
 229     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 230
 231     /*
 232      * Always use little endian when sending the bitmap. This is
 233      * required that when source and destination VMs are not using the
 234      * same endianess. (Note: big endian won't work.)
 235      */
 236     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 237
 238     /* Size of the bitmap, in bytes */
 239     size = DIV_ROUND_UP(nbits, 8);
 240
 241     /*
 242      * size is always aligned to 8 bytes for 64bit machines, but it
 243      * may not be true for 32bit machines. We need this padding to
 244      * make sure the migration can survive even between 32bit and
 245      * 64bit machines.
 246      */
 247     size = ROUND_UP(size, 8);
 248
 249     qemu_put_be64(file, size);
 250     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 251     /*
 252      * Mark as an end, in case the middle part is screwed up due to
 253      * some "misterious" reason.
 254      */
 255     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 256     qemu_fflush(file);
 257
 258     g_free(le_bitmap);
 259
 260     if (qemu_file_get_error(file)) {
 261         return qemu_file_get_error(file);
 262     }
 263
 264     return size + sizeof(size);
 265 }
 266
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* start of the requested range */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* linkage in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 278
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The single RAM migration state; readers must cope with it being NULL. */
static RAMState *ram_state;
 321
 322 uint64_t ram_bytes_remaining(void)
 323 {
 324     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 325                        0;
 326 }
 327
/* Global RAM migration counters (externally visible definition). */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 340
/* Per-thread state of one compression worker (see do_data_compress()). */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* set under @mutex to ask the worker to exit */
    bool quit;
    /* dummy QEMUFile that buffers the compressed output */
    QEMUFile *file;
    /* protects @quit, @block and @offset; paired with @cond */
    QemuMutex mutex;
    /* signalled to wake the worker */
    QemuCond cond;
    /* block of the page to compress; NULL when there is no work */
    RAMBlock *block;
    /* offset of the page to compress inside @block */
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 355
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the worker code is outside this view; the field notes
 * below are inferred from the parallel CompressParam -- confirm.
 */
struct DecompressParam {
    /* presumably: worker finished its current job */
    bool done;
    /* presumably: worker is asked to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination of the decompressed data */
    void *des;
    /* presumably: buffer holding the compressed input */
    uint8_t *compbuf;
    /* presumably: number of valid bytes in @compbuf */
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 367
/* Arrays with one entry per compression thread; NULL when not set up. */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above. */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in this file; needed by do_data_compress() below. */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);
 387
 388 static void *do_data_compress(void *opaque)
 389 {
 390     CompressParam *param = opaque;
 391     RAMBlock *block;
 392     ram_addr_t offset;
 393
 394     qemu_mutex_lock(&param->mutex);
 395     while (!param->quit) {
 396         if (param->block) {
 397             block = param->block;
 398             offset = param->offset;
 399             param->block = NULL;
 400             qemu_mutex_unlock(&param->mutex);
 401
 402             do_compress_ram_page(param->file, &param->stream, block, offset,
 403                                  param->originbuf);
 404
 405             qemu_mutex_lock(&comp_done_lock);
 406             param->done = true;
 407             qemu_cond_signal(&comp_done_cond);
 408             qemu_mutex_unlock(&comp_done_lock);
 409
 410             qemu_mutex_lock(&param->mutex);
 411         } else {
 412             qemu_cond_wait(&param->cond, &param->mutex);
 413         }
 414     }
 415     qemu_mutex_unlock(&param->mutex);
 416
 417     return NULL;
 418 }
 419
 420 static inline void terminate_compression_threads(void)
 421 {
 422     int idx, thread_count;
 423
 424     thread_count = migrate_compress_threads();
 425
 426     for (idx = 0; idx < thread_count; idx++) {
 427         qemu_mutex_lock(&comp_param[idx].mutex);
 428         comp_param[idx].quit = true;
 429         qemu_cond_signal(&comp_param[idx].cond);
 430         qemu_mutex_unlock(&comp_param[idx].mutex);
 431     }
 432 }
 433
 434 static void compress_threads_save_cleanup(void)
 435 {
 436     int i, thread_count;
 437
 438     if (!migrate_use_compression()) {
 439         return;
 440     }
 441     terminate_compression_threads();
 442     thread_count = migrate_compress_threads();
 443     for (i = 0; i < thread_count; i++) {
 444         /*
 445          * we use it as a indicator which shows if the thread is
 446          * properly init'd or not
 447          */
 448         if (!comp_param[i].file) {
 449             break;
 450         }
 451         qemu_thread_join(compress_threads + i);
 452         qemu_mutex_destroy(&comp_param[i].mutex);
 453         qemu_cond_destroy(&comp_param[i].cond);
 454         deflateEnd(&comp_param[i].stream);
 455         g_free(comp_param[i].originbuf);
 456         qemu_fclose(comp_param[i].file);
 457         comp_param[i].file = NULL;
 458     }
 459     qemu_mutex_destroy(&comp_done_lock);
 460     qemu_cond_destroy(&comp_done_cond);
 461     g_free(compress_threads);
 462     g_free(comp_param);
 463     compress_threads = NULL;
 464     comp_param = NULL;
 465 }
 466
 467 static int compress_threads_save_setup(void)
 468 {
 469     int i, thread_count;
 470
 471     if (!migrate_use_compression()) {
 472         return 0;
 473     }
 474     thread_count = migrate_compress_threads();
 475     compress_threads = g_new0(QemuThread, thread_count);
 476     comp_param = g_new0(CompressParam, thread_count);
 477     qemu_cond_init(&comp_done_cond);
 478     qemu_mutex_init(&comp_done_lock);
 479     for (i = 0; i < thread_count; i++) {
 480         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 481         if (!comp_param[i].originbuf) {
 482             goto exit;
 483         }
 484
 485         if (deflateInit(&comp_param[i].stream,
 486                         migrate_compress_level()) != Z_OK) {
 487             g_free(comp_param[i].originbuf);
 488             goto exit;
 489         }
 490
 491         /* comp_param[i].file is just used as a dummy buffer to save data,
 492          * set its ops to empty.
 493          */
 494         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 495         comp_param[i].done = true;
 496         comp_param[i].quit = false;
 497         qemu_mutex_init(&comp_param[i].mutex);
 498         qemu_cond_init(&comp_param[i].cond);
 499         qemu_thread_create(compress_threads + i, "compress",
 500                            do_data_compress, comp_param + i,
 501                            QEMU_THREAD_JOINABLE);
 502     }
 503     return 0;
 504
 505 exit:
 506     compress_threads_save_cleanup();
 507     return -1;
 508 }
 509
/* Multiple fd's */

/* Magic and version carried by every multifd message (sent big endian). */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag set by multifd_send_sync_main() on its sync packets. */
#define MULTIFD_FLAG_SYNC (1 << 0)

/*
 * First message sent on each multifd channel; see
 * multifd_send_initial_packet() / multifd_recv_initial_packet().
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 523
/*
 * Header of a multifd data packet.  The integer fields are converted
 * to/from big endian by multifd_send_fill_packet() and
 * multifd_recv_unfill_packet().
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* filled with migrate_multifd_page_count() by the sender */
    uint32_t size;
    /* number of valid entries in offset[] */
    uint32_t used;
    uint64_t packet_num;
    /* idstr of the RAMBlock the offsets refer to */
    char ramblock[256];
    /* page offsets inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 534
/*
 * A batch of pages, all from one RAMBlock; allocated by
 * multifd_pages_init() and released by multifd_pages_clear().
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 548
/* State of one multifd send channel and its thread. */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 587
/* State of one multifd receive channel and its thread. */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /*
     * packets sent through this channel
     * (NOTE(review): on the receive side presumably "received" -- confirm)
     */
    uint64_t num_packets;
    /*
     * pages sent through this channel
     * (NOTE(review): on the receive side presumably "received" -- confirm)
     */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 620
 621 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 622 {
 623     MultiFDInit_t msg;
 624     int ret;
 625
 626     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 627     msg.version = cpu_to_be32(MULTIFD_VERSION);
 628     msg.id = p->id;
 629     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 630
 631     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 632     if (ret != 0) {
 633         return -1;
 634     }
 635     return 0;
 636 }
 637
 638 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 639 {
 640     MultiFDInit_t msg;
 641     int ret;
 642
 643     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 644     if (ret != 0) {
 645         return -1;
 646     }
 647
 648     be32_to_cpus(&msg.magic);
 649     be32_to_cpus(&msg.version);
 650
 651     if (msg.magic != MULTIFD_MAGIC) {
 652         error_setg(errp, "multifd: received packet magic %x "
 653                    "expected %x", msg.magic, MULTIFD_MAGIC);
 654         return -1;
 655     }
 656
 657     if (msg.version != MULTIFD_VERSION) {
 658         error_setg(errp, "multifd: received packet version %d "
 659                    "expected %d", msg.version, MULTIFD_VERSION);
 660         return -1;
 661     }
 662
 663     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 664         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 665         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 666
 667         error_setg(errp, "multifd: received uuid '%s' and expected "
 668                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 669         g_free(uuid);
 670         g_free(msg_uuid);
 671         return -1;
 672     }
 673
 674     if (msg.id > migrate_multifd_channels()) {
 675         error_setg(errp, "multifd: received channel version %d "
 676                    "expected %d", msg.version, MULTIFD_VERSION);
 677         return -1;
 678     }
 679
 680     return msg.id;
 681 }
 682
 683 static MultiFDPages_t *multifd_pages_init(size_t size)
 684 {
 685     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 686
 687     pages->allocated = size;
 688     pages->iov = g_new0(struct iovec, size);
 689     pages->offset = g_new0(ram_addr_t, size);
 690
 691     return pages;
 692 }
 693
 694 static void multifd_pages_clear(MultiFDPages_t *pages)
 695 {
 696     pages->used = 0;
 697     pages->allocated = 0;
 698     pages->packet_num = 0;
 699     pages->block = NULL;
 700     g_free(pages->iov);
 701     pages->iov = NULL;
 702     g_free(pages->offset);
 703     pages->offset = NULL;
 704     g_free(pages);
 705 }
 706
 707 static void multifd_send_fill_packet(MultiFDSendParams *p)
 708 {
 709     MultiFDPacket_t *packet = p->packet;
 710     int i;
 711
 712     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 713     packet->version = cpu_to_be32(MULTIFD_VERSION);
 714     packet->flags = cpu_to_be32(p->flags);
 715     packet->size = cpu_to_be32(migrate_multifd_page_count());
 716     packet->used = cpu_to_be32(p->pages->used);
 717     packet->packet_num = cpu_to_be64(p->packet_num);
 718
 719     if (p->pages->block) {
 720         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 721     }
 722
 723     for (i = 0; i < p->pages->used; i++) {
 724         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 725     }
 726 }
 727
 728 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 729 {
 730     MultiFDPacket_t *packet = p->packet;
 731     RAMBlock *block;
 732     int i;
 733
 734     be32_to_cpus(&packet->magic);
 735     if (packet->magic != MULTIFD_MAGIC) {
 736         error_setg(errp, "multifd: received packet "
 737                    "magic %x and expected magic %x",
 738                    packet->magic, MULTIFD_MAGIC);
 739         return -1;
 740     }
 741
 742     be32_to_cpus(&packet->version);
 743     if (packet->version != MULTIFD_VERSION) {
 744         error_setg(errp, "multifd: received packet "
 745                    "version %d and expected version %d",
 746                    packet->version, MULTIFD_VERSION);
 747         return -1;
 748     }
 749
 750     p->flags = be32_to_cpu(packet->flags);
 751
 752     be32_to_cpus(&packet->size);
 753     if (packet->size > migrate_multifd_page_count()) {
 754         error_setg(errp, "multifd: received packet "
 755                    "with size %d and expected maximum size %d",
 756                    packet->size, migrate_multifd_page_count()) ;
 757         return -1;
 758     }
 759
 760     p->pages->used = be32_to_cpu(packet->used);
 761     if (p->pages->used > packet->size) {
 762         error_setg(errp, "multifd: received packet "
 763                    "with size %d and expected maximum size %d",
 764                    p->pages->used, packet->size) ;
 765         return -1;
 766     }
 767
 768     p->packet_num = be64_to_cpu(packet->packet_num);
 769
 770     if (p->pages->used) {
 771         /* make sure that ramblock is 0 terminated */
 772         packet->ramblock[255] = 0;
 773         block = qemu_ram_block_by_name(packet->ramblock);
 774         if (!block) {
 775             error_setg(errp, "multifd: unknown ram block %s",
 776                        packet->ramblock);
 777             return -1;
 778         }
 779     }
 780
 781     for (i = 0; i < p->pages->used; i++) {
 782         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 783
 784         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 785             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 786                        " (max " RAM_ADDR_FMT ")",
 787                        offset, block->max_length);
 788             return -1;
 789         }
 790         p->pages->iov[i].iov_base = block->host + offset;
 791         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 792     }
 793
 794     return 0;
 795 }
 796
/* Global state of the multifd send side; NULL when not set up. */
struct {
    /* per-channel state, indexed by channel number */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;

/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each time
 * that we need to send a batch of pages we interchange the ones between
 * multifd_send_state and the channel that is sending it.  There are
 * two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */
 828
 829 static void multifd_send_pages(void)
 830 {
 831     int i;
 832     static int next_channel;
 833     MultiFDSendParams *p = NULL; /* make happy gcc */
 834     MultiFDPages_t *pages = multifd_send_state->pages;
 835     uint64_t transferred;
 836
 837     qemu_sem_wait(&multifd_send_state->channels_ready);
 838     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 839         p = &multifd_send_state->params[i];
 840
 841         qemu_mutex_lock(&p->mutex);
 842         if (!p->pending_job) {
 843             p->pending_job++;
 844             next_channel = (i + 1) % migrate_multifd_channels();
 845             break;
 846         }
 847         qemu_mutex_unlock(&p->mutex);
 848     }
 849     p->pages->used = 0;
 850
 851     p->packet_num = multifd_send_state->packet_num++;
 852     p->pages->block = NULL;
 853     multifd_send_state->pages = p->pages;
 854     p->pages = pages;
 855     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 856     ram_counters.multifd_bytes += transferred;
 857     ram_counters.transferred += transferred;;
 858     qemu_mutex_unlock(&p->mutex);
 859     qemu_sem_post(&p->sem);
 860 }
 861
 862 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 863 {
 864     MultiFDPages_t *pages = multifd_send_state->pages;
 865
 866     if (!pages->block) {
 867         pages->block = block;
 868     }
 869
 870     if (pages->block == block) {
 871         pages->offset[pages->used] = offset;
 872         pages->iov[pages->used].iov_base = block->host + offset;
 873         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 874         pages->used++;
 875
 876         if (pages->used < pages->allocated) {
 877             return;
 878         }
 879     }
 880
 881     multifd_send_pages();
 882
 883     if (pages->block != block) {
 884         multifd_queue_page(block, offset);
 885     }
 886 }
 887
 888 static void multifd_send_terminate_threads(Error *err)
 889 {
 890     int i;
 891
 892     if (err) {
 893         MigrationState *s = migrate_get_current();
 894         migrate_set_error(s, err);
 895         if (s->state == MIGRATION_STATUS_SETUP ||
 896             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 897             s->state == MIGRATION_STATUS_DEVICE ||
 898             s->state == MIGRATION_STATUS_ACTIVE) {
 899             migrate_set_state(&s->state, s->state,
 900                               MIGRATION_STATUS_FAILED);
 901         }
 902     }
 903
 904     for (i = 0; i < migrate_multifd_channels(); i++) {
 905         MultiFDSendParams *p = &multifd_send_state->params[i];
 906
 907         qemu_mutex_lock(&p->mutex);
 908         p->quit = true;
 909         qemu_sem_post(&p->sem);
 910         qemu_mutex_unlock(&p->mutex);
 911     }
 912 }
 913
 914 int multifd_save_cleanup(Error **errp)
 915 {
 916     int i;
 917     int ret = 0;
 918
 919     if (!migrate_use_multifd()) {
 920         return 0;
 921     }
 922     multifd_send_terminate_threads(NULL);
 923     for (i = 0; i < migrate_multifd_channels(); i++) {
 924         MultiFDSendParams *p = &multifd_send_state->params[i];
 925
 926         if (p->running) {
 927             qemu_thread_join(&p->thread);
 928         }
 929         socket_send_channel_destroy(p->c);
 930         p->c = NULL;
 931         qemu_mutex_destroy(&p->mutex);
 932         qemu_sem_destroy(&p->sem);
 933         qemu_sem_destroy(&p->sem_sync);
 934         g_free(p->name);
 935         p->name = NULL;
 936         multifd_pages_clear(p->pages);
 937         p->pages = NULL;
 938         p->packet_len = 0;
 939         g_free(p->packet);
 940         p->packet = NULL;
 941     }
 942     qemu_sem_destroy(&multifd_send_state->channels_ready);
 943     qemu_sem_destroy(&multifd_send_state->sem_sync);
 944     g_free(multifd_send_state->params);
 945     multifd_send_state->params = NULL;
 946     multifd_pages_clear(multifd_send_state->pages);
 947     multifd_send_state->pages = NULL;
 948     g_free(multifd_send_state);
 949     multifd_send_state = NULL;
 950     return ret;
 951 }
 952
 953 static void multifd_send_sync_main(void)
 954 {
 955     int i;
 956
 957     if (!migrate_use_multifd()) {
 958         return;
 959     }
 960     if (multifd_send_state->pages->used) {
 961         multifd_send_pages();
 962     }
 963     for (i = 0; i < migrate_multifd_channels(); i++) {
 964         MultiFDSendParams *p = &multifd_send_state->params[i];
 965
 966         trace_multifd_send_sync_main_signal(p->id);
 967
 968         qemu_mutex_lock(&p->mutex);
 969
 970         p->packet_num = multifd_send_state->packet_num++;
 971         p->flags |= MULTIFD_FLAG_SYNC;
 972         p->pending_job++;
 973         qemu_mutex_unlock(&p->mutex);
 974         qemu_sem_post(&p->sem);
 975     }
 976     for (i = 0; i < migrate_multifd_channels(); i++) {
 977         MultiFDSendParams *p = &multifd_send_state->params[i];
 978
 979         trace_multifd_send_sync_main_wait(p->id);
 980         qemu_sem_wait(&multifd_send_state->sem_sync);
 981     }
 982     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 983 }
 984
/*
 * Thread body for one multifd send channel.
 *
 * Sends the initial packet, then loops waiting on p->sem.  For every
 * pending job it fills the packet header, writes the header followed by
 * the page iovecs to the channel, and posts channels_ready (plus the
 * global sem_sync when the job carried MULTIFD_FLAG_SYNC).  The loop ends
 * when p->quit is set and no job is pending, or when a channel write
 * fails.
 *
 * @opaque: the MultiFDSendParams of this channel
 *
 * Returns NULL.
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;
    int ret;

    trace_multifd_send_thread_start(p->id);

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            /*
             * Snapshot what the I/O below needs: flags and pages->used
             * are reset before the mutex is dropped.
             */
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            uint32_t flags = p->flags;

            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags);

            /* header first, then the pages themselves; no lock held */
            ret = qio_channel_write_all(p->c, (void *)p->packet,
                                        p->packet_len, &local_err);
            if (ret != 0) {
                /*
                 * NOTE(review): leaving here keeps pending_job raised and
                 * posts neither sem_sync nor channels_ready; waiters on
                 * those depend on the error path to get unblocked.
                 */
                break;
            }

            ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
            if (ret != 0) {
                break;
            }

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&multifd_send_state->sem_sync);
            }
            qemu_sem_post(&multifd_send_state->channels_ready);
        } else if (p->quit) {
            /* quit is only honoured once no job is pending */
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    /* an error on this channel takes all the other channels down too */
    if (local_err) {
        multifd_send_terminate_threads(local_err);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
1058
1059 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1060 {
1061     MultiFDSendParams *p = opaque;
1062     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1063     Error *local_err = NULL;
1064
1065     if (qio_task_propagate_error(task, &local_err)) {
1066         if (multifd_save_cleanup(&local_err) != 0) {
1067             migrate_set_error(migrate_get_current(), local_err);
1068         }
1069     } else {
1070         p->c = QIO_CHANNEL(sioc);
1071         qio_channel_set_delay(p->c, false);
1072         p->running = true;
1073         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1074                            QEMU_THREAD_JOINABLE);
1075
1076         atomic_inc(&multifd_send_state->count);
1077     }
1078 }
1079
1080 int multifd_save_setup(void)
1081 {
1082     int thread_count;
1083     uint32_t page_count = migrate_multifd_page_count();
1084     uint8_t i;
1085
1086     if (!migrate_use_multifd()) {
1087         return 0;
1088     }
1089     thread_count = migrate_multifd_channels();
1090     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1091     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1092     atomic_set(&multifd_send_state->count, 0);
1093     multifd_send_state->pages = multifd_pages_init(page_count);
1094     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1095     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1096
1097     for (i = 0; i < thread_count; i++) {
1098         MultiFDSendParams *p = &multifd_send_state->params[i];
1099
1100         qemu_mutex_init(&p->mutex);
1101         qemu_sem_init(&p->sem, 0);
1102         qemu_sem_init(&p->sem_sync, 0);
1103         p->quit = false;
1104         p->pending_job = 0;
1105         p->id = i;
1106         p->pages = multifd_pages_init(page_count);
1107         p->packet_len = sizeof(MultiFDPacket_t)
1108                       + sizeof(ram_addr_t) * page_count;
1109         p->packet = g_malloc0(p->packet_len);
1110         p->name = g_strdup_printf("multifdsend_%d", i);
1111         socket_send_channel_create(multifd_new_send_channel_async, p);
1112     }
1113     return 0;
1114 }
1115
/* Receive-side multifd state, allocated in multifd_load_setup() */
struct {
    /* one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads; accessed with the atomic_*() helpers */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* highest per-channel packet number seen by multifd_recv_sync_main() */
    uint64_t packet_num;
} *multifd_recv_state;
1125
1126 static void multifd_recv_terminate_threads(Error *err)
1127 {
1128     int i;
1129
1130     if (err) {
1131         MigrationState *s = migrate_get_current();
1132         migrate_set_error(s, err);
1133         if (s->state == MIGRATION_STATUS_SETUP ||
1134             s->state == MIGRATION_STATUS_ACTIVE) {
1135             migrate_set_state(&s->state, s->state,
1136                               MIGRATION_STATUS_FAILED);
1137         }
1138     }
1139
1140     for (i = 0; i < migrate_multifd_channels(); i++) {
1141         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1142
1143         qemu_mutex_lock(&p->mutex);
1144         /* We could arrive here for two reasons:
1145            - normal quit, i.e. everything went fine, just finished
1146            - error quit: We close the channels so the channel threads
1147              finish the qio_channel_read_all_eof() */
1148         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1149         qemu_mutex_unlock(&p->mutex);
1150     }
1151 }
1152
1153 int multifd_load_cleanup(Error **errp)
1154 {
1155     int i;
1156     int ret = 0;
1157
1158     if (!migrate_use_multifd()) {
1159         return 0;
1160     }
1161     multifd_recv_terminate_threads(NULL);
1162     for (i = 0; i < migrate_multifd_channels(); i++) {
1163         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1164
1165         if (p->running) {
1166             qemu_thread_join(&p->thread);
1167         }
1168         object_unref(OBJECT(p->c));
1169         p->c = NULL;
1170         qemu_mutex_destroy(&p->mutex);
1171         qemu_sem_destroy(&p->sem_sync);
1172         g_free(p->name);
1173         p->name = NULL;
1174         multifd_pages_clear(p->pages);
1175         p->pages = NULL;
1176         p->packet_len = 0;
1177         g_free(p->packet);
1178         p->packet = NULL;
1179     }
1180     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1181     g_free(multifd_recv_state->params);
1182     multifd_recv_state->params = NULL;
1183     g_free(multifd_recv_state);
1184     multifd_recv_state = NULL;
1185
1186     return ret;
1187 }
1188
1189 static void multifd_recv_sync_main(void)
1190 {
1191     int i;
1192
1193     if (!migrate_use_multifd()) {
1194         return;
1195     }
1196     for (i = 0; i < migrate_multifd_channels(); i++) {
1197         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1198
1199         trace_multifd_recv_sync_main_wait(p->id);
1200         qemu_sem_wait(&multifd_recv_state->sem_sync);
1201         qemu_mutex_lock(&p->mutex);
1202         if (multifd_recv_state->packet_num < p->packet_num) {
1203             multifd_recv_state->packet_num = p->packet_num;
1204         }
1205         qemu_mutex_unlock(&p->mutex);
1206     }
1207     for (i = 0; i < migrate_multifd_channels(); i++) {
1208         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1209
1210         trace_multifd_recv_sync_main_signal(p->id);
1211         qemu_sem_post(&p->sem_sync);
1212     }
1213     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1214 }
1215
1216 static void *multifd_recv_thread(void *opaque)
1217 {
1218     MultiFDRecvParams *p = opaque;
1219     Error *local_err = NULL;
1220     int ret;
1221
1222     trace_multifd_recv_thread_start(p->id);
1223
1224     while (true) {
1225         uint32_t used;
1226         uint32_t flags;
1227
1228         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1229                                        p->packet_len, &local_err);
1230         if (ret == 0) {   /* EOF */
1231             break;
1232         }
1233         if (ret == -1) {   /* Error */
1234             break;
1235         }
1236
1237         qemu_mutex_lock(&p->mutex);
1238         ret = multifd_recv_unfill_packet(p, &local_err);
1239         if (ret) {
1240             qemu_mutex_unlock(&p->mutex);
1241             break;
1242         }
1243
1244         used = p->pages->used;
1245         flags = p->flags;
1246         trace_multifd_recv(p->id, p->packet_num, used, flags);
1247         p->num_packets++;
1248         p->num_pages += used;
1249         qemu_mutex_unlock(&p->mutex);
1250
1251         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1252         if (ret != 0) {
1253             break;
1254         }
1255
1256         if (flags & MULTIFD_FLAG_SYNC) {
1257             qemu_sem_post(&multifd_recv_state->sem_sync);
1258             qemu_sem_wait(&p->sem_sync);
1259         }
1260     }
1261
1262     if (local_err) {
1263         multifd_recv_terminate_threads(local_err);
1264     }
1265     qemu_mutex_lock(&p->mutex);
1266     p->running = false;
1267     qemu_mutex_unlock(&p->mutex);
1268
1269     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1270
1271     return NULL;
1272 }
1273
1274 int multifd_load_setup(void)
1275 {
1276     int thread_count;
1277     uint32_t page_count = migrate_multifd_page_count();
1278     uint8_t i;
1279
1280     if (!migrate_use_multifd()) {
1281         return 0;
1282     }
1283     thread_count = migrate_multifd_channels();
1284     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1285     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1286     atomic_set(&multifd_recv_state->count, 0);
1287     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1288
1289     for (i = 0; i < thread_count; i++) {
1290         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1291
1292         qemu_mutex_init(&p->mutex);
1293         qemu_sem_init(&p->sem_sync, 0);
1294         p->id = i;
1295         p->pages = multifd_pages_init(page_count);
1296         p->packet_len = sizeof(MultiFDPacket_t)
1297                       + sizeof(ram_addr_t) * page_count;
1298         p->packet = g_malloc0(p->packet_len);
1299         p->name = g_strdup_printf("multifdrecv_%d", i);
1300     }
1301     return 0;
1302 }
1303
1304 bool multifd_recv_all_channels_created(void)
1305 {
1306     int thread_count = migrate_multifd_channels();
1307
1308     if (!migrate_use_multifd()) {
1309         return true;
1310     }
1311
1312     return thread_count == atomic_read(&multifd_recv_state->count);
1313 }
1314
1315 /* Return true if multifd is ready for the migration, otherwise false */
1316 bool multifd_recv_new_channel(QIOChannel *ioc)
1317 {
1318     MultiFDRecvParams *p;
1319     Error *local_err = NULL;
1320     int id;
1321
1322     id = multifd_recv_initial_packet(ioc, &local_err);
1323     if (id < 0) {
1324         multifd_recv_terminate_threads(local_err);
1325         return false;
1326     }
1327
1328     p = &multifd_recv_state->params[id];
1329     if (p->c != NULL) {
1330         error_setg(&local_err, "multifd: received id '%d' already setup'",
1331                    id);
1332         multifd_recv_terminate_threads(local_err);
1333         return false;
1334     }
1335     p->c = ioc;
1336     object_ref(OBJECT(ioc));
1337     /* initial packet */
1338     p->num_packets = 1;
1339
1340     p->running = true;
1341     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1342                        QEMU_THREAD_JOINABLE);
1343     atomic_inc(&multifd_recv_state->count);
1344     return multifd_recv_state->count == migrate_multifd_channels();
1345 }
1346
1347 /**
1348  * save_page_header: write page header to wire
1349  *
1350  * If this is the 1st block, it also writes the block identification
1351  *
1352  * Returns the number of bytes written
1353  *
1354  * @f: QEMUFile where to send the data
1355  * @block: block that contains the page we want to send
1356  * @offset: offset inside the block for the page
1357  *          in the lower bits, it contains flags
1358  */
1359 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1360                                ram_addr_t offset)
1361 {
1362     size_t size, len;
1363
1364     if (block == rs->last_sent_block) {
1365         offset |= RAM_SAVE_FLAG_CONTINUE;
1366     }
1367     qemu_put_be64(f, offset);
1368     size = 8;
1369
1370     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1371         len = strlen(block->idstr);
1372         qemu_put_byte(f, len);
1373         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1374         size += 1 + len;
1375         rs->last_sent_block = block;
1376     }
1377     return size;
1378 }
1379
1380 /**
1381  * mig_throttle_guest_down: throotle down the guest
1382  *
1383  * Reduce amount of guest cpu execution to hopefully slow down memory
1384  * writes. If guest dirty memory rate is reduced below the rate at
1385  * which we can transfer pages to the destination then we should be
1386  * able to complete migration. Some workloads dirty memory way too
1387  * fast and will not effectively converge, even with auto-converge.
1388  */
1389 static void mig_throttle_guest_down(void)
1390 {
1391     MigrationState *s = migrate_get_current();
1392     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1393     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1394
1395     /* We have not started throttling yet. Let's start it. */
1396     if (!cpu_throttle_active()) {
1397         cpu_throttle_set(pct_initial);
1398     } else {
1399         /* Throttling already on, just increase the rate */
1400         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
1401     }
1402 }
1403
1404 /**
1405  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1406  *
1407  * @rs: current RAM state
1408  * @current_addr: address for the zero page
1409  *
1410  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1411  * The important thing is that a stale (not-yet-0'd) page be replaced
1412  * by the new data.
1413  * As a bonus, if the page wasn't in the cache it gets added so that
1414  * when a small write is made into the 0'd page it gets XBZRLE sent.
1415  */
1416 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1417 {
1418     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1419         return;
1420     }
1421
1422     /* We don't care if this fails to allocate a new cache page
1423      * as long as it updated an old one */
1424     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1425                  ram_counters.dirty_sync_count);
1426 }
1427
/* Encoding byte save_xbzrle_page() writes right after the page header */
#define ENCODING_FLAG_XBZRLE 0x1
1429
1430 /**
1431  * save_xbzrle_page: compress and send current page
1432  *
1433  * Returns: 1 means that we wrote the page
1434  *          0 means that page is identical to the one already sent
1435  *          -1 means that xbzrle would be longer than normal
1436  *
1437  * @rs: current RAM state
1438  * @current_data: pointer to the address of the page contents
1439  * @current_addr: addr of the page
1440  * @block: block that contains the page we want to send
1441  * @offset: offset inside the block for the page
1442  * @last_stage: if we are at the completion stage
1443  */
1444 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1445                             ram_addr_t current_addr, RAMBlock *block,
1446                             ram_addr_t offset, bool last_stage)
1447 {
1448     int encoded_len = 0, bytes_xbzrle;
1449     uint8_t *prev_cached_page;
1450
1451     if (!cache_is_cached(XBZRLE.cache, current_addr,
1452                          ram_counters.dirty_sync_count)) {
1453         xbzrle_counters.cache_miss++;
1454         if (!last_stage) {
1455             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1456                              ram_counters.dirty_sync_count) == -1) {
1457                 return -1;
1458             } else {
1459                 /* update *current_data when the page has been
1460                    inserted into cache */
1461                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1462             }
1463         }
1464         return -1;
1465     }
1466
1467     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1468
1469     /* save current buffer into memory */
1470     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1471
1472     /* XBZRLE encoding (if there is no overflow) */
1473     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1474                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1475                                        TARGET_PAGE_SIZE);
1476     if (encoded_len == 0) {
1477         trace_save_xbzrle_page_skipping();
1478         return 0;
1479     } else if (encoded_len == -1) {
1480         trace_save_xbzrle_page_overflow();
1481         xbzrle_counters.overflow++;
1482         /* update data in the cache */
1483         if (!last_stage) {
1484             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1485             *current_data = prev_cached_page;
1486         }
1487         return -1;
1488     }
1489
1490     /* we need to update the data in the cache, in order to get the same data */
1491     if (!last_stage) {
1492         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1493     }
1494
1495     /* Send XBZRLE based compressed page */
1496     bytes_xbzrle = save_page_header(rs, rs->f, block,
1497                                     offset | RAM_SAVE_FLAG_XBZRLE);
1498     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1499     qemu_put_be16(rs->f, encoded_len);
1500     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1501     bytes_xbzrle += encoded_len + 1 + 2;
1502     xbzrle_counters.pages++;
1503     xbzrle_counters.bytes += bytes_xbzrle;
1504     ram_counters.transferred += bytes_xbzrle;
1505
1506     return 1;
1507 }
1508
1509 /**
1510  * migration_bitmap_find_dirty: find the next dirty page from start
1511  *
1512  * Called with rcu_read_lock() to protect migration_bitmap
1513  *
1514  * Returns the byte offset within memory region of the start of a dirty page
1515  *
1516  * @rs: current RAM state
1517  * @rb: RAMBlock where to search for dirty pages
1518  * @start: page where we start the search
1519  */
1520 static inline
1521 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1522                                           unsigned long start)
1523 {
1524     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1525     unsigned long *bitmap = rb->bmap;
1526     unsigned long next;
1527
1528     if (!qemu_ram_is_migratable(rb)) {
1529         return size;
1530     }
1531
1532     if (rs->ram_bulk_stage && start > 0) {
1533         next = start + 1;
1534     } else {
1535         next = find_next_bit(bitmap, size, start);
1536     }
1537
1538     return next;
1539 }
1540
1541 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1542                                                 RAMBlock *rb,
1543                                                 unsigned long page)
1544 {
1545     bool ret;
1546
1547     ret = test_and_clear_bit(page, rb->bmap);
1548
1549     if (ret) {
1550         rs->migration_dirty_pages--;
1551     }
1552     return ret;
1553 }
1554
/*
 * Sync the dirty bitmap for [@start, @start + @length) of @rb and add
 * the value returned by cpu_physical_memory_sync_dirty_bitmap() to the
 * count of dirty pages in @rs.  The helper is also handed the per-period
 * dirty page counter of @rs.
 */
static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}
1562
1563 /**
1564  * ram_pagesize_summary: calculate all the pagesizes of a VM
1565  *
1566  * Returns a summary bitmap of the page sizes of all RAMBlocks
1567  *
1568  * For VMs with just normal pages this is equivalent to the host page
1569  * size. If it's got some huge pages then it's the OR of all the
1570  * different page sizes.
1571  */
1572 uint64_t ram_pagesize_summary(void)
1573 {
1574     RAMBlock *block;
1575     uint64_t summary = 0;
1576
1577     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1578         summary |= block->page_size;
1579     }
1580
1581     return summary;
1582 }
1583
1584 static void migration_update_rates(RAMState *rs, int64_t end_time)
1585 {
1586     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1587
1588     /* calculate period counters */
1589     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1590                 / (end_time - rs->time_last_bitmap_sync);
1591
1592     if (!iter_count) {
1593         return;
1594     }
1595
1596     if (migrate_use_xbzrle()) {
1597         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1598             rs->xbzrle_cache_miss_prev) / iter_count;
1599         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1600     }
1601 }
1602
/*
 * Pull the dirty log into the migration bitmaps of all migratable
 * blocks and update the related statistics.
 *
 * Once more than a second has passed since the period started, the
 * auto-converge check and the rate counters are run and the period
 * counters are reset.  A MIGRATION_PASS event is sent when events are
 * enabled.
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* first call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    ram_counters.remaining = ram_bytes_remaining();
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the bytes dirtied in this period exceed 50%
               of the approx. amount of bytes that just got transferred
               since the last time we were in this routine. If that happens
               twice, start or increase throttling.
               The hit counter is only bumped when the first condition
               holds and is only reset once throttling kicks in, so the
               two hits do not have to be consecutive. */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->iterations_prev = rs->iterations;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
1667
1668 /**
1669  * save_zero_page: send the zero page to the stream
1670  *
1671  * Returns the number of pages written.
1672  *
1673  * @rs: current RAM state
1674  * @block: block that contains the page we want to send
1675  * @offset: offset inside the block for the page
1676  */
1677 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1678 {
1679     uint8_t *p = block->host + offset;
1680     int pages = -1;
1681
1682     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1683         ram_counters.duplicate++;
1684         ram_counters.transferred +=
1685             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1686         qemu_put_byte(rs->f, 0);
1687         ram_counters.transferred += 1;
1688         pages = 1;
1689     }
1690
1691     return pages;
1692 }
1693
1694 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1695 {
1696     if (!migrate_release_ram() || !migration_in_postcopy()) {
1697         return;
1698     }
1699
1700     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1701 }
1702
1703 /*
1704  * @pages: the number of pages written by the control path,
1705  *        < 0 - error
1706  *        > 0 - number of pages written
1707  *
1708  * Return true if the pages has been saved, otherwise false is returned.
1709  */
1710 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1711                               int *pages)
1712 {
1713     uint64_t bytes_xmit = 0;
1714     int ret;
1715
1716     *pages = -1;
1717     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1718                                 &bytes_xmit);
1719     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1720         return false;
1721     }
1722
1723     if (bytes_xmit) {
1724         ram_counters.transferred += bytes_xmit;
1725         *pages = 1;
1726     }
1727
1728     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1729         return true;
1730     }
1731
1732     if (bytes_xmit > 0) {
1733         ram_counters.normal++;
1734     } else if (bytes_xmit == 0) {
1735         ram_counters.duplicate++;
1736     }
1737
1738     return true;
1739 }
1740
1741 /*
1742  * directly send the page to the stream
1743  *
1744  * Returns the number of pages written.
1745  *
1746  * @rs: current RAM state
1747  * @block: block that contains the page we want to send
1748  * @offset: offset inside the block for the page
1749  * @buf: the page to be sent
1750  * @async: send to page asyncly
1751  */
1752 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1753                             uint8_t *buf, bool async)
1754 {
1755     ram_counters.transferred += save_page_header(rs, rs->f, block,
1756                                                  offset | RAM_SAVE_FLAG_PAGE);
1757     if (async) {
1758         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1759                               migrate_release_ram() &
1760                               migration_in_postcopy());
1761     } else {
1762         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1763     }
1764     ram_counters.transferred += TARGET_PAGE_SIZE;
1765     ram_counters.normal++;
1766     return 1;
1767 }
1768
1769 /**
1770  * ram_save_page: send the given page to the stream
1771  *
1772  * Returns the number of pages written.
1773  *          < 0 - error
1774  *          >=0 - Number of pages written - this might legally be 0
1775  *                if xbzrle noticed the page was the same.
1776  *
1777  * @rs: current RAM state
1778  * @block: block that contains the page we want to send
1779  * @offset: offset inside the block for the page
1780  * @last_stage: if we are at the completion stage
1781  */
1782 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1783 {
1784     int pages = -1;
1785     uint8_t *p;
1786     bool send_async = true;
1787     RAMBlock *block = pss->block;
1788     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1789     ram_addr_t current_addr = block->offset + offset;
1790
1791     p = block->host + offset;
1792     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1793
1794     XBZRLE_cache_lock();
1795     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1796         migrate_use_xbzrle()) {
1797         pages = save_xbzrle_page(rs, &p, current_addr, block,
1798                                  offset, last_stage);
1799         if (!last_stage) {
1800             /* Can't send this cached data async, since the cache page
1801              * might get updated before it gets to the wire
1802              */
1803             send_async = false;
1804         }
1805     }
1806
1807     /* XBZRLE overflow or normal page */
1808     if (pages == -1) {
1809         pages = save_normal_page(rs, block, offset, p, send_async);
1810     }
1811
1812     XBZRLE_cache_unlock();
1813
1814     return pages;
1815 }
1816
1817 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1818                                  ram_addr_t offset)
1819 {
1820     multifd_queue_page(block, offset);
1821     ram_counters.normal++;
1822
1823     return 1;
1824 }
1825
/*
 * Compress one page and write it, header included, to @f.
 *
 * @f: QEMUFile that receives the header and the compressed data
 * @stream: zlib stream used for the compression
 * @block: block that contains the page
 * @offset: offset inside the block; it is masked with TARGET_PAGE_MASK
 *          before being used to locate the page
 * @source_buf: scratch buffer of at least TARGET_PAGE_SIZE bytes
 *
 * Returns the number of bytes written (header plus compressed data), or
 * 0 if the compression failed; in that case the error is also set on
 * the migration's to_dst_file.
 */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *page = block->host + (offset & TARGET_PAGE_MASK);
    int header_len;
    int blen;

    header_len = save_page_header(rs, f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to a internal buffer to avoid it being modified by VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, page, TARGET_PAGE_SIZE);

    blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (blen < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
        return 0;
    }

    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return header_len + blen;
}
1854
1855 static void flush_compressed_data(RAMState *rs)
1856 {
1857     int idx, len, thread_count;
1858
1859     if (!migrate_use_compression()) {
1860         return;
1861     }
1862     thread_count = migrate_compress_threads();
1863
1864     qemu_mutex_lock(&comp_done_lock);
1865     for (idx = 0; idx < thread_count; idx++) {
1866         while (!comp_param[idx].done) {
1867             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1868         }
1869     }
1870     qemu_mutex_unlock(&comp_done_lock);
1871
1872     for (idx = 0; idx < thread_count; idx++) {
1873         qemu_mutex_lock(&comp_param[idx].mutex);
1874         if (!comp_param[idx].quit) {
1875             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1876             ram_counters.transferred += len;
1877         }
1878         qemu_mutex_unlock(&comp_param[idx].mutex);
1879     }
1880 }
1881
1882 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1883                                        ram_addr_t offset)
1884 {
1885     param->block = block;
1886     param->offset = offset;
1887 }
1888
1889 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1890                                            ram_addr_t offset)
1891 {
1892     int idx, thread_count, bytes_xmit = -1, pages = -1;
1893
1894     thread_count = migrate_compress_threads();
1895     qemu_mutex_lock(&comp_done_lock);
1896     while (true) {
1897         for (idx = 0; idx < thread_count; idx++) {
1898             if (comp_param[idx].done) {
1899                 comp_param[idx].done = false;
1900                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1901                 qemu_mutex_lock(&comp_param[idx].mutex);
1902                 set_compress_params(&comp_param[idx], block, offset);
1903                 qemu_cond_signal(&comp_param[idx].cond);
1904                 qemu_mutex_unlock(&comp_param[idx].mutex);
1905                 pages = 1;
1906                 ram_counters.normal++;
1907                 ram_counters.transferred += bytes_xmit;
1908                 break;
1909             }
1910         }
1911         if (pages > 0) {
1912             break;
1913         } else {
1914             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1915         }
1916     }
1917     qemu_mutex_unlock(&comp_done_lock);
1918
1919     return pages;
1920 }
1921
1922 /**
1923  * find_dirty_block: find the next dirty page and update any state
1924  * associated with the search process.
1925  *
1926  * Returns if a page is found
1927  *
1928  * @rs: current RAM state
1929  * @pss: data about the state of the current dirty page scan
1930  * @again: set to false if the search has scanned the whole of RAM
1931  */
1932 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1933 {
1934     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1935     if (pss->complete_round && pss->block == rs->last_seen_block &&
1936         pss->page >= rs->last_page) {
1937         /*
1938          * We've been once around the RAM and haven't found anything.
1939          * Give up.
1940          */
1941         *again = false;
1942         return false;
1943     }
1944     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1945         /* Didn't find anything in this RAM Block */
1946         pss->page = 0;
1947         pss->block = QLIST_NEXT_RCU(pss->block, next);
1948         if (!pss->block) {
1949             /* Hit the end of the list */
1950             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1951             /* Flag that we've looped */
1952             pss->complete_round = true;
1953             rs->ram_bulk_stage = false;
1954             if (migrate_use_xbzrle()) {
1955                 /* If xbzrle is on, stop using the data compression at this
1956                  * point. In theory, xbzrle can do better than compression.
1957                  */
1958                 flush_compressed_data(rs);
1959             }
1960         }
1961         /* Didn't find anything this time, but try again on the new block */
1962         *again = true;
1963         return false;
1964     } else {
1965         /* Can go around again, but... */
1966         *again = true;
1967         /* We've found something so probably don't need to */
1968         return true;
1969     }
1970 }
1971
1972 /**
1973  * unqueue_page: gets a page of the queue
1974  *
1975  * Helper for 'get_queued_page' - gets a page off the queue
1976  *
1977  * Returns the block of the page (or NULL if none available)
1978  *
1979  * @rs: current RAM state
1980  * @offset: used to return the offset within the RAMBlock
1981  */
1982 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1983 {
1984     RAMBlock *block = NULL;
1985
1986     qemu_mutex_lock(&rs->src_page_req_mutex);
1987     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1988         struct RAMSrcPageRequest *entry =
1989                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1990         block = entry->rb;
1991         *offset = entry->offset;
1992
1993         if (entry->len > TARGET_PAGE_SIZE) {
1994             entry->len -= TARGET_PAGE_SIZE;
1995             entry->offset += TARGET_PAGE_SIZE;
1996         } else {
1997             memory_region_unref(block->mr);
1998             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1999             g_free(entry);
2000             migration_consume_urgent_request();
2001         }
2002     }
2003     qemu_mutex_unlock(&rs->src_page_req_mutex);
2004
2005     return block;
2006 }
2007
2008 /**
2009  * get_queued_page: unqueue a page from the postocpy requests
2010  *
2011  * Skips pages that are already sent (!dirty)
2012  *
2013  * Returns if a queued page is found
2014  *
2015  * @rs: current RAM state
2016  * @pss: data about the state of the current dirty page scan
2017  */
2018 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2019 {
2020     RAMBlock  *block;
2021     ram_addr_t offset;
2022     bool dirty;
2023
2024     do {
2025         block = unqueue_page(rs, &offset);
2026         /*
2027          * We're sending this page, and since it's postcopy nothing else
2028          * will dirty it, and we must make sure it doesn't get sent again
2029          * even if this queue request was received after the background
2030          * search already sent it.
2031          */
2032         if (block) {
2033             unsigned long page;
2034
2035             page = offset >> TARGET_PAGE_BITS;
2036             dirty = test_bit(page, block->bmap);
2037             if (!dirty) {
2038                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2039                        page, test_bit(page, block->unsentmap));
2040             } else {
2041                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2042             }
2043         }
2044
2045     } while (block && !dirty);
2046
2047     if (block) {
2048         /*
2049          * As soon as we start servicing pages out of order, then we have
2050          * to kill the bulk stage, since the bulk stage assumes
2051          * in (migration_bitmap_find_and_reset_dirty) that every page is
2052          * dirty, that's no longer true.
2053          */
2054         rs->ram_bulk_stage = false;
2055
2056         /*
2057          * We want the background search to continue from the queued page
2058          * since the guest is likely to want other pages near to the page
2059          * it just requested.
2060          */
2061         pss->block = block;
2062         pss->page = offset >> TARGET_PAGE_BITS;
2063     }
2064
2065     return !!block;
2066 }
2067
2068 /**
2069  * migration_page_queue_free: drop any remaining pages in the ram
2070  * request queue
2071  *
2072  * It should be empty at the end anyway, but in error cases there may
2073  * be some left.  in case that there is any page left, we drop it.
2074  *
2075  */
2076 static void migration_page_queue_free(RAMState *rs)
2077 {
2078     struct RAMSrcPageRequest *mspr, *next_mspr;
2079     /* This queue generally should be empty - but in the case of a failed
2080      * migration might have some droppings in.
2081      */
2082     rcu_read_lock();
2083     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2084         memory_region_unref(mspr->rb->mr);
2085         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2086         g_free(mspr);
2087     }
2088     rcu_read_unlock();
2089 }
2090
2091 /**
2092  * ram_save_queue_pages: queue the page for transmission
2093  *
2094  * A request from postcopy destination for example.
2095  *
2096  * Returns zero on success or negative on error
2097  *
2098  * @rbname: Name of the RAMBLock of the request. NULL means the
2099  *          same that last one.
2100  * @start: starting address from the start of the RAMBlock
2101  * @len: length (in bytes) to send
2102  */
2103 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2104 {
2105     RAMBlock *ramblock;
2106     RAMState *rs = ram_state;
2107
2108     ram_counters.postcopy_requests++;
2109     rcu_read_lock();
2110     if (!rbname) {
2111         /* Reuse last RAMBlock */
2112         ramblock = rs->last_req_rb;
2113
2114         if (!ramblock) {
2115             /*
2116              * Shouldn't happen, we can't reuse the last RAMBlock if
2117              * it's the 1st request.
2118              */
2119             error_report("ram_save_queue_pages no previous block");
2120             goto err;
2121         }
2122     } else {
2123         ramblock = qemu_ram_block_by_name(rbname);
2124
2125         if (!ramblock) {
2126             /* We shouldn't be asked for a non-existent RAMBlock */
2127             error_report("ram_save_queue_pages no block '%s'", rbname);
2128             goto err;
2129         }
2130         rs->last_req_rb = ramblock;
2131     }
2132     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2133     if (start+len > ramblock->used_length) {
2134         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2135                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2136                      __func__, start, len, ramblock->used_length);
2137         goto err;
2138     }
2139
2140     struct RAMSrcPageRequest *new_entry =
2141         g_malloc0(sizeof(struct RAMSrcPageRequest));
2142     new_entry->rb = ramblock;
2143     new_entry->offset = start;
2144     new_entry->len = len;
2145
2146     memory_region_ref(ramblock->mr);
2147     qemu_mutex_lock(&rs->src_page_req_mutex);
2148     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2149     migration_make_urgent_request();
2150     qemu_mutex_unlock(&rs->src_page_req_mutex);
2151     rcu_read_unlock();
2152
2153     return 0;
2154
2155 err:
2156     rcu_read_unlock();
2157     return -1;
2158 }
2159
2160 static bool save_page_use_compression(RAMState *rs)
2161 {
2162     if (!migrate_use_compression()) {
2163         return false;
2164     }
2165
2166     /*
2167      * If xbzrle is on, stop using the data compression after first
2168      * round of migration even if compression is enabled. In theory,
2169      * xbzrle can do better than compression.
2170      */
2171     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2172         return true;
2173     }
2174
2175     return false;
2176 }
2177
2178 /**
2179  * ram_save_target_page: save one target page
2180  *
2181  * Returns the number of pages written
2182  *
2183  * @rs: current RAM state
2184  * @pss: data about the page we want to send
2185  * @last_stage: if we are at the completion stage
2186  */
2187 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2188                                 bool last_stage)
2189 {
2190     RAMBlock *block = pss->block;
2191     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2192     int res;
2193
2194     if (control_save_page(rs, block, offset, &res)) {
2195         return res;
2196     }
2197
2198     /*
2199      * When starting the process of a new block, the first page of
2200      * the block should be sent out before other pages in the same
2201      * block, and all the pages in last block should have been sent
2202      * out, keeping this order is important, because the 'cont' flag
2203      * is used to avoid resending the block name.
2204      */
2205     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2206             flush_compressed_data(rs);
2207     }
2208
2209     res = save_zero_page(rs, block, offset);
2210     if (res > 0) {
2211         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2212          * page would be stale
2213          */
2214         if (!save_page_use_compression(rs)) {
2215             XBZRLE_cache_lock();
2216             xbzrle_cache_zero_page(rs, block->offset + offset);
2217             XBZRLE_cache_unlock();
2218         }
2219         ram_release_pages(block->idstr, offset, res);
2220         return res;
2221     }
2222
2223     /*
2224      * Make sure the first page is sent out before other pages.
2225      *
2226      * we post it as normal page as compression will take much
2227      * CPU resource.
2228      */
2229     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2230         return compress_page_with_multi_thread(rs, block, offset);
2231     } else if (migrate_use_multifd()) {
2232         return ram_save_multifd_page(rs, block, offset);
2233     }
2234
2235     return ram_save_page(rs, pss, last_stage);
2236 }
2237
2238 /**
2239  * ram_save_host_page: save a whole host page
2240  *
2241  * Starting at *offset send pages up to the end of the current host
2242  * page. It's valid for the initial offset to point into the middle of
2243  * a host page in which case the remainder of the hostpage is sent.
2244  * Only dirty target pages are sent. Note that the host page size may
2245  * be a huge page for this block.
2246  * The saving stops at the boundary of the used_length of the block
2247  * if the RAMBlock isn't a multiple of the host page size.
2248  *
2249  * Returns the number of pages written or negative on error
2250  *
2251  * @rs: current RAM state
2252  * @ms: current migration state
2253  * @pss: data about the page we want to send
2254  * @last_stage: if we are at the completion stage
2255  */
2256 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2257                               bool last_stage)
2258 {
2259     int tmppages, pages = 0;
2260     size_t pagesize_bits =
2261         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2262
2263     if (!qemu_ram_is_migratable(pss->block)) {
2264         error_report("block %s should not be migrated !", pss->block->idstr);
2265         return 0;
2266     }
2267
2268     do {
2269         /* Check the pages is dirty and if it is send it */
2270         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2271             pss->page++;
2272             continue;
2273         }
2274
2275         tmppages = ram_save_target_page(rs, pss, last_stage);
2276         if (tmppages < 0) {
2277             return tmppages;
2278         }
2279
2280         pages += tmppages;
2281         if (pss->block->unsentmap) {
2282             clear_bit(pss->page, pss->block->unsentmap);
2283         }
2284
2285         pss->page++;
2286     } while ((pss->page & (pagesize_bits - 1)) &&
2287              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2288
2289     /* The offset we leave with is the last one we looked at */
2290     pss->page--;
2291     return pages;
2292 }
2293
2294 /**
2295  * ram_find_and_save_block: finds a dirty page and sends it to f
2296  *
2297  * Called within an RCU critical section.
2298  *
2299  * Returns the number of pages written where zero means no dirty pages
2300  *
2301  * @rs: current RAM state
2302  * @last_stage: if we are at the completion stage
2303  *
2304  * On systems where host-page-size > target-page-size it will send all the
2305  * pages in a host page that are dirty.
2306  */
2307
2308 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2309 {
2310     PageSearchStatus pss;
2311     int pages = 0;
2312     bool again, found;
2313
2314     /* No dirty page as there is zero RAM */
2315     if (!ram_bytes_total()) {
2316         return pages;
2317     }
2318
2319     pss.block = rs->last_seen_block;
2320     pss.page = rs->last_page;
2321     pss.complete_round = false;
2322
2323     if (!pss.block) {
2324         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2325     }
2326
2327     do {
2328         again = true;
2329         found = get_queued_page(rs, &pss);
2330
2331         if (!found) {
2332             /* priority queue empty, so just search for something dirty */
2333             found = find_dirty_block(rs, &pss, &again);
2334         }
2335
2336         if (found) {
2337             pages = ram_save_host_page(rs, &pss, last_stage);
2338         }
2339     } while (!pages && again);
2340
2341     rs->last_seen_block = pss.block;
2342     rs->last_page = pss.page;
2343
2344     return pages;
2345 }
2346
2347 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2348 {
2349     uint64_t pages = size / TARGET_PAGE_SIZE;
2350
2351     if (zero) {
2352         ram_counters.duplicate += pages;
2353     } else {
2354         ram_counters.normal += pages;
2355         ram_counters.transferred += size;
2356         qemu_update_position(f, size);
2357     }
2358 }
2359
2360 uint64_t ram_bytes_total(void)
2361 {
2362     RAMBlock *block;
2363     uint64_t total = 0;
2364
2365     rcu_read_lock();
2366     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2367         total += block->used_length;
2368     }
2369     rcu_read_unlock();
2370     return total;
2371 }
2372
2373 static void xbzrle_load_setup(void)
2374 {
2375     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2376 }
2377
2378 static void xbzrle_load_cleanup(void)
2379 {
2380     g_free(XBZRLE.decoded_buf);
2381     XBZRLE.decoded_buf = NULL;
2382 }
2383
2384 static void ram_state_cleanup(RAMState **rsp)
2385 {
2386     if (*rsp) {
2387         migration_page_queue_free(*rsp);
2388         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2389         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2390         g_free(*rsp);
2391         *rsp = NULL;
2392     }
2393 }
2394
2395 static void xbzrle_cleanup(void)
2396 {
2397     XBZRLE_cache_lock();
2398     if (XBZRLE.cache) {
2399         cache_fini(XBZRLE.cache);
2400         g_free(XBZRLE.encoded_buf);
2401         g_free(XBZRLE.current_buf);
2402         g_free(XBZRLE.zero_target_page);
2403         XBZRLE.cache = NULL;
2404         XBZRLE.encoded_buf = NULL;
2405         XBZRLE.current_buf = NULL;
2406         XBZRLE.zero_target_page = NULL;
2407     }
2408     XBZRLE_cache_unlock();
2409 }
2410
2411 static void ram_save_cleanup(void *opaque)
2412 {
2413     RAMState **rsp = opaque;
2414     RAMBlock *block;
2415
2416     /* caller have hold iothread lock or is in a bh, so there is
2417      * no writing race against this migration_bitmap
2418      */
2419     memory_global_dirty_log_stop();
2420
2421     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2422         g_free(block->bmap);
2423         block->bmap = NULL;
2424         g_free(block->unsentmap);
2425         block->unsentmap = NULL;
2426     }
2427
2428     xbzrle_cleanup();
2429     compress_threads_save_cleanup();
2430     ram_state_cleanup(rsp);
2431 }
2432
2433 static void ram_state_reset(RAMState *rs)
2434 {
2435     rs->last_seen_block = NULL;
2436     rs->last_sent_block = NULL;
2437     rs->last_page = 0;
2438     rs->last_version = ram_list.version;
2439     rs->ram_bulk_stage = true;
2440 }
2441
2442 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2443
/*
 * Dump the first 'pages' bits of the bitmap 'todump' to stderr, up to
 * 128 bits per line, printing '1' for a set bit and '.' for a clear one.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' must be a valid bitmap.  The dirty bitmaps in this file are
 * kept per RAMBlock, so there is no default bitmap to fall back to; a
 * NULL 'todump' is ignored instead of being dereferenced.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2477
2478 /* **** functions for postcopy ***** */
2479
2480 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2481 {
2482     struct RAMBlock *block;
2483
2484     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2485         unsigned long *bitmap = block->bmap;
2486         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2487         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2488
2489         while (run_start < range) {
2490             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2491             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2492                               (run_end - run_start) << TARGET_PAGE_BITS);
2493             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2494         }
2495     }
2496 }
2497
2498 /**
2499  * postcopy_send_discard_bm_ram: discard a RAMBlock
2500  *
2501  * Returns zero on success
2502  *
2503  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2504  * Note: At this point the 'unsentmap' is the processed bitmap combined
2505  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2506  *
2507  * @ms: current migration state
2508  * @pds: state for postcopy
2509  * @start: RAMBlock starting page
2510  * @length: RAMBlock size
2511  */
2512 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2513                                         PostcopyDiscardState *pds,
2514                                         RAMBlock *block)
2515 {
2516     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2517     unsigned long current;
2518     unsigned long *unsentmap = block->unsentmap;
2519
2520     for (current = 0; current < end; ) {
2521         unsigned long one = find_next_bit(unsentmap, end, current);
2522
2523         if (one <= end) {
2524             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2525             unsigned long discard_length;
2526
2527             if (zero >= end) {
2528                 discard_length = end - one;
2529             } else {
2530                 discard_length = zero - one;
2531             }
2532             if (discard_length) {
2533                 postcopy_discard_send_range(ms, pds, one, discard_length);
2534             }
2535             current = one + discard_length;
2536         } else {
2537             current = one;
2538         }
2539     }
2540
2541     return 0;
2542 }
2543
2544 /**
2545  * postcopy_each_ram_send_discard: discard all RAMBlocks
2546  *
2547  * Returns 0 for success or negative for error
2548  *
2549  * Utility for the outgoing postcopy code.
2550  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2551  *   passing it bitmap indexes and name.
2552  * (qemu_ram_foreach_block ends up passing unscaled lengths
2553  *  which would mean postcopy code would have to deal with target page)
2554  *
2555  * @ms: current migration state
2556  */
2557 static int postcopy_each_ram_send_discard(MigrationState *ms)
2558 {
2559     struct RAMBlock *block;
2560     int ret;
2561
2562     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2563         PostcopyDiscardState *pds =
2564             postcopy_discard_send_init(ms, block->idstr);
2565
2566         /*
2567          * Postcopy sends chunks of bitmap over the wire, but it
2568          * just needs indexes at this point, avoids it having
2569          * target page specific code.
2570          */
2571         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2572         postcopy_discard_send_finish(ms, pds);
2573         if (ret) {
2574             return ret;
2575         }
2576     }
2577
2578     return 0;
2579 }
2580
2581 /**
2582  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2583  *
2584  * Helper for postcopy_chunk_hostpages; it's called twice to
2585  * canonicalize the two bitmaps, that are similar, but one is
2586  * inverted.
2587  *
2588  * Postcopy requires that all target pages in a hostpage are dirty or
2589  * clean, not a mix.  This function canonicalizes the bitmaps.
2590  *
2591  * @ms: current migration state
2592  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2593  *               otherwise we need to canonicalize partially dirty host pages
2594  * @block: block that contains the page we want to canonicalize
2595  * @pds: state for postcopy
2596  */
2597 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2598                                           RAMBlock *block,
2599                                           PostcopyDiscardState *pds)
2600 {
2601     RAMState *rs = ram_state;
2602     unsigned long *bitmap = block->bmap;
2603     unsigned long *unsentmap = block->unsentmap;
2604     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2605     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2606     unsigned long run_start;
2607
2608     if (block->page_size == TARGET_PAGE_SIZE) {
2609         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2610         return;
2611     }
2612
2613     if (unsent_pass) {
2614         /* Find a sent page */
2615         run_start = find_next_zero_bit(unsentmap, pages, 0);
2616     } else {
2617         /* Find a dirty page */
2618         run_start = find_next_bit(bitmap, pages, 0);
2619     }
2620
2621     while (run_start < pages) {
2622         bool do_fixup = false;
2623         unsigned long fixup_start_addr;
2624         unsigned long host_offset;
2625
2626         /*
2627          * If the start of this run of pages is in the middle of a host
2628          * page, then we need to fixup this host page.
2629          */
2630         host_offset = run_start % host_ratio;
2631         if (host_offset) {
2632             do_fixup = true;
2633             run_start -= host_offset;
2634             fixup_start_addr = run_start;
2635             /* For the next pass */
2636             run_start = run_start + host_ratio;
2637         } else {
2638             /* Find the end of this run */
2639             unsigned long run_end;
2640             if (unsent_pass) {
2641                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2642             } else {
2643                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2644             }
2645             /*
2646              * If the end isn't at the start of a host page, then the
2647              * run doesn't finish at the end of a host page
2648              * and we need to discard.
2649              */
2650             host_offset = run_end % host_ratio;
2651             if (host_offset) {
2652                 do_fixup = true;
2653                 fixup_start_addr = run_end - host_offset;
2654                 /*
2655                  * This host page has gone, the next loop iteration starts
2656                  * from after the fixup
2657                  */
2658                 run_start = fixup_start_addr + host_ratio;
2659             } else {
2660                 /*
2661                  * No discards on this iteration, next loop starts from
2662                  * next sent/dirty page
2663                  */
2664                 run_start = run_end + 1;
2665             }
2666         }
2667
2668         if (do_fixup) {
2669             unsigned long page;
2670
2671             /* Tell the destination to discard this page */
2672             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2673                 /* For the unsent_pass we:
2674                  *     discard partially sent pages
2675                  * For the !unsent_pass (dirty) we:
2676                  *     discard partially dirty pages that were sent
2677                  *     (any partially sent pages were already discarded
2678                  *     by the previous unsent_pass)
2679                  */
2680                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2681                                             host_ratio);
2682             }
2683
2684             /* Clean up the bitmap */
2685             for (page = fixup_start_addr;
2686                  page < fixup_start_addr + host_ratio; page++) {
2687                 /* All pages in this host page are now not sent */
2688                 set_bit(page, unsentmap);
2689
2690                 /*
2691                  * Remark them as dirty, updating the count for any pages
2692                  * that weren't previously dirty.
2693                  */
2694                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2695             }
2696         }
2697
2698         if (unsent_pass) {
2699             /* Find the next sent page for the next iteration */
2700             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2701         } else {
2702             /* Find the next dirty page for the next iteration */
2703             run_start = find_next_bit(bitmap, pages, run_start);
2704         }
2705     }
2706 }
2707
2708 /**
2709  * postcopy_chuck_hostpages: discrad any partially sent host page
2710  *
2711  * Utility for the outgoing postcopy code.
2712  *
2713  * Discard any partially sent host-page size chunks, mark any partially
2714  * dirty host-page size chunks as all dirty.  In this case the host-page
2715  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2716  *
2717  * Returns zero on success
2718  *
2719  * @ms: current migration state
2720  * @block: block we want to work with
2721  */
2722 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2723 {
2724     PostcopyDiscardState *pds =
2725         postcopy_discard_send_init(ms, block->idstr);
2726
2727     /* First pass: Discard all partially sent host pages */
2728     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2729     /*
2730      * Second pass: Ensure that all partially dirty host pages are made
2731      * fully dirty.
2732      */
2733     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2734
2735     postcopy_discard_send_finish(ms, pds);
2736     return 0;
2737 }
2738
2739 /**
2740  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2741  *
2742  * Returns zero on success
2743  *
2744  * Transmit the set of pages to be discarded after precopy to the target
2745  * these are pages that:
2746  *     a) Have been previously transmitted but are now dirty again
2747  *     b) Pages that have never been transmitted, this ensures that
2748  *        any pages on the destination that have been mapped by background
2749  *        tasks get discarded (transparent huge pages is the specific concern)
2750  * Hopefully this is pretty sparse
2751  *
2752  * @ms: current migration state
2753  */
2754 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2755 {
2756     RAMState *rs = ram_state;
2757     RAMBlock *block;
2758     int ret;
2759
2760     rcu_read_lock();
2761
2762     /* This should be our last sync, the src is now paused */
2763     migration_bitmap_sync(rs);
2764
2765     /* Easiest way to make sure we don't resume in the middle of a host-page */
2766     rs->last_seen_block = NULL;
2767     rs->last_sent_block = NULL;
2768     rs->last_page = 0;
2769
2770     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2771         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2772         unsigned long *bitmap = block->bmap;
2773         unsigned long *unsentmap = block->unsentmap;
2774
2775         if (!unsentmap) {
2776             /* We don't have a safe way to resize the sentmap, so
2777              * if the bitmap was resized it will be NULL at this
2778              * point.
2779              */
2780             error_report("migration ram resized during precopy phase");
2781             rcu_read_unlock();
2782             return -EINVAL;
2783         }
2784         /* Deal with TPS != HPS and huge pages */
2785         ret = postcopy_chunk_hostpages(ms, block);
2786         if (ret) {
2787             rcu_read_unlock();
2788             return ret;
2789         }
2790
2791         /*
2792          * Update the unsentmap to be unsentmap = unsentmap | dirty
2793          */
2794         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2795 #ifdef DEBUG_POSTCOPY
2796         ram_debug_dump_bitmap(unsentmap, true, pages);
2797 #endif
2798     }
2799     trace_ram_postcopy_send_discard_bitmap();
2800
2801     ret = postcopy_each_ram_send_discard(ms);
2802     rcu_read_unlock();
2803
2804     return ret;
2805 }
2806
2807 /**
2808  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2809  *
2810  * Returns zero on success
2811  *
2812  * @rbname: name of the RAMBlock of the request. NULL means the
2813  *          same that last one.
2814  * @start: RAMBlock starting page
2815  * @length: RAMBlock size
2816  */
2817 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2818 {
2819     int ret = -1;
2820
2821     trace_ram_discard_range(rbname, start, length);
2822
2823     rcu_read_lock();
2824     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2825
2826     if (!rb) {
2827         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2828         goto err;
2829     }
2830
2831     /*
2832      * On source VM, we don't need to update the received bitmap since
2833      * we don't even have one.
2834      */
2835     if (rb->receivedmap) {
2836         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2837                      length >> qemu_target_page_bits());
2838     }
2839
2840     ret = ram_block_discard_range(rb, start, length);
2841
2842 err:
2843     rcu_read_unlock();
2844
2845     return ret;
2846 }
2847
2848 /*
2849  * For every allocation, we will try not to crash the VM if the
2850  * allocation failed.
2851  */
2852 static int xbzrle_init(void)
2853 {
2854     Error *local_err = NULL;
2855
2856     if (!migrate_use_xbzrle()) {
2857         return 0;
2858     }
2859
2860     XBZRLE_cache_lock();
2861
2862     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2863     if (!XBZRLE.zero_target_page) {
2864         error_report("%s: Error allocating zero page", __func__);
2865         goto err_out;
2866     }
2867
2868     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2869                               TARGET_PAGE_SIZE, &local_err);
2870     if (!XBZRLE.cache) {
2871         error_report_err(local_err);
2872         goto free_zero_page;
2873     }
2874
2875     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2876     if (!XBZRLE.encoded_buf) {
2877         error_report("%s: Error allocating encoded_buf", __func__);
2878         goto free_cache;
2879     }
2880
2881     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2882     if (!XBZRLE.current_buf) {
2883         error_report("%s: Error allocating current_buf", __func__);
2884         goto free_encoded_buf;
2885     }
2886
2887     /* We are all good */
2888     XBZRLE_cache_unlock();
2889     return 0;
2890
2891 free_encoded_buf:
2892     g_free(XBZRLE.encoded_buf);
2893     XBZRLE.encoded_buf = NULL;
2894 free_cache:
2895     cache_fini(XBZRLE.cache);
2896     XBZRLE.cache = NULL;
2897 free_zero_page:
2898     g_free(XBZRLE.zero_target_page);
2899     XBZRLE.zero_target_page = NULL;
2900 err_out:
2901     XBZRLE_cache_unlock();
2902     return -ENOMEM;
2903 }
2904
2905 static int ram_state_init(RAMState **rsp)
2906 {
2907     *rsp = g_try_new0(RAMState, 1);
2908
2909     if (!*rsp) {
2910         error_report("%s: Init ramstate fail", __func__);
2911         return -1;
2912     }
2913
2914     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2915     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2916     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2917
2918     /*
2919      * Count the total number of pages used by ram blocks not including any
2920      * gaps due to alignment or unplugs.
2921      */
2922     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2923
2924     ram_state_reset(*rsp);
2925
2926     return 0;
2927 }
2928
2929 static void ram_list_init_bitmaps(void)
2930 {
2931     RAMBlock *block;
2932     unsigned long pages;
2933
2934     /* Skip setting bitmap if there is no RAM */
2935     if (ram_bytes_total()) {
2936         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2937             pages = block->max_length >> TARGET_PAGE_BITS;
2938             block->bmap = bitmap_new(pages);
2939             bitmap_set(block->bmap, 0, pages);
2940             if (migrate_postcopy_ram()) {
2941                 block->unsentmap = bitmap_new(pages);
2942                 bitmap_set(block->unsentmap, 0, pages);
2943             }
2944         }
2945     }
2946 }
2947
/*
 * Create the per-block migration bitmaps, switch on global dirty
 * logging and perform a first migration_bitmap_sync() on @rs.
 *
 * The iothread lock, the ramlist lock and an RCU read-side critical
 * section are entered in that order around the three calls and left in
 * the reverse order before returning.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    /* Bitmaps must exist before logging starts and before the first sync */
    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2963
2964 static int ram_init_all(RAMState **rsp)
2965 {
2966     if (ram_state_init(rsp)) {
2967         return -1;
2968     }
2969
2970     if (xbzrle_init()) {
2971         ram_state_cleanup(rsp);
2972         return -1;
2973     }
2974
2975     ram_init_bitmaps(*rsp);
2976
2977     return 0;
2978 }
2979
2980 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2981 {
2982     RAMBlock *block;
2983     uint64_t pages = 0;
2984
2985     /*
2986      * Postcopy is not using xbzrle/compression, so no need for that.
2987      * Also, since source are already halted, we don't need to care
2988      * about dirty page logging as well.
2989      */
2990
2991     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2992         pages += bitmap_count_one(block->bmap,
2993                                   block->used_length >> TARGET_PAGE_BITS);
2994     }
2995
2996     /* This may not be aligned with current bitmaps. Recalculate. */
2997     rs->migration_dirty_pages = pages;
2998
2999     rs->last_seen_block = NULL;
3000     rs->last_sent_block = NULL;
3001     rs->last_page = 0;
3002     rs->last_version = ram_list.version;
3003     /*
3004      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3005      * matter what we have sent.
3006      */
3007     rs->ram_bulk_stage = false;
3008
3009     /* Update RAMState cache of output QEMUFile */
3010     rs->f = out;
3011
3012     trace_ram_state_resume_prepare(pages);
3013 }
3014
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3021
3022 /**
3023  * ram_save_setup: Setup RAM for migration
3024  *
3025  * Returns zero to indicate success and negative for error
3026  *
3027  * @f: QEMUFile where to send the data
3028  * @opaque: RAMState pointer
3029  */
3030 static int ram_save_setup(QEMUFile *f, void *opaque)
3031 {
3032     RAMState **rsp = opaque;
3033     RAMBlock *block;
3034
3035     if (compress_threads_save_setup()) {
3036         return -1;
3037     }
3038
3039     /* migration has already setup the bitmap, reuse it. */
3040     if (!migration_in_colo_state()) {
3041         if (ram_init_all(rsp) != 0) {
3042             compress_threads_save_cleanup();
3043             return -1;
3044         }
3045     }
3046     (*rsp)->f = f;
3047
3048     rcu_read_lock();
3049
3050     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3051
3052     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3053         qemu_put_byte(f, strlen(block->idstr));
3054         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3055         qemu_put_be64(f, block->used_length);
3056         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3057             qemu_put_be64(f, block->page_size);
3058         }
3059     }
3060
3061     rcu_read_unlock();
3062
3063     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3064     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3065
3066     multifd_send_sync_main();
3067     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3068     qemu_fflush(f);
3069
3070     return 0;
3071 }
3072
3073 /**
3074  * ram_save_iterate: iterative stage for migration
3075  *
3076  * Returns zero to indicate success and negative for error
3077  *
3078  * @f: QEMUFile where to send the data
3079  * @opaque: RAMState pointer
3080  */
3081 static int ram_save_iterate(QEMUFile *f, void *opaque)
3082 {
3083     RAMState **temp = opaque;
3084     RAMState *rs = *temp;
3085     int ret;
3086     int i;
3087     int64_t t0;
3088     int done = 0;
3089
3090     if (blk_mig_bulk_active()) {
3091         /* Avoid transferring ram during bulk phase of block migration as
3092          * the bulk phase will usually take a long time and transferring
3093          * ram updates during that time is pointless. */
3094         goto out;
3095     }
3096
3097     rcu_read_lock();
3098     if (ram_list.version != rs->last_version) {
3099         ram_state_reset(rs);
3100     }
3101
3102     /* Read version before ram_list.blocks */
3103     smp_rmb();
3104
3105     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3106
3107     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3108     i = 0;
3109     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3110             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3111         int pages;
3112
3113         if (qemu_file_get_error(f)) {
3114             break;
3115         }
3116
3117         pages = ram_find_and_save_block(rs, false);
3118         /* no more pages to sent */
3119         if (pages == 0) {
3120             done = 1;
3121             break;
3122         }
3123         rs->iterations++;
3124
3125         /* we want to check in the 1st loop, just in case it was the 1st time
3126            and we had to sync the dirty bitmap.
3127            qemu_get_clock_ns() is a bit expensive, so we only check each some
3128            iterations
3129         */
3130         if ((i & 63) == 0) {
3131             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3132             if (t1 > MAX_WAIT) {
3133                 trace_ram_save_iterate_big_wait(t1, i);
3134                 break;
3135             }
3136         }
3137         i++;
3138     }
3139     flush_compressed_data(rs);
3140     rcu_read_unlock();
3141
3142     /*
3143      * Must occur before EOS (or any QEMUFile operation)
3144      * because of RDMA protocol.
3145      */
3146     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3147
3148     multifd_send_sync_main();
3149 out:
3150     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3151     qemu_fflush(f);
3152     ram_counters.transferred += 8;
3153
3154     ret = qemu_file_get_error(f);
3155     if (ret < 0) {
3156         return ret;
3157     }
3158
3159     return done;
3160 }
3161
3162 /**
3163  * ram_save_complete: function called to send the remaining amount of ram
3164  *
3165  * Returns zero to indicate success
3166  *
3167  * Called with iothread lock
3168  *
3169  * @f: QEMUFile where to send the data
3170  * @opaque: RAMState pointer
3171  */
3172 static int ram_save_complete(QEMUFile *f, void *opaque)
3173 {
3174     RAMState **temp = opaque;
3175     RAMState *rs = *temp;
3176
3177     rcu_read_lock();
3178
3179     if (!migration_in_postcopy()) {
3180         migration_bitmap_sync(rs);
3181     }
3182
3183     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3184
3185     /* try transferring iterative blocks of memory */
3186
3187     /* flush all remaining blocks regardless of rate limiting */
3188     while (true) {
3189         int pages;
3190
3191         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3192         /* no more blocks to sent */
3193         if (pages == 0) {
3194             break;
3195         }
3196     }
3197
3198     flush_compressed_data(rs);
3199     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3200
3201     rcu_read_unlock();
3202
3203     multifd_send_sync_main();
3204     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3205     qemu_fflush(f);
3206
3207     return 0;
3208 }
3209
3210 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3211                              uint64_t *res_precopy_only,
3212                              uint64_t *res_compatible,
3213                              uint64_t *res_postcopy_only)
3214 {
3215     RAMState **temp = opaque;
3216     RAMState *rs = *temp;
3217     uint64_t remaining_size;
3218
3219     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3220
3221     if (!migration_in_postcopy() &&
3222         remaining_size < max_size) {
3223         qemu_mutex_lock_iothread();
3224         rcu_read_lock();
3225         migration_bitmap_sync(rs);
3226         rcu_read_unlock();
3227         qemu_mutex_unlock_iothread();
3228         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3229     }
3230
3231     if (migrate_postcopy_ram()) {
3232         /* We can do postcopy, and all the data is postcopiable */
3233         *res_compatible += remaining_size;
3234     } else {
3235         *res_precopy_only += remaining_size;
3236     }
3237 }
3238
3239 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3240 {
3241     unsigned int xh_len;
3242     int xh_flags;
3243     uint8_t *loaded_data;
3244
3245     /* extract RLE header */
3246     xh_flags = qemu_get_byte(f);
3247     xh_len = qemu_get_be16(f);
3248
3249     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3250         error_report("Failed to load XBZRLE page - wrong compression!");
3251         return -1;
3252     }
3253
3254     if (xh_len > TARGET_PAGE_SIZE) {
3255         error_report("Failed to load XBZRLE page - len overflow!");
3256         return -1;
3257     }
3258     loaded_data = XBZRLE.decoded_buf;
3259     /* load data and decode */
3260     /* it can change loaded_data to point to an internal buffer */
3261     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3262
3263     /* decode RLE */
3264     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3265                              TARGET_PAGE_SIZE) == -1) {
3266         error_report("Failed to load XBZRLE page - decode error!");
3267         return -1;
3268     }
3269
3270     return 0;
3271 }
3272
3273 /**
3274  * ram_block_from_stream: read a RAMBlock id from the migration stream
3275  *
3276  * Must be called from within a rcu critical section.
3277  *
3278  * Returns a pointer from within the RCU-protected ram_list.
3279  *
3280  * @f: QEMUFile where to read the data from
3281  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3282  */
3283 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3284 {
3285     static RAMBlock *block = NULL;
3286     char id[256];
3287     uint8_t len;
3288
3289     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3290         if (!block) {
3291             error_report("Ack, bad migration stream!");
3292             return NULL;
3293         }
3294         return block;
3295     }
3296
3297     len = qemu_get_byte(f);
3298     qemu_get_buffer(f, (uint8_t *)id, len);
3299     id[len] = 0;
3300
3301     block = qemu_ram_block_by_name(id);
3302     if (!block) {
3303         error_report("Can't find block %s", id);
3304         return NULL;
3305     }
3306
3307     if (!qemu_ram_is_migratable(block)) {
3308         error_report("block %s should not be migrated !", id);
3309         return NULL;
3310     }
3311
3312     return block;
3313 }
3314
3315 static inline void *host_from_ram_block_offset(RAMBlock *block,
3316                                                ram_addr_t offset)
3317 {
3318     if (!offset_in_ramblock(block, offset)) {
3319         return NULL;
3320     }
3321
3322     return block->host + offset;
3323 }
3324
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with a single byte value.  When
 * the value is zero and the range already reads as all zeroes, the
 * memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Zero fill over memory that is already zero: nothing to write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3341
3342 /* return the size after decompression, or negative value on error */
3343 static int
3344 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3345                      const uint8_t *source, size_t source_len)
3346 {
3347     int err;
3348
3349     err = inflateReset(stream);
3350     if (err != Z_OK) {
3351         return -1;
3352     }
3353
3354     stream->avail_in = source_len;
3355     stream->next_in = (uint8_t *)source;
3356     stream->avail_out = dest_len;
3357     stream->next_out = dest;
3358
3359     err = inflate(stream, Z_NO_FLUSH);
3360     if (err != Z_STREAM_END) {
3361         return -1;
3362     }
3363
3364     return stream->total_out;
3365 }
3366
3367 static void *do_data_decompress(void *opaque)
3368 {
3369     DecompressParam *param = opaque;
3370     unsigned long pagesize;
3371     uint8_t *des;
3372     int len, ret;
3373
3374     qemu_mutex_lock(&param->mutex);
3375     while (!param->quit) {
3376         if (param->des) {
3377             des = param->des;
3378             len = param->len;
3379             param->des = 0;
3380             qemu_mutex_unlock(&param->mutex);
3381
3382             pagesize = TARGET_PAGE_SIZE;
3383
3384             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3385                                        param->compbuf, len);
3386             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3387                 error_report("decompress data failed");
3388                 qemu_file_set_error(decomp_file, ret);
3389             }
3390
3391             qemu_mutex_lock(&decomp_done_lock);
3392             param->done = true;
3393             qemu_cond_signal(&decomp_done_cond);
3394             qemu_mutex_unlock(&decomp_done_lock);
3395
3396             qemu_mutex_lock(&param->mutex);
3397         } else {
3398             qemu_cond_wait(&param->cond, &param->mutex);
3399         }
3400     }
3401     qemu_mutex_unlock(&param->mutex);
3402
3403     return NULL;
3404 }
3405
3406 static int wait_for_decompress_done(void)
3407 {
3408     int idx, thread_count;
3409
3410     if (!migrate_use_compression()) {
3411         return 0;
3412     }
3413
3414     thread_count = migrate_decompress_threads();
3415     qemu_mutex_lock(&decomp_done_lock);
3416     for (idx = 0; idx < thread_count; idx++) {
3417         while (!decomp_param[idx].done) {
3418             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3419         }
3420     }
3421     qemu_mutex_unlock(&decomp_done_lock);
3422     return qemu_file_get_error(decomp_file);
3423 }
3424
3425 static void compress_threads_load_cleanup(void)
3426 {
3427     int i, thread_count;
3428
3429     if (!migrate_use_compression()) {
3430         return;
3431     }
3432     thread_count = migrate_decompress_threads();
3433     for (i = 0; i < thread_count; i++) {
3434         /*
3435          * we use it as a indicator which shows if the thread is
3436          * properly init'd or not
3437          */
3438         if (!decomp_param[i].compbuf) {
3439             break;
3440         }
3441
3442         qemu_mutex_lock(&decomp_param[i].mutex);
3443         decomp_param[i].quit = true;
3444         qemu_cond_signal(&decomp_param[i].cond);
3445         qemu_mutex_unlock(&decomp_param[i].mutex);
3446     }
3447     for (i = 0; i < thread_count; i++) {
3448         if (!decomp_param[i].compbuf) {
3449             break;
3450         }
3451
3452         qemu_thread_join(decompress_threads + i);
3453         qemu_mutex_destroy(&decomp_param[i].mutex);
3454         qemu_cond_destroy(&decomp_param[i].cond);
3455         inflateEnd(&decomp_param[i].stream);
3456         g_free(decomp_param[i].compbuf);
3457         decomp_param[i].compbuf = NULL;
3458     }
3459     g_free(decompress_threads);
3460     g_free(decomp_param);
3461     decompress_threads = NULL;
3462     decomp_param = NULL;
3463     decomp_file = NULL;
3464 }
3465
3466 static int compress_threads_load_setup(QEMUFile *f)
3467 {
3468     int i, thread_count;
3469
3470     if (!migrate_use_compression()) {
3471         return 0;
3472     }
3473
3474     thread_count = migrate_decompress_threads();
3475     decompress_threads = g_new0(QemuThread, thread_count);
3476     decomp_param = g_new0(DecompressParam, thread_count);
3477     qemu_mutex_init(&decomp_done_lock);
3478     qemu_cond_init(&decomp_done_cond);
3479     decomp_file = f;
3480     for (i = 0; i < thread_count; i++) {
3481         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3482             goto exit;
3483         }
3484
3485         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3486         qemu_mutex_init(&decomp_param[i].mutex);
3487         qemu_cond_init(&decomp_param[i].cond);
3488         decomp_param[i].done = true;
3489         decomp_param[i].quit = false;
3490         qemu_thread_create(decompress_threads + i, "decompress",
3491                            do_data_decompress, decomp_param + i,
3492                            QEMU_THREAD_JOINABLE);
3493     }
3494     return 0;
3495 exit:
3496     compress_threads_load_cleanup();
3497     return -1;
3498 }
3499
3500 static void decompress_data_with_multi_threads(QEMUFile *f,
3501                                                void *host, int len)
3502 {
3503     int idx, thread_count;
3504
3505     thread_count = migrate_decompress_threads();
3506     qemu_mutex_lock(&decomp_done_lock);
3507     while (true) {
3508         for (idx = 0; idx < thread_count; idx++) {
3509             if (decomp_param[idx].done) {
3510                 decomp_param[idx].done = false;
3511                 qemu_mutex_lock(&decomp_param[idx].mutex);
3512                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3513                 decomp_param[idx].des = host;
3514                 decomp_param[idx].len = len;
3515                 qemu_cond_signal(&decomp_param[idx].cond);
3516                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3517                 break;
3518             }
3519         }
3520         if (idx < thread_count) {
3521             break;
3522         } else {
3523             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3524         }
3525     }
3526     qemu_mutex_unlock(&decomp_done_lock);
3527 }
3528
3529 /**
3530  * ram_load_setup: Setup RAM for migration incoming side
3531  *
3532  * Returns zero to indicate success and negative for error
3533  *
3534  * @f: QEMUFile where to receive the data
3535  * @opaque: RAMState pointer
3536  */
3537 static int ram_load_setup(QEMUFile *f, void *opaque)
3538 {
3539     if (compress_threads_load_setup(f)) {
3540         return -1;
3541     }
3542
3543     xbzrle_load_setup();
3544     ramblock_recv_map_init();
3545     return 0;
3546 }
3547
3548 static int ram_load_cleanup(void *opaque)
3549 {
3550     RAMBlock *rb;
3551
3552     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3553         if (ramblock_is_pmem(rb)) {
3554             pmem_persist(rb->host, rb->used_length);
3555         }
3556     }
3557
3558     xbzrle_load_cleanup();
3559     compress_threads_load_cleanup();
3560
3561     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3562         g_free(rb->receivedmap);
3563         rb->receivedmap = NULL;
3564     }
3565     return 0;
3566 }
3567
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram.  This is a thin wrapper: the similarly named
 * postcopy_ram_incoming_init() in postcopy-ram does the work, and its
 * return value is passed straight back to the caller.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
3583
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Reads records from the stream until RAM_SAVE_FLAG_EOS is seen or an
 * error occurs.  Target pages are collected in a temporary host page
 * and placed into guest memory once the last target page of a host
 * page has arrived.
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to read the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    /* Host address of the previous record; NULL if it carried no page */
    void *last_host = NULL;
    /* Stays true only while every target page of the host page is zero */
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        /* Record header: page address with the flags in the low bits */
        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            /* NULL here covers both a failed lookup and a bad offset */
            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            /* Same offset inside the temporary page as inside the host page */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                /* 1st TP within the HP: start a fresh all-zero run */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        /* Remembered for the sequential check on the next record */
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            /* One fill byte follows; expand it over the target page */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            /* Step back from the last target page to the host page start */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
3734
3735 static bool postcopy_is_advised(void)
3736 {
3737     PostcopyState ps = postcopy_state_get();
3738     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3739 }
3740
3741 static bool postcopy_is_running(void)
3742 {
3743     PostcopyState ps = postcopy_state_get();
3744     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3745 }
3746
3747 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3748 {
3749     int flags = 0, ret = 0, invalid_flags = 0;
3750     static uint64_t seq_iter;
3751     int len = 0;
3752     /*
3753      * If system is running in postcopy mode, page inserts to host memory must
3754      * be atomic
3755      */
3756     bool postcopy_running = postcopy_is_running();
3757     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3758     bool postcopy_advised = postcopy_is_advised();
3759
3760     seq_iter++;
3761
3762     if (version_id != 4) {
3763         ret = -EINVAL;
3764     }
3765
3766     if (!migrate_use_compression()) {
3767         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3768     }
3769     /* This RCU critical section can be very long running.
3770      * When RCU reclaims in the code start to become numerous,
3771      * it will be necessary to reduce the granularity of this
3772      * critical section.
3773      */
3774     rcu_read_lock();
3775
3776     if (postcopy_running) {
3777         ret = ram_load_postcopy(f);
3778     }
3779
3780     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3781         ram_addr_t addr, total_ram_bytes;
3782         void *host = NULL;
3783         uint8_t ch;
3784
3785         addr = qemu_get_be64(f);
3786         flags = addr & ~TARGET_PAGE_MASK;
3787         addr &= TARGET_PAGE_MASK;
3788
3789         if (flags & invalid_flags) {
3790             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3791                 error_report("Received an unexpected compressed page");
3792             }
3793
3794             ret = -EINVAL;
3795             break;
3796         }
3797
3798         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3799                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3800             RAMBlock *block = ram_block_from_stream(f, flags);
3801
3802             host = host_from_ram_block_offset(block, addr);
3803             if (!host) {
3804                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3805                 ret = -EINVAL;
3806                 break;
3807             }
3808             ramblock_recv_bitmap_set(block, host);
3809             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3810         }
3811
3812         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3813         case RAM_SAVE_FLAG_MEM_SIZE:
3814             /* Synchronize RAM block list */
3815             total_ram_bytes = addr;
3816             while (!ret && total_ram_bytes) {
3817                 RAMBlock *block;
3818                 char id[256];
3819                 ram_addr_t length;
3820
3821                 len = qemu_get_byte(f);
3822                 qemu_get_buffer(f, (uint8_t *)id, len);
3823                 id[len] = 0;
3824                 length = qemu_get_be64(f);
3825
3826                 block = qemu_ram_block_by_name(id);
3827                 if (block && !qemu_ram_is_migratable(block)) {
3828                     error_report("block %s should not be migrated !", id);
3829                     ret = -EINVAL;
3830                 } else if (block) {
3831                     if (length != block->used_length) {
3832                         Error *local_err = NULL;
3833
3834                         ret = qemu_ram_resize(block, length,
3835                                               &local_err);
3836                         if (local_err) {
3837                             error_report_err(local_err);
3838                         }
3839                     }
3840                     /* For postcopy we need to check hugepage sizes match */
3841                     if (postcopy_advised &&
3842                         block->page_size != qemu_host_page_size) {
3843                         uint64_t remote_page_size = qemu_get_be64(f);
3844                         if (remote_page_size != block->page_size) {
3845                             error_report("Mismatched RAM page size %s "
3846                                          "(local) %zd != %" PRId64,
3847                                          id, block->page_size,
3848                                          remote_page_size);
3849                             ret = -EINVAL;
3850                         }
3851                     }
3852                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3853                                           block->idstr);
3854                 } else {
3855                     error_report("Unknown ramblock \"%s\", cannot "
3856                                  "accept migration", id);
3857                     ret = -EINVAL;
3858                 }
3859
3860                 total_ram_bytes -= length;
3861             }
3862             break;
3863
3864         case RAM_SAVE_FLAG_ZERO:
3865             ch = qemu_get_byte(f);
3866             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3867             break;
3868
3869         case RAM_SAVE_FLAG_PAGE:
3870             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3871             break;
3872
3873         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3874             len = qemu_get_be32(f);
3875             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3876                 error_report("Invalid compressed data length: %d", len);
3877                 ret = -EINVAL;
3878                 break;
3879             }
3880             decompress_data_with_multi_threads(f, host, len);
3881             break;
3882
3883         case RAM_SAVE_FLAG_XBZRLE:
3884             if (load_xbzrle(f, addr, host) < 0) {
3885                 error_report("Failed to decompress XBZRLE page at "
3886                              RAM_ADDR_FMT, addr);
3887                 ret = -EINVAL;
3888                 break;
3889             }
3890             break;
3891         case RAM_SAVE_FLAG_EOS:
3892             /* normal exit */
3893             multifd_recv_sync_main();
3894             break;
3895         default:
3896             if (flags & RAM_SAVE_FLAG_HOOK) {
3897                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3898             } else {
3899                 error_report("Unknown combination of migration flags: %#x",
3900                              flags);
3901                 ret = -EINVAL;
3902             }
3903         }
3904         if (!ret) {
3905             ret = qemu_file_get_error(f);
3906         }
3907     }
3908
3909     ret |= wait_for_decompress_done();
3910     rcu_read_unlock();
3911     trace_ram_load_complete(ret, seq_iter);
3912     return ret;
3913 }
3914
3915 static bool ram_has_postcopy(void *opaque)
3916 {
3917     RAMBlock *rb;
3918     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3919         if (ramblock_is_pmem(rb)) {
3920             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3921                          "is not supported now!", rb->idstr, rb->host);
3922             return false;
3923         }
3924     }
3925
3926     return migrate_postcopy_ram();
3927 }
3928
3929 /* Sync all the dirty bitmap with destination VM.  */
3930 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3931 {
3932     RAMBlock *block;
3933     QEMUFile *file = s->to_dst_file;
3934     int ramblock_count = 0;
3935
3936     trace_ram_dirty_bitmap_sync_start();
3937
3938     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3939         qemu_savevm_send_recv_bitmap(file, block->idstr);
3940         trace_ram_dirty_bitmap_request(block->idstr);
3941         ramblock_count++;
3942     }
3943
3944     trace_ram_dirty_bitmap_sync_wait();
3945
3946     /* Wait until all the ramblocks' dirty bitmap synced */
3947     while (ramblock_count--) {
3948         qemu_sem_wait(&s->rp_state.rp_sem);
3949     }
3950
3951     trace_ram_dirty_bitmap_sync_complete();
3952
3953     return 0;
3954 }
3955
3956 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3957 {
3958     qemu_sem_post(&s->rp_state.rp_sem);
3959 }
3960
3961 /*
3962  * Read the received bitmap, revert it as the initial dirty bitmap.
3963  * This is only used when the postcopy migration is paused but wants
3964  * to resume from a middle point.
3965  */
3966 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3967 {
3968     int ret = -EINVAL;
3969     QEMUFile *file = s->rp_state.from_dst_file;
3970     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3971     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3972     uint64_t size, end_mark;
3973
3974     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3975
3976     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3977         error_report("%s: incorrect state %s", __func__,
3978                      MigrationStatus_str(s->state));
3979         return -EINVAL;
3980     }
3981
3982     /*
3983      * Note: see comments in ramblock_recv_bitmap_send() on why we
3984      * need the endianess convertion, and the paddings.
3985      */
3986     local_size = ROUND_UP(local_size, 8);
3987
3988     /* Add paddings */
3989     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3990
3991     size = qemu_get_be64(file);
3992
3993     /* The size of the bitmap should match with our ramblock */
3994     if (size != local_size) {
3995         error_report("%s: ramblock '%s' bitmap size mismatch "
3996                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3997                      block->idstr, size, local_size);
3998         ret = -EINVAL;
3999         goto out;
4000     }
4001
4002     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4003     end_mark = qemu_get_be64(file);
4004
4005     ret = qemu_file_get_error(file);
4006     if (ret || size != local_size) {
4007         error_report("%s: read bitmap failed for ramblock '%s': %d"
4008                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4009                      __func__, block->idstr, ret, local_size, size);
4010         ret = -EIO;
4011         goto out;
4012     }
4013
4014     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4015         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4016                      __func__, block->idstr, end_mark);
4017         ret = -EINVAL;
4018         goto out;
4019     }
4020
4021     /*
4022      * Endianess convertion. We are during postcopy (though paused).
4023      * The dirty bitmap won't change. We can directly modify it.
4024      */
4025     bitmap_from_le(block->bmap, le_bitmap, nbits);
4026
4027     /*
4028      * What we received is "received bitmap". Revert it as the initial
4029      * dirty bitmap for this ramblock.
4030      */
4031     bitmap_complement(block->bmap, block->bmap, nbits);
4032
4033     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4034
4035     /*
4036      * We succeeded to sync bitmap for current ramblock. If this is
4037      * the last one to sync, we need to notify the main send thread.
4038      */
4039     ram_dirty_bitmap_reload_notify(s);
4040
4041     ret = 0;
4042 out:
4043     g_free(le_bitmap);
4044     return ret;
4045 }
4046
4047 static int ram_resume_prepare(MigrationState *s, void *opaque)
4048 {
4049     RAMState *rs = *(RAMState **)opaque;
4050     int ret;
4051
4052     ret = ram_dirty_bitmap_sync_all(s, rs);
4053     if (ret) {
4054         return ret;
4055     }
4056
4057     ram_state_resume_prepare(rs, s->to_dst_file);
4058
4059     return 0;
4060 }
4061
4062 static SaveVMHandlers savevm_ram_handlers = {
4063     .save_setup = ram_save_setup,
4064     .save_live_iterate = ram_save_iterate,
4065     .save_live_complete_postcopy = ram_save_complete,
4066     .save_live_complete_precopy = ram_save_complete,
4067     .has_postcopy = ram_has_postcopy,
4068     .save_live_pending = ram_save_pending,
4069     .load_state = ram_load,
4070     .save_cleanup = ram_save_cleanup,
4071     .load_setup = ram_load_setup,
4072     .load_cleanup = ram_load_cleanup,
4073     .resume_prepare = ram_resume_prepare,
4074 };
4075
4076 void ram_mig_init(void)
4077 {
4078     qemu_mutex_init(&XBZRLE.lock);
4079     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4080 }