migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
  85 XBZRLECacheStats xbzrle_counters;
  86
/*
 * File-local XBZRLE state: the page cache, the mutex guarding it and
 * the scratch buffers used by the XBZRLE encoder/decoder.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE; protected by @lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock() / XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
/*
 * Iterate over RAM blocks with INTERNAL_RAMBLOCK_FOREACH, running the
 * loop body only for blocks where qemu_ram_is_migratable() is true.
 * The "if (...) {} else" form makes the statement that follows the
 * macro the else-branch, so it can be used like a normal loop header.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * Remove RAMBLOCK_FOREACH for the rest of this file.
 * NOTE(review): presumably so that code below must use the migratable
 * variant above -- confirm against the header that defines it.
 */
#undef RAMBLOCK_FOREACH
 168
 169 static void ramblock_recv_map_init(void)
 170 {
 171     RAMBlock *rb;
 172
 173     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 174         assert(!rb->receivedmap);
 175         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 176     }
 177 }
 178
 179 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 180 {
 181     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 182                     rb->receivedmap);
 183 }
 184
 185 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 186 {
 187     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 188 }
 189
 190 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 191 {
 192     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 193 }
 194
 195 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 196                                     size_t nr)
 197 {
 198     bitmap_set_atomic(rb->receivedmap,
 199                       ramblock_recv_bitmap_offset(host_addr, rb),
 200                       nr);
 201 }
 202
 203 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 204
 205 /*
 206  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 207  *
 208  * Returns >0 if success with sent bytes, or <0 if error.
 209  */
 210 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 211                                   const char *block_name)
 212 {
 213     RAMBlock *block = qemu_ram_block_by_name(block_name);
 214     unsigned long *le_bitmap, nbits;
 215     uint64_t size;
 216
 217     if (!block) {
 218         error_report("%s: invalid block name: %s", __func__, block_name);
 219         return -1;
 220     }
 221
 222     nbits = block->used_length >> TARGET_PAGE_BITS;
 223
 224     /*
 225      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 226      * machines we may need 4 more bytes for padding (see below
 227      * comment). So extend it a bit before hand.
 228      */
 229     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 230
 231     /*
 232      * Always use little endian when sending the bitmap. This is
 233      * required that when source and destination VMs are not using the
 234      * same endianess. (Note: big endian won't work.)
 235      */
 236     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 237
 238     /* Size of the bitmap, in bytes */
 239     size = DIV_ROUND_UP(nbits, 8);
 240
 241     /*
 242      * size is always aligned to 8 bytes for 64bit machines, but it
 243      * may not be true for 32bit machines. We need this padding to
 244      * make sure the migration can survive even between 32bit and
 245      * 64bit machines.
 246      */
 247     size = ROUND_UP(size, 8);
 248
 249     qemu_put_be64(file, size);
 250     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 251     /*
 252      * Mark as an end, in case the middle part is screwed up due to
 253      * some "misterious" reason.
 254      */
 255     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 256     qemu_fflush(file);
 257
 258     g_free(le_bitmap);
 259
 260     if (qemu_file_get_error(file)) {
 261         return qemu_file_get_error(file);
 262     }
 263
 264     return size + sizeof(size);
 265 }
 266
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked into RAMState.src_page_requests
 * through @next_req.
 */
struct RAMSrcPageRequest {
    /* RAM block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably start offset within @rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably length of the request in bytes -- confirm */
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 278
 279 /* State of RAM for migration */
 280 struct RAMState {
 281     /* QEMUFile used for this migration */
 282     QEMUFile *f;
 283     /* Last block that we have visited searching for dirty pages */
 284     RAMBlock *last_seen_block;
 285     /* Last block from where we have sent data */
 286     RAMBlock *last_sent_block;
 287     /* Last dirty target page we have sent */
 288     ram_addr_t last_page;
 289     /* last ram version we have seen */
 290     uint32_t last_version;
 291     /* We are in the first round */
 292     bool ram_bulk_stage;
 293     /* How many times we have dirty too many pages */
 294     int dirty_rate_high_cnt;
 295     /* these variables are used for bitmap sync */
 296     /* last time we did a full bitmap_sync */
 297     int64_t time_last_bitmap_sync;
 298     /* bytes transferred at start_time */
 299     uint64_t bytes_xfer_prev;
 300     /* number of dirty pages since start_time */
 301     uint64_t num_dirty_pages_period;
 302     /* xbzrle misses since the beginning of the period */
 303     uint64_t xbzrle_cache_miss_prev;
 304
 305     /* compression statistics since the beginning of the period */
 306     /* amount of count that no free thread to compress data */
 307     uint64_t compress_thread_busy_prev;
 308     /* amount bytes after compression */
 309     uint64_t compressed_size_prev;
 310     /* amount of compressed pages */
 311     uint64_t compress_pages_prev;
 312
 313     /* total handled target pages at the beginning of period */
 314     uint64_t target_page_count_prev;
 315     /* total handled target pages since start */
 316     uint64_t target_page_count;
 317     /* number of dirty bits in the bitmap */
 318     uint64_t migration_dirty_pages;
 319     /* protects modification of the bitmap */
 320     QemuMutex bitmap_mutex;
 321     /* The RAMBlock used in the last src_page_requests */
 322     RAMBlock *last_req_rb;
 323     /* Queue of outstanding page requests from the destination */
 324     QemuMutex src_page_req_mutex;
 325     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 326 };
 327 typedef struct RAMState RAMState;
 328
 329 static RAMState *ram_state;
 330
 331 uint64_t ram_bytes_remaining(void)
 332 {
 333     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 334                        0;
 335 }
 336
 337 MigrationStats ram_counters;
 338
 339 /* used by the search for pages to send */
 340 struct PageSearchStatus {
 341     /* Current block being searched */
 342     RAMBlock    *block;
 343     /* Current page to search from */
 344     unsigned long page;
 345     /* Set once we wrap around */
 346     bool         complete_round;
 347 };
 348 typedef struct PageSearchStatus PageSearchStatus;
 349
 350 CompressionStats compression_counters;
 351
 352 struct CompressParam {
 353     bool done;
 354     bool quit;
 355     bool zero_page;
 356     QEMUFile *file;
 357     QemuMutex mutex;
 358     QemuCond cond;
 359     RAMBlock *block;
 360     ram_addr_t offset;
 361
 362     /* internally used fields */
 363     z_stream stream;
 364     uint8_t *originbuf;
 365 };
 366 typedef struct CompressParam CompressParam;
 367
 368 struct DecompressParam {
 369     bool done;
 370     bool quit;
 371     QemuMutex mutex;
 372     QemuCond cond;
 373     void *des;
 374     uint8_t *compbuf;
 375     int len;
 376     z_stream stream;
 377 };
 378 typedef struct DecompressParam DecompressParam;
 379
 380 static CompressParam *comp_param;
 381 static QemuThread *compress_threads;
 382 /* comp_done_cond is used to wake up the migration thread when
 383  * one of the compression threads has finished the compression.
 384  * comp_done_lock is used to co-work with comp_done_cond.
 385  */
 386 static QemuMutex comp_done_lock;
 387 static QemuCond comp_done_cond;
 388 /* The empty QEMUFileOps will be used by file in CompressParam */
 389 static const QEMUFileOps empty_ops = { };
 390
 391 static QEMUFile *decomp_file;
 392 static DecompressParam *decomp_param;
 393 static QemuThread *decompress_threads;
 394 static QemuMutex decomp_done_lock;
 395 static QemuCond decomp_done_cond;
 396
 397 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 398                                  ram_addr_t offset, uint8_t *source_buf);
 399
 400 static void *do_data_compress(void *opaque)
 401 {
 402     CompressParam *param = opaque;
 403     RAMBlock *block;
 404     ram_addr_t offset;
 405     bool zero_page;
 406
 407     qemu_mutex_lock(&param->mutex);
 408     while (!param->quit) {
 409         if (param->block) {
 410             block = param->block;
 411             offset = param->offset;
 412             param->block = NULL;
 413             qemu_mutex_unlock(&param->mutex);
 414
 415             zero_page = do_compress_ram_page(param->file, &param->stream,
 416                                              block, offset, param->originbuf);
 417
 418             qemu_mutex_lock(&comp_done_lock);
 419             param->done = true;
 420             param->zero_page = zero_page;
 421             qemu_cond_signal(&comp_done_cond);
 422             qemu_mutex_unlock(&comp_done_lock);
 423
 424             qemu_mutex_lock(&param->mutex);
 425         } else {
 426             qemu_cond_wait(&param->cond, &param->mutex);
 427         }
 428     }
 429     qemu_mutex_unlock(&param->mutex);
 430
 431     return NULL;
 432 }
 433
 434 static void compress_threads_save_cleanup(void)
 435 {
 436     int i, thread_count;
 437
 438     if (!migrate_use_compression() || !comp_param) {
 439         return;
 440     }
 441
 442     thread_count = migrate_compress_threads();
 443     for (i = 0; i < thread_count; i++) {
 444         /*
 445          * we use it as a indicator which shows if the thread is
 446          * properly init'd or not
 447          */
 448         if (!comp_param[i].file) {
 449             break;
 450         }
 451
 452         qemu_mutex_lock(&comp_param[i].mutex);
 453         comp_param[i].quit = true;
 454         qemu_cond_signal(&comp_param[i].cond);
 455         qemu_mutex_unlock(&comp_param[i].mutex);
 456
 457         qemu_thread_join(compress_threads + i);
 458         qemu_mutex_destroy(&comp_param[i].mutex);
 459         qemu_cond_destroy(&comp_param[i].cond);
 460         deflateEnd(&comp_param[i].stream);
 461         g_free(comp_param[i].originbuf);
 462         qemu_fclose(comp_param[i].file);
 463         comp_param[i].file = NULL;
 464     }
 465     qemu_mutex_destroy(&comp_done_lock);
 466     qemu_cond_destroy(&comp_done_cond);
 467     g_free(compress_threads);
 468     g_free(comp_param);
 469     compress_threads = NULL;
 470     comp_param = NULL;
 471 }
 472
 473 static int compress_threads_save_setup(void)
 474 {
 475     int i, thread_count;
 476
 477     if (!migrate_use_compression()) {
 478         return 0;
 479     }
 480     thread_count = migrate_compress_threads();
 481     compress_threads = g_new0(QemuThread, thread_count);
 482     comp_param = g_new0(CompressParam, thread_count);
 483     qemu_cond_init(&comp_done_cond);
 484     qemu_mutex_init(&comp_done_lock);
 485     for (i = 0; i < thread_count; i++) {
 486         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 487         if (!comp_param[i].originbuf) {
 488             goto exit;
 489         }
 490
 491         if (deflateInit(&comp_param[i].stream,
 492                         migrate_compress_level()) != Z_OK) {
 493             g_free(comp_param[i].originbuf);
 494             goto exit;
 495         }
 496
 497         /* comp_param[i].file is just used as a dummy buffer to save data,
 498          * set its ops to empty.
 499          */
 500         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 501         comp_param[i].done = true;
 502         comp_param[i].quit = false;
 503         qemu_mutex_init(&comp_param[i].mutex);
 504         qemu_cond_init(&comp_param[i].cond);
 505         qemu_thread_create(compress_threads + i, "compress",
 506                            do_data_compress, comp_param + i,
 507                            QEMU_THREAD_JOINABLE);
 508     }
 509     return 0;
 510
 511 exit:
 512     compress_threads_save_cleanup();
 513     return -1;
 514 }
 515
/* Multiple fd's */

/* Magic and version carried by the initial message and by every packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag set by multifd_send_sync_main() on its sync packets */
#define MULTIFD_FLAG_SYNC (1 << 0)

/*
 * First message written on a multifd channel (see
 * multifd_send_initial_packet()).  @magic and @version are converted
 * to big endian before being written.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 529
/*
 * Header of a multifd packet.  multifd_send_fill_packet() stores all
 * integer fields in big endian; multifd_recv_unfill_packet() converts
 * and validates them.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    /* MULTIFD_FLAG_* */
    uint32_t flags;
    /* filled with migrate_multifd_page_count() by the sender */
    uint32_t size;
    /* number of valid entries in @offset */
    uint32_t used;
    uint64_t packet_num;
    /* idstr of the RAM block the offsets refer to */
    char ramblock[256];
    /* one offset per page in the packet */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 540
/*
 * A batch of pages, all from the same RAM block, queued for one packet.
 * @offset and @iov are arrays with @allocated entries each (see
 * multifd_pages_init()).
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAM block the queued pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 554
/* Per-channel state of a multifd send channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 593
/* Per-channel state of a multifd receive channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* NOTE(review): wording copied from the send side; presumably
     * packets/pages received through this channel -- confirm */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 626
 627 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 628 {
 629     MultiFDInit_t msg;
 630     int ret;
 631
 632     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 633     msg.version = cpu_to_be32(MULTIFD_VERSION);
 634     msg.id = p->id;
 635     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 636
 637     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 638     if (ret != 0) {
 639         return -1;
 640     }
 641     return 0;
 642 }
 643
 644 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 645 {
 646     MultiFDInit_t msg;
 647     int ret;
 648
 649     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 650     if (ret != 0) {
 651         return -1;
 652     }
 653
 654     msg.magic = be32_to_cpu(msg.magic);
 655     msg.version = be32_to_cpu(msg.version);
 656
 657     if (msg.magic != MULTIFD_MAGIC) {
 658         error_setg(errp, "multifd: received packet magic %x "
 659                    "expected %x", msg.magic, MULTIFD_MAGIC);
 660         return -1;
 661     }
 662
 663     if (msg.version != MULTIFD_VERSION) {
 664         error_setg(errp, "multifd: received packet version %d "
 665                    "expected %d", msg.version, MULTIFD_VERSION);
 666         return -1;
 667     }
 668
 669     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 670         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 671         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 672
 673         error_setg(errp, "multifd: received uuid '%s' and expected "
 674                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 675         g_free(uuid);
 676         g_free(msg_uuid);
 677         return -1;
 678     }
 679
 680     if (msg.id > migrate_multifd_channels()) {
 681         error_setg(errp, "multifd: received channel version %d "
 682                    "expected %d", msg.version, MULTIFD_VERSION);
 683         return -1;
 684     }
 685
 686     return msg.id;
 687 }
 688
 689 static MultiFDPages_t *multifd_pages_init(size_t size)
 690 {
 691     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 692
 693     pages->allocated = size;
 694     pages->iov = g_new0(struct iovec, size);
 695     pages->offset = g_new0(ram_addr_t, size);
 696
 697     return pages;
 698 }
 699
 700 static void multifd_pages_clear(MultiFDPages_t *pages)
 701 {
 702     pages->used = 0;
 703     pages->allocated = 0;
 704     pages->packet_num = 0;
 705     pages->block = NULL;
 706     g_free(pages->iov);
 707     pages->iov = NULL;
 708     g_free(pages->offset);
 709     pages->offset = NULL;
 710     g_free(pages);
 711 }
 712
 713 static void multifd_send_fill_packet(MultiFDSendParams *p)
 714 {
 715     MultiFDPacket_t *packet = p->packet;
 716     int i;
 717
 718     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 719     packet->version = cpu_to_be32(MULTIFD_VERSION);
 720     packet->flags = cpu_to_be32(p->flags);
 721     packet->size = cpu_to_be32(migrate_multifd_page_count());
 722     packet->used = cpu_to_be32(p->pages->used);
 723     packet->packet_num = cpu_to_be64(p->packet_num);
 724
 725     if (p->pages->block) {
 726         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 727     }
 728
 729     for (i = 0; i < p->pages->used; i++) {
 730         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 731     }
 732 }
 733
 734 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 735 {
 736     MultiFDPacket_t *packet = p->packet;
 737     RAMBlock *block;
 738     int i;
 739
 740     packet->magic = be32_to_cpu(packet->magic);
 741     if (packet->magic != MULTIFD_MAGIC) {
 742         error_setg(errp, "multifd: received packet "
 743                    "magic %x and expected magic %x",
 744                    packet->magic, MULTIFD_MAGIC);
 745         return -1;
 746     }
 747
 748     packet->version = be32_to_cpu(packet->version);
 749     if (packet->version != MULTIFD_VERSION) {
 750         error_setg(errp, "multifd: received packet "
 751                    "version %d and expected version %d",
 752                    packet->version, MULTIFD_VERSION);
 753         return -1;
 754     }
 755
 756     p->flags = be32_to_cpu(packet->flags);
 757
 758     packet->size = be32_to_cpu(packet->size);
 759     if (packet->size > migrate_multifd_page_count()) {
 760         error_setg(errp, "multifd: received packet "
 761                    "with size %d and expected maximum size %d",
 762                    packet->size, migrate_multifd_page_count()) ;
 763         return -1;
 764     }
 765
 766     p->pages->used = be32_to_cpu(packet->used);
 767     if (p->pages->used > packet->size) {
 768         error_setg(errp, "multifd: received packet "
 769                    "with size %d and expected maximum size %d",
 770                    p->pages->used, packet->size) ;
 771         return -1;
 772     }
 773
 774     p->packet_num = be64_to_cpu(packet->packet_num);
 775
 776     if (p->pages->used) {
 777         /* make sure that ramblock is 0 terminated */
 778         packet->ramblock[255] = 0;
 779         block = qemu_ram_block_by_name(packet->ramblock);
 780         if (!block) {
 781             error_setg(errp, "multifd: unknown ram block %s",
 782                        packet->ramblock);
 783             return -1;
 784         }
 785     }
 786
 787     for (i = 0; i < p->pages->used; i++) {
 788         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 789
 790         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 791             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 792                        " (max " RAM_ADDR_FMT ")",
 793                        offset, block->max_length);
 794             return -1;
 795         }
 796         p->pages->iov[i].iov_base = block->host + offset;
 797         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 798     }
 799
 800     return 0;
 801 }
 802
/*
 * Global state of the multifd send side.  Allocated elsewhere; freed
 * and reset to NULL by multifd_save_cleanup().
 */
struct {
    /* per-channel parameters, indexed by channel number */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 816
 817 /*
 818  * How we use multifd_send_state->pages and channel->pages?
 819  *
 820  * We create a pages for each channel, and a main one.  Each time that
 821  * we need to send a batch of pages we interchange the ones between
 822  * multifd_send_state and the channel that is sending it.  There are
 823  * two reasons for that:
 824  *    - to not have to do so many mallocs during migration
 825  *    - to make easier to know what to free at the end of migration
 826  *
 827  * This way we always know who is the owner of each "pages" struct,
 828  * and we don't need any loocking.  It belongs to the migration thread
 829  * or to the channel thread.  Switching is safe because the migration
 830  * thread is using the channel mutex when changing it, and the channel
 831  * have to had finish with its own, otherwise pending_job can't be
 832  * false.
 833  */
 834
 835 static void multifd_send_pages(void)
 836 {
 837     int i;
 838     static int next_channel;
 839     MultiFDSendParams *p = NULL; /* make happy gcc */
 840     MultiFDPages_t *pages = multifd_send_state->pages;
 841     uint64_t transferred;
 842
 843     qemu_sem_wait(&multifd_send_state->channels_ready);
 844     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 845         p = &multifd_send_state->params[i];
 846
 847         qemu_mutex_lock(&p->mutex);
 848         if (!p->pending_job) {
 849             p->pending_job++;
 850             next_channel = (i + 1) % migrate_multifd_channels();
 851             break;
 852         }
 853         qemu_mutex_unlock(&p->mutex);
 854     }
 855     p->pages->used = 0;
 856
 857     p->packet_num = multifd_send_state->packet_num++;
 858     p->pages->block = NULL;
 859     multifd_send_state->pages = p->pages;
 860     p->pages = pages;
 861     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 862     ram_counters.multifd_bytes += transferred;
 863     ram_counters.transferred += transferred;;
 864     qemu_mutex_unlock(&p->mutex);
 865     qemu_sem_post(&p->sem);
 866 }
 867
 868 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 869 {
 870     MultiFDPages_t *pages = multifd_send_state->pages;
 871
 872     if (!pages->block) {
 873         pages->block = block;
 874     }
 875
 876     if (pages->block == block) {
 877         pages->offset[pages->used] = offset;
 878         pages->iov[pages->used].iov_base = block->host + offset;
 879         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 880         pages->used++;
 881
 882         if (pages->used < pages->allocated) {
 883             return;
 884         }
 885     }
 886
 887     multifd_send_pages();
 888
 889     if (pages->block != block) {
 890         multifd_queue_page(block, offset);
 891     }
 892 }
 893
 894 static void multifd_send_terminate_threads(Error *err)
 895 {
 896     int i;
 897
 898     if (err) {
 899         MigrationState *s = migrate_get_current();
 900         migrate_set_error(s, err);
 901         if (s->state == MIGRATION_STATUS_SETUP ||
 902             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 903             s->state == MIGRATION_STATUS_DEVICE ||
 904             s->state == MIGRATION_STATUS_ACTIVE) {
 905             migrate_set_state(&s->state, s->state,
 906                               MIGRATION_STATUS_FAILED);
 907         }
 908     }
 909
 910     for (i = 0; i < migrate_multifd_channels(); i++) {
 911         MultiFDSendParams *p = &multifd_send_state->params[i];
 912
 913         qemu_mutex_lock(&p->mutex);
 914         p->quit = true;
 915         qemu_sem_post(&p->sem);
 916         qemu_mutex_unlock(&p->mutex);
 917     }
 918 }
 919
 920 void multifd_save_cleanup(void)
 921 {
 922     int i;
 923
 924     if (!migrate_use_multifd()) {
 925         return;
 926     }
 927     multifd_send_terminate_threads(NULL);
 928     for (i = 0; i < migrate_multifd_channels(); i++) {
 929         MultiFDSendParams *p = &multifd_send_state->params[i];
 930
 931         if (p->running) {
 932             qemu_thread_join(&p->thread);
 933         }
 934         socket_send_channel_destroy(p->c);
 935         p->c = NULL;
 936         qemu_mutex_destroy(&p->mutex);
 937         qemu_sem_destroy(&p->sem);
 938         qemu_sem_destroy(&p->sem_sync);
 939         g_free(p->name);
 940         p->name = NULL;
 941         multifd_pages_clear(p->pages);
 942         p->pages = NULL;
 943         p->packet_len = 0;
 944         g_free(p->packet);
 945         p->packet = NULL;
 946     }
 947     qemu_sem_destroy(&multifd_send_state->channels_ready);
 948     qemu_sem_destroy(&multifd_send_state->sem_sync);
 949     g_free(multifd_send_state->params);
 950     multifd_send_state->params = NULL;
 951     multifd_pages_clear(multifd_send_state->pages);
 952     multifd_send_state->pages = NULL;
 953     g_free(multifd_send_state);
 954     multifd_send_state = NULL;
 955 }
 956
 957 static void multifd_send_sync_main(void)
 958 {
 959     int i;
 960
 961     if (!migrate_use_multifd()) {
 962         return;
 963     }
 964     if (multifd_send_state->pages->used) {
 965         multifd_send_pages();
 966     }
 967     for (i = 0; i < migrate_multifd_channels(); i++) {
 968         MultiFDSendParams *p = &multifd_send_state->params[i];
 969
 970         trace_multifd_send_sync_main_signal(p->id);
 971
 972         qemu_mutex_lock(&p->mutex);
 973
 974         p->packet_num = multifd_send_state->packet_num++;
 975         p->flags |= MULTIFD_FLAG_SYNC;
 976         p->pending_job++;
 977         qemu_mutex_unlock(&p->mutex);
 978         qemu_sem_post(&p->sem);
 979     }
 980     for (i = 0; i < migrate_multifd_channels(); i++) {
 981         MultiFDSendParams *p = &multifd_send_state->params[i];
 982
 983         trace_multifd_send_sync_main_wait(p->id);
 984         qemu_sem_wait(&multifd_send_state->sem_sync);
 985     }
 986     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 987 }
 988
 989 static void *multifd_send_thread(void *opaque)
 990 {
 991     MultiFDSendParams *p = opaque;
 992     Error *local_err = NULL;
 993     int ret;
 994
 995     trace_multifd_send_thread_start(p->id);
 996     rcu_register_thread();
 997
 998     if (multifd_send_initial_packet(p, &local_err) < 0) {
 999         goto out;
1000     }
1001     /* initial packet */
1002     p->num_packets = 1;
1003
1004     while (true) {
1005         qemu_sem_wait(&p->sem);
1006         qemu_mutex_lock(&p->mutex);
1007
1008         if (p->pending_job) {
1009             uint32_t used = p->pages->used;
1010             uint64_t packet_num = p->packet_num;
1011             uint32_t flags = p->flags;
1012
1013             multifd_send_fill_packet(p);
1014             p->flags = 0;
1015             p->num_packets++;
1016             p->num_pages += used;
1017             p->pages->used = 0;
1018             qemu_mutex_unlock(&p->mutex);
1019
1020             trace_multifd_send(p->id, packet_num, used, flags);
1021
1022             ret = qio_channel_write_all(p->c, (void *)p->packet,
1023                                         p->packet_len, &local_err);
1024             if (ret != 0) {
1025                 break;
1026             }
1027
1028             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1029             if (ret != 0) {
1030                 break;
1031             }
1032
1033             qemu_mutex_lock(&p->mutex);
1034             p->pending_job--;
1035             qemu_mutex_unlock(&p->mutex);
1036
1037             if (flags & MULTIFD_FLAG_SYNC) {
1038                 qemu_sem_post(&multifd_send_state->sem_sync);
1039             }
1040             qemu_sem_post(&multifd_send_state->channels_ready);
1041         } else if (p->quit) {
1042             qemu_mutex_unlock(&p->mutex);
1043             break;
1044         } else {
1045             qemu_mutex_unlock(&p->mutex);
1046             /* sometimes there are spurious wakeups */
1047         }
1048     }
1049
1050 out:
1051     if (local_err) {
1052         multifd_send_terminate_threads(local_err);
1053     }
1054
1055     qemu_mutex_lock(&p->mutex);
1056     p->running = false;
1057     qemu_mutex_unlock(&p->mutex);
1058
1059     rcu_unregister_thread();
1060     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1061
1062     return NULL;
1063 }
1064
1065 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1066 {
1067     MultiFDSendParams *p = opaque;
1068     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1069     Error *local_err = NULL;
1070
1071     if (qio_task_propagate_error(task, &local_err)) {
1072         migrate_set_error(migrate_get_current(), local_err);
1073         multifd_save_cleanup();
1074     } else {
1075         p->c = QIO_CHANNEL(sioc);
1076         qio_channel_set_delay(p->c, false);
1077         p->running = true;
1078         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1079                            QEMU_THREAD_JOINABLE);
1080
1081         atomic_inc(&multifd_send_state->count);
1082     }
1083 }
1084
1085 int multifd_save_setup(void)
1086 {
1087     int thread_count;
1088     uint32_t page_count = migrate_multifd_page_count();
1089     uint8_t i;
1090
1091     if (!migrate_use_multifd()) {
1092         return 0;
1093     }
1094     thread_count = migrate_multifd_channels();
1095     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1096     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1097     atomic_set(&multifd_send_state->count, 0);
1098     multifd_send_state->pages = multifd_pages_init(page_count);
1099     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1100     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1101
1102     for (i = 0; i < thread_count; i++) {
1103         MultiFDSendParams *p = &multifd_send_state->params[i];
1104
1105         qemu_mutex_init(&p->mutex);
1106         qemu_sem_init(&p->sem, 0);
1107         qemu_sem_init(&p->sem_sync, 0);
1108         p->quit = false;
1109         p->pending_job = 0;
1110         p->id = i;
1111         p->pages = multifd_pages_init(page_count);
1112         p->packet_len = sizeof(MultiFDPacket_t)
1113                       + sizeof(ram_addr_t) * page_count;
1114         p->packet = g_malloc0(p->packet_len);
1115         p->name = g_strdup_printf("multifdsend_%d", i);
1116         socket_send_channel_create(multifd_new_send_channel_async, p);
1117     }
1118     return 0;
1119 }
1120
/*
 * Global state of the multifd receive side.  Allocated by
 * multifd_load_setup() and released by multifd_load_cleanup().
 */
struct {
    /* per-channel receive parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1130
1131 static void multifd_recv_terminate_threads(Error *err)
1132 {
1133     int i;
1134
1135     if (err) {
1136         MigrationState *s = migrate_get_current();
1137         migrate_set_error(s, err);
1138         if (s->state == MIGRATION_STATUS_SETUP ||
1139             s->state == MIGRATION_STATUS_ACTIVE) {
1140             migrate_set_state(&s->state, s->state,
1141                               MIGRATION_STATUS_FAILED);
1142         }
1143     }
1144
1145     for (i = 0; i < migrate_multifd_channels(); i++) {
1146         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1147
1148         qemu_mutex_lock(&p->mutex);
1149         /* We could arrive here for two reasons:
1150            - normal quit, i.e. everything went fine, just finished
1151            - error quit: We close the channels so the channel threads
1152              finish the qio_channel_read_all_eof() */
1153         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1154         qemu_mutex_unlock(&p->mutex);
1155     }
1156 }
1157
1158 int multifd_load_cleanup(Error **errp)
1159 {
1160     int i;
1161     int ret = 0;
1162
1163     if (!migrate_use_multifd()) {
1164         return 0;
1165     }
1166     multifd_recv_terminate_threads(NULL);
1167     for (i = 0; i < migrate_multifd_channels(); i++) {
1168         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1169
1170         if (p->running) {
1171             qemu_thread_join(&p->thread);
1172         }
1173         object_unref(OBJECT(p->c));
1174         p->c = NULL;
1175         qemu_mutex_destroy(&p->mutex);
1176         qemu_sem_destroy(&p->sem_sync);
1177         g_free(p->name);
1178         p->name = NULL;
1179         multifd_pages_clear(p->pages);
1180         p->pages = NULL;
1181         p->packet_len = 0;
1182         g_free(p->packet);
1183         p->packet = NULL;
1184     }
1185     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1186     g_free(multifd_recv_state->params);
1187     multifd_recv_state->params = NULL;
1188     g_free(multifd_recv_state);
1189     multifd_recv_state = NULL;
1190
1191     return ret;
1192 }
1193
1194 static void multifd_recv_sync_main(void)
1195 {
1196     int i;
1197
1198     if (!migrate_use_multifd()) {
1199         return;
1200     }
1201     for (i = 0; i < migrate_multifd_channels(); i++) {
1202         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1203
1204         trace_multifd_recv_sync_main_wait(p->id);
1205         qemu_sem_wait(&multifd_recv_state->sem_sync);
1206         qemu_mutex_lock(&p->mutex);
1207         if (multifd_recv_state->packet_num < p->packet_num) {
1208             multifd_recv_state->packet_num = p->packet_num;
1209         }
1210         qemu_mutex_unlock(&p->mutex);
1211     }
1212     for (i = 0; i < migrate_multifd_channels(); i++) {
1213         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1214
1215         trace_multifd_recv_sync_main_signal(p->id);
1216         qemu_sem_post(&p->sem_sync);
1217     }
1218     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1219 }
1220
1221 static void *multifd_recv_thread(void *opaque)
1222 {
1223     MultiFDRecvParams *p = opaque;
1224     Error *local_err = NULL;
1225     int ret;
1226
1227     trace_multifd_recv_thread_start(p->id);
1228     rcu_register_thread();
1229
1230     while (true) {
1231         uint32_t used;
1232         uint32_t flags;
1233
1234         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1235                                        p->packet_len, &local_err);
1236         if (ret == 0) {   /* EOF */
1237             break;
1238         }
1239         if (ret == -1) {   /* Error */
1240             break;
1241         }
1242
1243         qemu_mutex_lock(&p->mutex);
1244         ret = multifd_recv_unfill_packet(p, &local_err);
1245         if (ret) {
1246             qemu_mutex_unlock(&p->mutex);
1247             break;
1248         }
1249
1250         used = p->pages->used;
1251         flags = p->flags;
1252         trace_multifd_recv(p->id, p->packet_num, used, flags);
1253         p->num_packets++;
1254         p->num_pages += used;
1255         qemu_mutex_unlock(&p->mutex);
1256
1257         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1258         if (ret != 0) {
1259             break;
1260         }
1261
1262         if (flags & MULTIFD_FLAG_SYNC) {
1263             qemu_sem_post(&multifd_recv_state->sem_sync);
1264             qemu_sem_wait(&p->sem_sync);
1265         }
1266     }
1267
1268     if (local_err) {
1269         multifd_recv_terminate_threads(local_err);
1270     }
1271     qemu_mutex_lock(&p->mutex);
1272     p->running = false;
1273     qemu_mutex_unlock(&p->mutex);
1274
1275     rcu_unregister_thread();
1276     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1277
1278     return NULL;
1279 }
1280
1281 int multifd_load_setup(void)
1282 {
1283     int thread_count;
1284     uint32_t page_count = migrate_multifd_page_count();
1285     uint8_t i;
1286
1287     if (!migrate_use_multifd()) {
1288         return 0;
1289     }
1290     thread_count = migrate_multifd_channels();
1291     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1292     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1293     atomic_set(&multifd_recv_state->count, 0);
1294     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1295
1296     for (i = 0; i < thread_count; i++) {
1297         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1298
1299         qemu_mutex_init(&p->mutex);
1300         qemu_sem_init(&p->sem_sync, 0);
1301         p->id = i;
1302         p->pages = multifd_pages_init(page_count);
1303         p->packet_len = sizeof(MultiFDPacket_t)
1304                       + sizeof(ram_addr_t) * page_count;
1305         p->packet = g_malloc0(p->packet_len);
1306         p->name = g_strdup_printf("multifdrecv_%d", i);
1307     }
1308     return 0;
1309 }
1310
1311 bool multifd_recv_all_channels_created(void)
1312 {
1313     int thread_count = migrate_multifd_channels();
1314
1315     if (!migrate_use_multifd()) {
1316         return true;
1317     }
1318
1319     return thread_count == atomic_read(&multifd_recv_state->count);
1320 }
1321
1322 /*
1323  * Try to receive all multifd channels to get ready for the migration.
1324  * - Return true and do not set @errp when correctly receving all channels;
1325  * - Return false and do not set @errp when correctly receiving the current one;
1326  * - Return false and set @errp when failing to receive the current channel.
1327  */
1328 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1329 {
1330     MultiFDRecvParams *p;
1331     Error *local_err = NULL;
1332     int id;
1333
1334     id = multifd_recv_initial_packet(ioc, &local_err);
1335     if (id < 0) {
1336         multifd_recv_terminate_threads(local_err);
1337         error_propagate_prepend(errp, local_err,
1338                                 "failed to receive packet"
1339                                 " via multifd channel %d: ",
1340                                 atomic_read(&multifd_recv_state->count));
1341         return false;
1342     }
1343
1344     p = &multifd_recv_state->params[id];
1345     if (p->c != NULL) {
1346         error_setg(&local_err, "multifd: received id '%d' already setup'",
1347                    id);
1348         multifd_recv_terminate_threads(local_err);
1349         error_propagate(errp, local_err);
1350         return false;
1351     }
1352     p->c = ioc;
1353     object_ref(OBJECT(ioc));
1354     /* initial packet */
1355     p->num_packets = 1;
1356
1357     p->running = true;
1358     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1359                        QEMU_THREAD_JOINABLE);
1360     atomic_inc(&multifd_recv_state->count);
1361     return atomic_read(&multifd_recv_state->count) ==
1362            migrate_multifd_channels();
1363 }
1364
1365 /**
1366  * save_page_header: write page header to wire
1367  *
1368  * If this is the 1st block, it also writes the block identification
1369  *
1370  * Returns the number of bytes written
1371  *
1372  * @f: QEMUFile where to send the data
1373  * @block: block that contains the page we want to send
1374  * @offset: offset inside the block for the page
1375  *          in the lower bits, it contains flags
1376  */
1377 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1378                                ram_addr_t offset)
1379 {
1380     size_t size, len;
1381
1382     if (block == rs->last_sent_block) {
1383         offset |= RAM_SAVE_FLAG_CONTINUE;
1384     }
1385     qemu_put_be64(f, offset);
1386     size = 8;
1387
1388     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1389         len = strlen(block->idstr);
1390         qemu_put_byte(f, len);
1391         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1392         size += 1 + len;
1393         rs->last_sent_block = block;
1394     }
1395     return size;
1396 }
1397
1398 /**
1399  * mig_throttle_guest_down: throotle down the guest
1400  *
1401  * Reduce amount of guest cpu execution to hopefully slow down memory
1402  * writes. If guest dirty memory rate is reduced below the rate at
1403  * which we can transfer pages to the destination then we should be
1404  * able to complete migration. Some workloads dirty memory way too
1405  * fast and will not effectively converge, even with auto-converge.
1406  */
1407 static void mig_throttle_guest_down(void)
1408 {
1409     MigrationState *s = migrate_get_current();
1410     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1411     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1412     int pct_max = s->parameters.max_cpu_throttle;
1413
1414     /* We have not started throttling yet. Let's start it. */
1415     if (!cpu_throttle_active()) {
1416         cpu_throttle_set(pct_initial);
1417     } else {
1418         /* Throttling already on, just increase the rate */
1419         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1420                          pct_max));
1421     }
1422 }
1423
1424 /**
1425  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1426  *
1427  * @rs: current RAM state
1428  * @current_addr: address for the zero page
1429  *
1430  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1431  * The important thing is that a stale (not-yet-0'd) page be replaced
1432  * by the new data.
1433  * As a bonus, if the page wasn't in the cache it gets added so that
1434  * when a small write is made into the 0'd page it gets XBZRLE sent.
1435  */
1436 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1437 {
1438     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1439         return;
1440     }
1441
1442     /* We don't care if this fails to allocate a new cache page
1443      * as long as it updated an old one */
1444     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1445                  ram_counters.dirty_sync_count);
1446 }
1447
1448 #define ENCODING_FLAG_XBZRLE 0x1
1449
1450 /**
1451  * save_xbzrle_page: compress and send current page
1452  *
1453  * Returns: 1 means that we wrote the page
1454  *          0 means that page is identical to the one already sent
1455  *          -1 means that xbzrle would be longer than normal
1456  *
1457  * @rs: current RAM state
1458  * @current_data: pointer to the address of the page contents
1459  * @current_addr: addr of the page
1460  * @block: block that contains the page we want to send
1461  * @offset: offset inside the block for the page
1462  * @last_stage: if we are at the completion stage
1463  */
1464 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1465                             ram_addr_t current_addr, RAMBlock *block,
1466                             ram_addr_t offset, bool last_stage)
1467 {
1468     int encoded_len = 0, bytes_xbzrle;
1469     uint8_t *prev_cached_page;
1470
1471     if (!cache_is_cached(XBZRLE.cache, current_addr,
1472                          ram_counters.dirty_sync_count)) {
1473         xbzrle_counters.cache_miss++;
1474         if (!last_stage) {
1475             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1476                              ram_counters.dirty_sync_count) == -1) {
1477                 return -1;
1478             } else {
1479                 /* update *current_data when the page has been
1480                    inserted into cache */
1481                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1482             }
1483         }
1484         return -1;
1485     }
1486
1487     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1488
1489     /* save current buffer into memory */
1490     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1491
1492     /* XBZRLE encoding (if there is no overflow) */
1493     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1494                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1495                                        TARGET_PAGE_SIZE);
1496     if (encoded_len == 0) {
1497         trace_save_xbzrle_page_skipping();
1498         return 0;
1499     } else if (encoded_len == -1) {
1500         trace_save_xbzrle_page_overflow();
1501         xbzrle_counters.overflow++;
1502         /* update data in the cache */
1503         if (!last_stage) {
1504             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1505             *current_data = prev_cached_page;
1506         }
1507         return -1;
1508     }
1509
1510     /* we need to update the data in the cache, in order to get the same data */
1511     if (!last_stage) {
1512         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1513     }
1514
1515     /* Send XBZRLE based compressed page */
1516     bytes_xbzrle = save_page_header(rs, rs->f, block,
1517                                     offset | RAM_SAVE_FLAG_XBZRLE);
1518     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1519     qemu_put_be16(rs->f, encoded_len);
1520     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1521     bytes_xbzrle += encoded_len + 1 + 2;
1522     xbzrle_counters.pages++;
1523     xbzrle_counters.bytes += bytes_xbzrle;
1524     ram_counters.transferred += bytes_xbzrle;
1525
1526     return 1;
1527 }
1528
1529 /**
1530  * migration_bitmap_find_dirty: find the next dirty page from start
1531  *
1532  * Called with rcu_read_lock() to protect migration_bitmap
1533  *
1534  * Returns the byte offset within memory region of the start of a dirty page
1535  *
1536  * @rs: current RAM state
1537  * @rb: RAMBlock where to search for dirty pages
1538  * @start: page where we start the search
1539  */
1540 static inline
1541 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1542                                           unsigned long start)
1543 {
1544     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1545     unsigned long *bitmap = rb->bmap;
1546     unsigned long next;
1547
1548     if (!qemu_ram_is_migratable(rb)) {
1549         return size;
1550     }
1551
1552     if (rs->ram_bulk_stage && start > 0) {
1553         next = start + 1;
1554     } else {
1555         next = find_next_bit(bitmap, size, start);
1556     }
1557
1558     return next;
1559 }
1560
1561 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1562                                                 RAMBlock *rb,
1563                                                 unsigned long page)
1564 {
1565     bool ret;
1566
1567     ret = test_and_clear_bit(page, rb->bmap);
1568
1569     if (ret) {
1570         rs->migration_dirty_pages--;
1571     }
1572     return ret;
1573 }
1574
1575 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1576                                         ram_addr_t start, ram_addr_t length)
1577 {
1578     rs->migration_dirty_pages +=
1579         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1580                                               &rs->num_dirty_pages_period);
1581 }
1582
1583 /**
1584  * ram_pagesize_summary: calculate all the pagesizes of a VM
1585  *
1586  * Returns a summary bitmap of the page sizes of all RAMBlocks
1587  *
1588  * For VMs with just normal pages this is equivalent to the host page
1589  * size. If it's got some huge pages then it's the OR of all the
1590  * different page sizes.
1591  */
1592 uint64_t ram_pagesize_summary(void)
1593 {
1594     RAMBlock *block;
1595     uint64_t summary = 0;
1596
1597     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1598         summary |= block->page_size;
1599     }
1600
1601     return summary;
1602 }
1603
1604 uint64_t ram_get_total_transferred_pages(void)
1605 {
1606     return  ram_counters.normal + ram_counters.duplicate +
1607                 compression_counters.pages + xbzrle_counters.pages;
1608 }
1609
1610 static void migration_update_rates(RAMState *rs, int64_t end_time)
1611 {
1612     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1613     double compressed_size;
1614
1615     /* calculate period counters */
1616     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1617                 / (end_time - rs->time_last_bitmap_sync);
1618
1619     if (!page_count) {
1620         return;
1621     }
1622
1623     if (migrate_use_xbzrle()) {
1624         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1625             rs->xbzrle_cache_miss_prev) / page_count;
1626         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1627     }
1628
1629     if (migrate_use_compression()) {
1630         compression_counters.busy_rate = (double)(compression_counters.busy -
1631             rs->compress_thread_busy_prev) / page_count;
1632         rs->compress_thread_busy_prev = compression_counters.busy;
1633
1634         compressed_size = compression_counters.compressed_size -
1635                           rs->compressed_size_prev;
1636         if (compressed_size) {
1637             double uncompressed_size = (compression_counters.pages -
1638                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1639
1640             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1641             compression_counters.compression_rate =
1642                                         uncompressed_size / compressed_size;
1643
1644             rs->compress_pages_prev = compression_counters.pages;
1645             rs->compressed_size_prev = compression_counters.compressed_size;
1646         }
1647     }
1648 }
1649
1650 static void migration_bitmap_sync(RAMState *rs)
1651 {
1652     RAMBlock *block;
1653     int64_t end_time;
1654     uint64_t bytes_xfer_now;
1655
1656     ram_counters.dirty_sync_count++;
1657
1658     if (!rs->time_last_bitmap_sync) {
1659         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1660     }
1661
1662     trace_migration_bitmap_sync_start();
1663     memory_global_dirty_log_sync();
1664
1665     qemu_mutex_lock(&rs->bitmap_mutex);
1666     rcu_read_lock();
1667     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1668         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1669     }
1670     ram_counters.remaining = ram_bytes_remaining();
1671     rcu_read_unlock();
1672     qemu_mutex_unlock(&rs->bitmap_mutex);
1673
1674     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1675
1676     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1677
1678     /* more than 1 second = 1000 millisecons */
1679     if (end_time > rs->time_last_bitmap_sync + 1000) {
1680         bytes_xfer_now = ram_counters.transferred;
1681
1682         /* During block migration the auto-converge logic incorrectly detects
1683          * that ram migration makes no progress. Avoid this by disabling the
1684          * throttling logic during the bulk phase of block migration. */
1685         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1686             /* The following detection logic can be refined later. For now:
1687                Check to see if the dirtied bytes is 50% more than the approx.
1688                amount of bytes that just got transferred since the last time we
1689                were in this routine. If that happens twice, start or increase
1690                throttling */
1691
1692             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1693                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1694                 (++rs->dirty_rate_high_cnt >= 2)) {
1695                     trace_migration_throttle();
1696                     rs->dirty_rate_high_cnt = 0;
1697                     mig_throttle_guest_down();
1698             }
1699         }
1700
1701         migration_update_rates(rs, end_time);
1702
1703         rs->target_page_count_prev = rs->target_page_count;
1704
1705         /* reset period counters */
1706         rs->time_last_bitmap_sync = end_time;
1707         rs->num_dirty_pages_period = 0;
1708         rs->bytes_xfer_prev = bytes_xfer_now;
1709     }
1710     if (migrate_use_events()) {
1711         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1712     }
1713 }
1714
1715 /**
1716  * save_zero_page_to_file: send the zero page to the file
1717  *
1718  * Returns the size of data written to the file, 0 means the page is not
1719  * a zero page
1720  *
1721  * @rs: current RAM state
1722  * @file: the file where the data is saved
1723  * @block: block that contains the page we want to send
1724  * @offset: offset inside the block for the page
1725  */
1726 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1727                                   RAMBlock *block, ram_addr_t offset)
1728 {
1729     uint8_t *p = block->host + offset;
1730     int len = 0;
1731
1732     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1733         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1734         qemu_put_byte(file, 0);
1735         len += 1;
1736     }
1737     return len;
1738 }
1739
1740 /**
1741  * save_zero_page: send the zero page to the stream
1742  *
1743  * Returns the number of pages written.
1744  *
1745  * @rs: current RAM state
1746  * @block: block that contains the page we want to send
1747  * @offset: offset inside the block for the page
1748  */
1749 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1750 {
1751     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1752
1753     if (len) {
1754         ram_counters.duplicate++;
1755         ram_counters.transferred += len;
1756         return 1;
1757     }
1758     return -1;
1759 }
1760
1761 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1762 {
1763     if (!migrate_release_ram() || !migration_in_postcopy()) {
1764         return;
1765     }
1766
1767     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1768 }
1769
1770 /*
1771  * @pages: the number of pages written by the control path,
1772  *        < 0 - error
1773  *        > 0 - number of pages written
1774  *
1775  * Return true if the pages has been saved, otherwise false is returned.
1776  */
1777 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1778                               int *pages)
1779 {
1780     uint64_t bytes_xmit = 0;
1781     int ret;
1782
1783     *pages = -1;
1784     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1785                                 &bytes_xmit);
1786     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1787         return false;
1788     }
1789
1790     if (bytes_xmit) {
1791         ram_counters.transferred += bytes_xmit;
1792         *pages = 1;
1793     }
1794
1795     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1796         return true;
1797     }
1798
1799     if (bytes_xmit > 0) {
1800         ram_counters.normal++;
1801     } else if (bytes_xmit == 0) {
1802         ram_counters.duplicate++;
1803     }
1804
1805     return true;
1806 }
1807
1808 /*
1809  * directly send the page to the stream
1810  *
1811  * Returns the number of pages written.
1812  *
1813  * @rs: current RAM state
1814  * @block: block that contains the page we want to send
1815  * @offset: offset inside the block for the page
1816  * @buf: the page to be sent
1817  * @async: send to page asyncly
1818  */
1819 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1820                             uint8_t *buf, bool async)
1821 {
1822     ram_counters.transferred += save_page_header(rs, rs->f, block,
1823                                                  offset | RAM_SAVE_FLAG_PAGE);
1824     if (async) {
1825         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1826                               migrate_release_ram() &
1827                               migration_in_postcopy());
1828     } else {
1829         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1830     }
1831     ram_counters.transferred += TARGET_PAGE_SIZE;
1832     ram_counters.normal++;
1833     return 1;
1834 }
1835
1836 /**
1837  * ram_save_page: send the given page to the stream
1838  *
1839  * Returns the number of pages written.
1840  *          < 0 - error
1841  *          >=0 - Number of pages written - this might legally be 0
1842  *                if xbzrle noticed the page was the same.
1843  *
1844  * @rs: current RAM state
1845  * @block: block that contains the page we want to send
1846  * @offset: offset inside the block for the page
1847  * @last_stage: if we are at the completion stage
1848  */
1849 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1850 {
1851     int pages = -1;
1852     uint8_t *p;
1853     bool send_async = true;
1854     RAMBlock *block = pss->block;
1855     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1856     ram_addr_t current_addr = block->offset + offset;
1857
1858     p = block->host + offset;
1859     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1860
1861     XBZRLE_cache_lock();
1862     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1863         migrate_use_xbzrle()) {
1864         pages = save_xbzrle_page(rs, &p, current_addr, block,
1865                                  offset, last_stage);
1866         if (!last_stage) {
1867             /* Can't send this cached data async, since the cache page
1868              * might get updated before it gets to the wire
1869              */
1870             send_async = false;
1871         }
1872     }
1873
1874     /* XBZRLE overflow or normal page */
1875     if (pages == -1) {
1876         pages = save_normal_page(rs, block, offset, p, send_async);
1877     }
1878
1879     XBZRLE_cache_unlock();
1880
1881     return pages;
1882 }
1883
1884 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1885                                  ram_addr_t offset)
1886 {
1887     multifd_queue_page(block, offset);
1888     ram_counters.normal++;
1889
1890     return 1;
1891 }
1892
1893 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1894                                  ram_addr_t offset, uint8_t *source_buf)
1895 {
1896     RAMState *rs = ram_state;
1897     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1898     bool zero_page = false;
1899     int ret;
1900
1901     if (save_zero_page_to_file(rs, f, block, offset)) {
1902         zero_page = true;
1903         goto exit;
1904     }
1905
1906     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1907
1908     /*
1909      * copy it to a internal buffer to avoid it being modified by VM
1910      * so that we can catch up the error during compression and
1911      * decompression
1912      */
1913     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1914     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1915     if (ret < 0) {
1916         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1917         error_report("compressed data failed!");
1918         return false;
1919     }
1920
1921 exit:
1922     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1923     return zero_page;
1924 }
1925
1926 static void
1927 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1928 {
1929     ram_counters.transferred += bytes_xmit;
1930
1931     if (param->zero_page) {
1932         ram_counters.duplicate++;
1933         return;
1934     }
1935
1936     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1937     compression_counters.compressed_size += bytes_xmit - 8;
1938     compression_counters.pages++;
1939 }
1940
1941 static bool save_page_use_compression(RAMState *rs);
1942
1943 static void flush_compressed_data(RAMState *rs)
1944 {
1945     int idx, len, thread_count;
1946
1947     if (!save_page_use_compression(rs)) {
1948         return;
1949     }
1950     thread_count = migrate_compress_threads();
1951
1952     qemu_mutex_lock(&comp_done_lock);
1953     for (idx = 0; idx < thread_count; idx++) {
1954         while (!comp_param[idx].done) {
1955             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1956         }
1957     }
1958     qemu_mutex_unlock(&comp_done_lock);
1959
1960     for (idx = 0; idx < thread_count; idx++) {
1961         qemu_mutex_lock(&comp_param[idx].mutex);
1962         if (!comp_param[idx].quit) {
1963             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1964             /*
1965              * it's safe to fetch zero_page without holding comp_done_lock
1966              * as there is no further request submitted to the thread,
1967              * i.e, the thread should be waiting for a request at this point.
1968              */
1969             update_compress_thread_counts(&comp_param[idx], len);
1970         }
1971         qemu_mutex_unlock(&comp_param[idx].mutex);
1972     }
1973 }
1974
1975 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1976                                        ram_addr_t offset)
1977 {
1978     param->block = block;
1979     param->offset = offset;
1980 }
1981
1982 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1983                                            ram_addr_t offset)
1984 {
1985     int idx, thread_count, bytes_xmit = -1, pages = -1;
1986     bool wait = migrate_compress_wait_thread();
1987
1988     thread_count = migrate_compress_threads();
1989     qemu_mutex_lock(&comp_done_lock);
1990 retry:
1991     for (idx = 0; idx < thread_count; idx++) {
1992         if (comp_param[idx].done) {
1993             comp_param[idx].done = false;
1994             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1995             qemu_mutex_lock(&comp_param[idx].mutex);
1996             set_compress_params(&comp_param[idx], block, offset);
1997             qemu_cond_signal(&comp_param[idx].cond);
1998             qemu_mutex_unlock(&comp_param[idx].mutex);
1999             pages = 1;
2000             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2001             break;
2002         }
2003     }
2004
2005     /*
2006      * wait for the free thread if the user specifies 'compress-wait-thread',
2007      * otherwise we will post the page out in the main thread as normal page.
2008      */
2009     if (pages < 0 && wait) {
2010         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2011         goto retry;
2012     }
2013     qemu_mutex_unlock(&comp_done_lock);
2014
2015     return pages;
2016 }
2017
2018 /**
2019  * find_dirty_block: find the next dirty page and update any state
2020  * associated with the search process.
2021  *
2022  * Returns if a page is found
2023  *
2024  * @rs: current RAM state
2025  * @pss: data about the state of the current dirty page scan
2026  * @again: set to false if the search has scanned the whole of RAM
2027  */
2028 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2029 {
2030     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2031     if (pss->complete_round && pss->block == rs->last_seen_block &&
2032         pss->page >= rs->last_page) {
2033         /*
2034          * We've been once around the RAM and haven't found anything.
2035          * Give up.
2036          */
2037         *again = false;
2038         return false;
2039     }
2040     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2041         /* Didn't find anything in this RAM Block */
2042         pss->page = 0;
2043         pss->block = QLIST_NEXT_RCU(pss->block, next);
2044         if (!pss->block) {
2045             /*
2046              * If memory migration starts over, we will meet a dirtied page
2047              * which may still exists in compression threads's ring, so we
2048              * should flush the compressed data to make sure the new page
2049              * is not overwritten by the old one in the destination.
2050              *
2051              * Also If xbzrle is on, stop using the data compression at this
2052              * point. In theory, xbzrle can do better than compression.
2053              */
2054             flush_compressed_data(rs);
2055
2056             /* Hit the end of the list */
2057             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2058             /* Flag that we've looped */
2059             pss->complete_round = true;
2060             rs->ram_bulk_stage = false;
2061         }
2062         /* Didn't find anything this time, but try again on the new block */
2063         *again = true;
2064         return false;
2065     } else {
2066         /* Can go around again, but... */
2067         *again = true;
2068         /* We've found something so probably don't need to */
2069         return true;
2070     }
2071 }
2072
2073 /**
2074  * unqueue_page: gets a page of the queue
2075  *
2076  * Helper for 'get_queued_page' - gets a page off the queue
2077  *
2078  * Returns the block of the page (or NULL if none available)
2079  *
2080  * @rs: current RAM state
2081  * @offset: used to return the offset within the RAMBlock
2082  */
2083 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2084 {
2085     RAMBlock *block = NULL;
2086
2087     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2088         return NULL;
2089     }
2090
2091     qemu_mutex_lock(&rs->src_page_req_mutex);
2092     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2093         struct RAMSrcPageRequest *entry =
2094                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2095         block = entry->rb;
2096         *offset = entry->offset;
2097
2098         if (entry->len > TARGET_PAGE_SIZE) {
2099             entry->len -= TARGET_PAGE_SIZE;
2100             entry->offset += TARGET_PAGE_SIZE;
2101         } else {
2102             memory_region_unref(block->mr);
2103             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2104             g_free(entry);
2105             migration_consume_urgent_request();
2106         }
2107     }
2108     qemu_mutex_unlock(&rs->src_page_req_mutex);
2109
2110     return block;
2111 }
2112
2113 /**
2114  * get_queued_page: unqueue a page from the postocpy requests
2115  *
2116  * Skips pages that are already sent (!dirty)
2117  *
2118  * Returns if a queued page is found
2119  *
2120  * @rs: current RAM state
2121  * @pss: data about the state of the current dirty page scan
2122  */
2123 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2124 {
2125     RAMBlock  *block;
2126     ram_addr_t offset;
2127     bool dirty;
2128
2129     do {
2130         block = unqueue_page(rs, &offset);
2131         /*
2132          * We're sending this page, and since it's postcopy nothing else
2133          * will dirty it, and we must make sure it doesn't get sent again
2134          * even if this queue request was received after the background
2135          * search already sent it.
2136          */
2137         if (block) {
2138             unsigned long page;
2139
2140             page = offset >> TARGET_PAGE_BITS;
2141             dirty = test_bit(page, block->bmap);
2142             if (!dirty) {
2143                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2144                        page, test_bit(page, block->unsentmap));
2145             } else {
2146                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2147             }
2148         }
2149
2150     } while (block && !dirty);
2151
2152     if (block) {
2153         /*
2154          * As soon as we start servicing pages out of order, then we have
2155          * to kill the bulk stage, since the bulk stage assumes
2156          * in (migration_bitmap_find_and_reset_dirty) that every page is
2157          * dirty, that's no longer true.
2158          */
2159         rs->ram_bulk_stage = false;
2160
2161         /*
2162          * We want the background search to continue from the queued page
2163          * since the guest is likely to want other pages near to the page
2164          * it just requested.
2165          */
2166         pss->block = block;
2167         pss->page = offset >> TARGET_PAGE_BITS;
2168     }
2169
2170     return !!block;
2171 }
2172
2173 /**
2174  * migration_page_queue_free: drop any remaining pages in the ram
2175  * request queue
2176  *
2177  * It should be empty at the end anyway, but in error cases there may
2178  * be some left.  in case that there is any page left, we drop it.
2179  *
2180  */
2181 static void migration_page_queue_free(RAMState *rs)
2182 {
2183     struct RAMSrcPageRequest *mspr, *next_mspr;
2184     /* This queue generally should be empty - but in the case of a failed
2185      * migration might have some droppings in.
2186      */
2187     rcu_read_lock();
2188     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2189         memory_region_unref(mspr->rb->mr);
2190         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2191         g_free(mspr);
2192     }
2193     rcu_read_unlock();
2194 }
2195
2196 /**
2197  * ram_save_queue_pages: queue the page for transmission
2198  *
2199  * A request from postcopy destination for example.
2200  *
2201  * Returns zero on success or negative on error
2202  *
2203  * @rbname: Name of the RAMBLock of the request. NULL means the
2204  *          same that last one.
2205  * @start: starting address from the start of the RAMBlock
2206  * @len: length (in bytes) to send
2207  */
2208 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2209 {
2210     RAMBlock *ramblock;
2211     RAMState *rs = ram_state;
2212
2213     ram_counters.postcopy_requests++;
2214     rcu_read_lock();
2215     if (!rbname) {
2216         /* Reuse last RAMBlock */
2217         ramblock = rs->last_req_rb;
2218
2219         if (!ramblock) {
2220             /*
2221              * Shouldn't happen, we can't reuse the last RAMBlock if
2222              * it's the 1st request.
2223              */
2224             error_report("ram_save_queue_pages no previous block");
2225             goto err;
2226         }
2227     } else {
2228         ramblock = qemu_ram_block_by_name(rbname);
2229
2230         if (!ramblock) {
2231             /* We shouldn't be asked for a non-existent RAMBlock */
2232             error_report("ram_save_queue_pages no block '%s'", rbname);
2233             goto err;
2234         }
2235         rs->last_req_rb = ramblock;
2236     }
2237     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2238     if (start+len > ramblock->used_length) {
2239         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2240                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2241                      __func__, start, len, ramblock->used_length);
2242         goto err;
2243     }
2244
2245     struct RAMSrcPageRequest *new_entry =
2246         g_malloc0(sizeof(struct RAMSrcPageRequest));
2247     new_entry->rb = ramblock;
2248     new_entry->offset = start;
2249     new_entry->len = len;
2250
2251     memory_region_ref(ramblock->mr);
2252     qemu_mutex_lock(&rs->src_page_req_mutex);
2253     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2254     migration_make_urgent_request();
2255     qemu_mutex_unlock(&rs->src_page_req_mutex);
2256     rcu_read_unlock();
2257
2258     return 0;
2259
2260 err:
2261     rcu_read_unlock();
2262     return -1;
2263 }
2264
2265 static bool save_page_use_compression(RAMState *rs)
2266 {
2267     if (!migrate_use_compression()) {
2268         return false;
2269     }
2270
2271     /*
2272      * If xbzrle is on, stop using the data compression after first
2273      * round of migration even if compression is enabled. In theory,
2274      * xbzrle can do better than compression.
2275      */
2276     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2277         return true;
2278     }
2279
2280     return false;
2281 }
2282
2283 /*
2284  * try to compress the page before posting it out, return true if the page
2285  * has been properly handled by compression, otherwise needs other
2286  * paths to handle it
2287  */
2288 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2289 {
2290     if (!save_page_use_compression(rs)) {
2291         return false;
2292     }
2293
2294     /*
2295      * When starting the process of a new block, the first page of
2296      * the block should be sent out before other pages in the same
2297      * block, and all the pages in last block should have been sent
2298      * out, keeping this order is important, because the 'cont' flag
2299      * is used to avoid resending the block name.
2300      *
2301      * We post the fist page as normal page as compression will take
2302      * much CPU resource.
2303      */
2304     if (block != rs->last_sent_block) {
2305         flush_compressed_data(rs);
2306         return false;
2307     }
2308
2309     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2310         return true;
2311     }
2312
2313     compression_counters.busy++;
2314     return false;
2315 }
2316
2317 /**
2318  * ram_save_target_page: save one target page
2319  *
2320  * Returns the number of pages written
2321  *
2322  * @rs: current RAM state
2323  * @pss: data about the page we want to send
2324  * @last_stage: if we are at the completion stage
2325  */
2326 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2327                                 bool last_stage)
2328 {
2329     RAMBlock *block = pss->block;
2330     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2331     int res;
2332
2333     if (control_save_page(rs, block, offset, &res)) {
2334         return res;
2335     }
2336
2337     if (save_compress_page(rs, block, offset)) {
2338         return 1;
2339     }
2340
2341     res = save_zero_page(rs, block, offset);
2342     if (res > 0) {
2343         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2344          * page would be stale
2345          */
2346         if (!save_page_use_compression(rs)) {
2347             XBZRLE_cache_lock();
2348             xbzrle_cache_zero_page(rs, block->offset + offset);
2349             XBZRLE_cache_unlock();
2350         }
2351         ram_release_pages(block->idstr, offset, res);
2352         return res;
2353     }
2354
2355     /*
2356      * do not use multifd for compression as the first page in the new
2357      * block should be posted out before sending the compressed page
2358      */
2359     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2360         return ram_save_multifd_page(rs, block, offset);
2361     }
2362
2363     return ram_save_page(rs, pss, last_stage);
2364 }
2365
2366 /**
2367  * ram_save_host_page: save a whole host page
2368  *
2369  * Starting at *offset send pages up to the end of the current host
2370  * page. It's valid for the initial offset to point into the middle of
2371  * a host page in which case the remainder of the hostpage is sent.
2372  * Only dirty target pages are sent. Note that the host page size may
2373  * be a huge page for this block.
2374  * The saving stops at the boundary of the used_length of the block
2375  * if the RAMBlock isn't a multiple of the host page size.
2376  *
2377  * Returns the number of pages written or negative on error
2378  *
2379  * @rs: current RAM state
2380  * @ms: current migration state
2381  * @pss: data about the page we want to send
2382  * @last_stage: if we are at the completion stage
2383  */
2384 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2385                               bool last_stage)
2386 {
2387     int tmppages, pages = 0;
2388     size_t pagesize_bits =
2389         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2390
2391     if (!qemu_ram_is_migratable(pss->block)) {
2392         error_report("block %s should not be migrated !", pss->block->idstr);
2393         return 0;
2394     }
2395
2396     do {
2397         /* Check the pages is dirty and if it is send it */
2398         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2399             pss->page++;
2400             continue;
2401         }
2402
2403         tmppages = ram_save_target_page(rs, pss, last_stage);
2404         if (tmppages < 0) {
2405             return tmppages;
2406         }
2407
2408         pages += tmppages;
2409         if (pss->block->unsentmap) {
2410             clear_bit(pss->page, pss->block->unsentmap);
2411         }
2412
2413         pss->page++;
2414     } while ((pss->page & (pagesize_bits - 1)) &&
2415              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2416
2417     /* The offset we leave with is the last one we looked at */
2418     pss->page--;
2419     return pages;
2420 }
2421
2422 /**
2423  * ram_find_and_save_block: finds a dirty page and sends it to f
2424  *
2425  * Called within an RCU critical section.
2426  *
2427  * Returns the number of pages written where zero means no dirty pages,
2428  * or negative on error
2429  *
2430  * @rs: current RAM state
2431  * @last_stage: if we are at the completion stage
2432  *
2433  * On systems where host-page-size > target-page-size it will send all the
2434  * pages in a host page that are dirty.
2435  */
2436
2437 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2438 {
2439     PageSearchStatus pss;
2440     int pages = 0;
2441     bool again, found;
2442
2443     /* No dirty page as there is zero RAM */
2444     if (!ram_bytes_total()) {
2445         return pages;
2446     }
2447
2448     pss.block = rs->last_seen_block;
2449     pss.page = rs->last_page;
2450     pss.complete_round = false;
2451
2452     if (!pss.block) {
2453         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2454     }
2455
2456     do {
2457         again = true;
2458         found = get_queued_page(rs, &pss);
2459
2460         if (!found) {
2461             /* priority queue empty, so just search for something dirty */
2462             found = find_dirty_block(rs, &pss, &again);
2463         }
2464
2465         if (found) {
2466             pages = ram_save_host_page(rs, &pss, last_stage);
2467         }
2468     } while (!pages && again);
2469
2470     rs->last_seen_block = pss.block;
2471     rs->last_page = pss.page;
2472
2473     return pages;
2474 }
2475
2476 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2477 {
2478     uint64_t pages = size / TARGET_PAGE_SIZE;
2479
2480     if (zero) {
2481         ram_counters.duplicate += pages;
2482     } else {
2483         ram_counters.normal += pages;
2484         ram_counters.transferred += size;
2485         qemu_update_position(f, size);
2486     }
2487 }
2488
2489 uint64_t ram_bytes_total(void)
2490 {
2491     RAMBlock *block;
2492     uint64_t total = 0;
2493
2494     rcu_read_lock();
2495     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2496         total += block->used_length;
2497     }
2498     rcu_read_unlock();
2499     return total;
2500 }
2501
2502 static void xbzrle_load_setup(void)
2503 {
2504     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2505 }
2506
2507 static void xbzrle_load_cleanup(void)
2508 {
2509     g_free(XBZRLE.decoded_buf);
2510     XBZRLE.decoded_buf = NULL;
2511 }
2512
2513 static void ram_state_cleanup(RAMState **rsp)
2514 {
2515     if (*rsp) {
2516         migration_page_queue_free(*rsp);
2517         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2518         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2519         g_free(*rsp);
2520         *rsp = NULL;
2521     }
2522 }
2523
2524 static void xbzrle_cleanup(void)
2525 {
2526     XBZRLE_cache_lock();
2527     if (XBZRLE.cache) {
2528         cache_fini(XBZRLE.cache);
2529         g_free(XBZRLE.encoded_buf);
2530         g_free(XBZRLE.current_buf);
2531         g_free(XBZRLE.zero_target_page);
2532         XBZRLE.cache = NULL;
2533         XBZRLE.encoded_buf = NULL;
2534         XBZRLE.current_buf = NULL;
2535         XBZRLE.zero_target_page = NULL;
2536     }
2537     XBZRLE_cache_unlock();
2538 }
2539
2540 static void ram_save_cleanup(void *opaque)
2541 {
2542     RAMState **rsp = opaque;
2543     RAMBlock *block;
2544
2545     /* caller have hold iothread lock or is in a bh, so there is
2546      * no writing race against this migration_bitmap
2547      */
2548     memory_global_dirty_log_stop();
2549
2550     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2551         g_free(block->bmap);
2552         block->bmap = NULL;
2553         g_free(block->unsentmap);
2554         block->unsentmap = NULL;
2555     }
2556
2557     xbzrle_cleanup();
2558     compress_threads_save_cleanup();
2559     ram_state_cleanup(rsp);
2560 }
2561
2562 static void ram_state_reset(RAMState *rs)
2563 {
2564     rs->last_seen_block = NULL;
2565     rs->last_sent_block = NULL;
2566     rs->last_page = 0;
2567     rs->last_version = ram_list.version;
2568     rs->ram_bulk_stage = true;
2569 }
2570
2571 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2572
/*
 * Dump a bitmap to stderr, up to 128 bits per line, printing '1' for a
 * set bit and '.' for a clear one, each line prefixed by the index of
 * its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'todump' is passed to test_bit() unconditionally, so it has to be a
 * valid bitmap covering at least 'pages' bits.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    char linebuf[129];
    int64_t linelen = 128;
    int64_t cur;

    for (cur = 0; cur < pages; cur += linelen) {
        bool interesting = false;
        int64_t col;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }

        for (col = 0; col < linelen; col++) {
            bool bit = test_bit(cur + col, todump);

            linebuf[col] = bit ? '1' : '.';
            if (bit != expected) {
                interesting = true;
            }
        }

        if (!interesting) {
            continue;
        }
        linebuf[col] = '\0';
        fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
    }
}
2606
2607 /* **** functions for postcopy ***** */
2608
2609 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2610 {
2611     struct RAMBlock *block;
2612
2613     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2614         unsigned long *bitmap = block->bmap;
2615         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2616         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2617
2618         while (run_start < range) {
2619             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2620             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2621                               (run_end - run_start) << TARGET_PAGE_BITS);
2622             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2623         }
2624     }
2625 }
2626
2627 /**
2628  * postcopy_send_discard_bm_ram: discard a RAMBlock
2629  *
2630  * Returns zero on success
2631  *
2632  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2633  * Note: At this point the 'unsentmap' is the processed bitmap combined
2634  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2635  *
2636  * @ms: current migration state
2637  * @pds: state for postcopy
2638  * @start: RAMBlock starting page
2639  * @length: RAMBlock size
2640  */
2641 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2642                                         PostcopyDiscardState *pds,
2643                                         RAMBlock *block)
2644 {
2645     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2646     unsigned long current;
2647     unsigned long *unsentmap = block->unsentmap;
2648
2649     for (current = 0; current < end; ) {
2650         unsigned long one = find_next_bit(unsentmap, end, current);
2651
2652         if (one <= end) {
2653             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2654             unsigned long discard_length;
2655
2656             if (zero >= end) {
2657                 discard_length = end - one;
2658             } else {
2659                 discard_length = zero - one;
2660             }
2661             if (discard_length) {
2662                 postcopy_discard_send_range(ms, pds, one, discard_length);
2663             }
2664             current = one + discard_length;
2665         } else {
2666             current = one;
2667         }
2668     }
2669
2670     return 0;
2671 }
2672
2673 /**
2674  * postcopy_each_ram_send_discard: discard all RAMBlocks
2675  *
2676  * Returns 0 for success or negative for error
2677  *
2678  * Utility for the outgoing postcopy code.
2679  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2680  *   passing it bitmap indexes and name.
2681  * (qemu_ram_foreach_block ends up passing unscaled lengths
2682  *  which would mean postcopy code would have to deal with target page)
2683  *
2684  * @ms: current migration state
2685  */
2686 static int postcopy_each_ram_send_discard(MigrationState *ms)
2687 {
2688     struct RAMBlock *block;
2689     int ret;
2690
2691     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2692         PostcopyDiscardState *pds =
2693             postcopy_discard_send_init(ms, block->idstr);
2694
2695         /*
2696          * Postcopy sends chunks of bitmap over the wire, but it
2697          * just needs indexes at this point, avoids it having
2698          * target page specific code.
2699          */
2700         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2701         postcopy_discard_send_finish(ms, pds);
2702         if (ret) {
2703             return ret;
2704         }
2705     }
2706
2707     return 0;
2708 }
2709
2710 /**
2711  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2712  *
2713  * Helper for postcopy_chunk_hostpages; it's called twice to
2714  * canonicalize the two bitmaps, that are similar, but one is
2715  * inverted.
2716  *
2717  * Postcopy requires that all target pages in a hostpage are dirty or
2718  * clean, not a mix.  This function canonicalizes the bitmaps.
2719  *
2720  * @ms: current migration state
2721  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2722  *               otherwise we need to canonicalize partially dirty host pages
2723  * @block: block that contains the page we want to canonicalize
2724  * @pds: state for postcopy
2725  */
2726 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2727                                           RAMBlock *block,
2728                                           PostcopyDiscardState *pds)
2729 {
2730     RAMState *rs = ram_state;
2731     unsigned long *bitmap = block->bmap;
2732     unsigned long *unsentmap = block->unsentmap;
2733     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2734     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2735     unsigned long run_start;
2736
2737     if (block->page_size == TARGET_PAGE_SIZE) {
2738         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2739         return;
2740     }
2741
2742     if (unsent_pass) {
2743         /* Find a sent page */
2744         run_start = find_next_zero_bit(unsentmap, pages, 0);
2745     } else {
2746         /* Find a dirty page */
2747         run_start = find_next_bit(bitmap, pages, 0);
2748     }
2749
2750     while (run_start < pages) {
2751         bool do_fixup = false;
2752         unsigned long fixup_start_addr;
2753         unsigned long host_offset;
2754
2755         /*
2756          * If the start of this run of pages is in the middle of a host
2757          * page, then we need to fixup this host page.
2758          */
2759         host_offset = run_start % host_ratio;
2760         if (host_offset) {
2761             do_fixup = true;
2762             run_start -= host_offset;
2763             fixup_start_addr = run_start;
2764             /* For the next pass */
2765             run_start = run_start + host_ratio;
2766         } else {
2767             /* Find the end of this run */
2768             unsigned long run_end;
2769             if (unsent_pass) {
2770                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2771             } else {
2772                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2773             }
2774             /*
2775              * If the end isn't at the start of a host page, then the
2776              * run doesn't finish at the end of a host page
2777              * and we need to discard.
2778              */
2779             host_offset = run_end % host_ratio;
2780             if (host_offset) {
2781                 do_fixup = true;
2782                 fixup_start_addr = run_end - host_offset;
2783                 /*
2784                  * This host page has gone, the next loop iteration starts
2785                  * from after the fixup
2786                  */
2787                 run_start = fixup_start_addr + host_ratio;
2788             } else {
2789                 /*
2790                  * No discards on this iteration, next loop starts from
2791                  * next sent/dirty page
2792                  */
2793                 run_start = run_end + 1;
2794             }
2795         }
2796
2797         if (do_fixup) {
2798             unsigned long page;
2799
2800             /* Tell the destination to discard this page */
2801             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2802                 /* For the unsent_pass we:
2803                  *     discard partially sent pages
2804                  * For the !unsent_pass (dirty) we:
2805                  *     discard partially dirty pages that were sent
2806                  *     (any partially sent pages were already discarded
2807                  *     by the previous unsent_pass)
2808                  */
2809                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2810                                             host_ratio);
2811             }
2812
2813             /* Clean up the bitmap */
2814             for (page = fixup_start_addr;
2815                  page < fixup_start_addr + host_ratio; page++) {
2816                 /* All pages in this host page are now not sent */
2817                 set_bit(page, unsentmap);
2818
2819                 /*
2820                  * Remark them as dirty, updating the count for any pages
2821                  * that weren't previously dirty.
2822                  */
2823                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2824             }
2825         }
2826
2827         if (unsent_pass) {
2828             /* Find the next sent page for the next iteration */
2829             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2830         } else {
2831             /* Find the next dirty page for the next iteration */
2832             run_start = find_next_bit(bitmap, pages, run_start);
2833         }
2834     }
2835 }
2836
2837 /**
2838  * postcopy_chuck_hostpages: discrad any partially sent host page
2839  *
2840  * Utility for the outgoing postcopy code.
2841  *
2842  * Discard any partially sent host-page size chunks, mark any partially
2843  * dirty host-page size chunks as all dirty.  In this case the host-page
2844  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2845  *
2846  * Returns zero on success
2847  *
2848  * @ms: current migration state
2849  * @block: block we want to work with
2850  */
2851 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2852 {
2853     PostcopyDiscardState *pds =
2854         postcopy_discard_send_init(ms, block->idstr);
2855
2856     /* First pass: Discard all partially sent host pages */
2857     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2858     /*
2859      * Second pass: Ensure that all partially dirty host pages are made
2860      * fully dirty.
2861      */
2862     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2863
2864     postcopy_discard_send_finish(ms, pds);
2865     return 0;
2866 }
2867
2868 /**
2869  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2870  *
2871  * Returns zero on success
2872  *
2873  * Transmit the set of pages to be discarded after precopy to the target
2874  * these are pages that:
2875  *     a) Have been previously transmitted but are now dirty again
2876  *     b) Pages that have never been transmitted, this ensures that
2877  *        any pages on the destination that have been mapped by background
2878  *        tasks get discarded (transparent huge pages is the specific concern)
2879  * Hopefully this is pretty sparse
2880  *
2881  * @ms: current migration state
2882  */
2883 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2884 {
2885     RAMState *rs = ram_state;
2886     RAMBlock *block;
2887     int ret;
2888
2889     rcu_read_lock();
2890
2891     /* This should be our last sync, the src is now paused */
2892     migration_bitmap_sync(rs);
2893
2894     /* Easiest way to make sure we don't resume in the middle of a host-page */
2895     rs->last_seen_block = NULL;
2896     rs->last_sent_block = NULL;
2897     rs->last_page = 0;
2898
2899     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2900         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2901         unsigned long *bitmap = block->bmap;
2902         unsigned long *unsentmap = block->unsentmap;
2903
2904         if (!unsentmap) {
2905             /* We don't have a safe way to resize the sentmap, so
2906              * if the bitmap was resized it will be NULL at this
2907              * point.
2908              */
2909             error_report("migration ram resized during precopy phase");
2910             rcu_read_unlock();
2911             return -EINVAL;
2912         }
2913         /* Deal with TPS != HPS and huge pages */
2914         ret = postcopy_chunk_hostpages(ms, block);
2915         if (ret) {
2916             rcu_read_unlock();
2917             return ret;
2918         }
2919
2920         /*
2921          * Update the unsentmap to be unsentmap = unsentmap | dirty
2922          */
2923         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2924 #ifdef DEBUG_POSTCOPY
2925         ram_debug_dump_bitmap(unsentmap, true, pages);
2926 #endif
2927     }
2928     trace_ram_postcopy_send_discard_bitmap();
2929
2930     ret = postcopy_each_ram_send_discard(ms);
2931     rcu_read_unlock();
2932
2933     return ret;
2934 }
2935
2936 /**
2937  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2938  *
2939  * Returns zero on success
2940  *
2941  * @rbname: name of the RAMBlock of the request. NULL means the
2942  *          same that last one.
2943  * @start: RAMBlock starting page
2944  * @length: RAMBlock size
2945  */
2946 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2947 {
2948     int ret = -1;
2949
2950     trace_ram_discard_range(rbname, start, length);
2951
2952     rcu_read_lock();
2953     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2954
2955     if (!rb) {
2956         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2957         goto err;
2958     }
2959
2960     /*
2961      * On source VM, we don't need to update the received bitmap since
2962      * we don't even have one.
2963      */
2964     if (rb->receivedmap) {
2965         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2966                      length >> qemu_target_page_bits());
2967     }
2968
2969     ret = ram_block_discard_range(rb, start, length);
2970
2971 err:
2972     rcu_read_unlock();
2973
2974     return ret;
2975 }
2976
2977 /*
2978  * For every allocation, we will try not to crash the VM if the
2979  * allocation failed.
2980  */
2981 static int xbzrle_init(void)
2982 {
2983     Error *local_err = NULL;
2984
2985     if (!migrate_use_xbzrle()) {
2986         return 0;
2987     }
2988
2989     XBZRLE_cache_lock();
2990
2991     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2992     if (!XBZRLE.zero_target_page) {
2993         error_report("%s: Error allocating zero page", __func__);
2994         goto err_out;
2995     }
2996
2997     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2998                               TARGET_PAGE_SIZE, &local_err);
2999     if (!XBZRLE.cache) {
3000         error_report_err(local_err);
3001         goto free_zero_page;
3002     }
3003
3004     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3005     if (!XBZRLE.encoded_buf) {
3006         error_report("%s: Error allocating encoded_buf", __func__);
3007         goto free_cache;
3008     }
3009
3010     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3011     if (!XBZRLE.current_buf) {
3012         error_report("%s: Error allocating current_buf", __func__);
3013         goto free_encoded_buf;
3014     }
3015
3016     /* We are all good */
3017     XBZRLE_cache_unlock();
3018     return 0;
3019
3020 free_encoded_buf:
3021     g_free(XBZRLE.encoded_buf);
3022     XBZRLE.encoded_buf = NULL;
3023 free_cache:
3024     cache_fini(XBZRLE.cache);
3025     XBZRLE.cache = NULL;
3026 free_zero_page:
3027     g_free(XBZRLE.zero_target_page);
3028     XBZRLE.zero_target_page = NULL;
3029 err_out:
3030     XBZRLE_cache_unlock();
3031     return -ENOMEM;
3032 }
3033
3034 static int ram_state_init(RAMState **rsp)
3035 {
3036     *rsp = g_try_new0(RAMState, 1);
3037
3038     if (!*rsp) {
3039         error_report("%s: Init ramstate fail", __func__);
3040         return -1;
3041     }
3042
3043     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3044     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3045     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3046
3047     /*
3048      * Count the total number of pages used by ram blocks not including any
3049      * gaps due to alignment or unplugs.
3050      */
3051     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3052
3053     ram_state_reset(*rsp);
3054
3055     return 0;
3056 }
3057
3058 static void ram_list_init_bitmaps(void)
3059 {
3060     RAMBlock *block;
3061     unsigned long pages;
3062
3063     /* Skip setting bitmap if there is no RAM */
3064     if (ram_bytes_total()) {
3065         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3066             pages = block->max_length >> TARGET_PAGE_BITS;
3067             block->bmap = bitmap_new(pages);
3068             bitmap_set(block->bmap, 0, pages);
3069             if (migrate_postcopy_ram()) {
3070                 block->unsentmap = bitmap_new(pages);
3071                 bitmap_set(block->unsentmap, 0, pages);
3072             }
3073         }
3074     }
3075 }
3076
3077 static void ram_init_bitmaps(RAMState *rs)
3078 {
3079     /* For memory_global_dirty_log_start below.  */
3080     qemu_mutex_lock_iothread();
3081     qemu_mutex_lock_ramlist();
3082     rcu_read_lock();
3083
3084     ram_list_init_bitmaps();
3085     memory_global_dirty_log_start();
3086     migration_bitmap_sync(rs);
3087
3088     rcu_read_unlock();
3089     qemu_mutex_unlock_ramlist();
3090     qemu_mutex_unlock_iothread();
3091 }
3092
3093 static int ram_init_all(RAMState **rsp)
3094 {
3095     if (ram_state_init(rsp)) {
3096         return -1;
3097     }
3098
3099     if (xbzrle_init()) {
3100         ram_state_cleanup(rsp);
3101         return -1;
3102     }
3103
3104     ram_init_bitmaps(*rsp);
3105
3106     return 0;
3107 }
3108
3109 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3110 {
3111     RAMBlock *block;
3112     uint64_t pages = 0;
3113
3114     /*
3115      * Postcopy is not using xbzrle/compression, so no need for that.
3116      * Also, since source are already halted, we don't need to care
3117      * about dirty page logging as well.
3118      */
3119
3120     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3121         pages += bitmap_count_one(block->bmap,
3122                                   block->used_length >> TARGET_PAGE_BITS);
3123     }
3124
3125     /* This may not be aligned with current bitmaps. Recalculate. */
3126     rs->migration_dirty_pages = pages;
3127
3128     rs->last_seen_block = NULL;
3129     rs->last_sent_block = NULL;
3130     rs->last_page = 0;
3131     rs->last_version = ram_list.version;
3132     /*
3133      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3134      * matter what we have sent.
3135      */
3136     rs->ram_bulk_stage = false;
3137
3138     /* Update RAMState cache of output QEMUFile */
3139     rs->f = out;
3140
3141     trace_ram_state_resume_prepare(pages);
3142 }
3143
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * become numerous, it will be necessary to reduce the granularity of
 * these critical sections.
 */
3150
3151 /**
3152  * ram_save_setup: Setup RAM for migration
3153  *
3154  * Returns zero to indicate success and negative for error
3155  *
3156  * @f: QEMUFile where to send the data
3157  * @opaque: RAMState pointer
3158  */
3159 static int ram_save_setup(QEMUFile *f, void *opaque)
3160 {
3161     RAMState **rsp = opaque;
3162     RAMBlock *block;
3163
3164     if (compress_threads_save_setup()) {
3165         return -1;
3166     }
3167
3168     /* migration has already setup the bitmap, reuse it. */
3169     if (!migration_in_colo_state()) {
3170         if (ram_init_all(rsp) != 0) {
3171             compress_threads_save_cleanup();
3172             return -1;
3173         }
3174     }
3175     (*rsp)->f = f;
3176
3177     rcu_read_lock();
3178
3179     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3180
3181     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3182         qemu_put_byte(f, strlen(block->idstr));
3183         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3184         qemu_put_be64(f, block->used_length);
3185         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3186             qemu_put_be64(f, block->page_size);
3187         }
3188     }
3189
3190     rcu_read_unlock();
3191
3192     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3193     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3194
3195     multifd_send_sync_main();
3196     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3197     qemu_fflush(f);
3198
3199     return 0;
3200 }
3201
3202 /**
3203  * ram_save_iterate: iterative stage for migration
3204  *
3205  * Returns zero to indicate success and negative for error
3206  *
3207  * @f: QEMUFile where to send the data
3208  * @opaque: RAMState pointer
3209  */
3210 static int ram_save_iterate(QEMUFile *f, void *opaque)
3211 {
3212     RAMState **temp = opaque;
3213     RAMState *rs = *temp;
3214     int ret;
3215     int i;
3216     int64_t t0;
3217     int done = 0;
3218
3219     if (blk_mig_bulk_active()) {
3220         /* Avoid transferring ram during bulk phase of block migration as
3221          * the bulk phase will usually take a long time and transferring
3222          * ram updates during that time is pointless. */
3223         goto out;
3224     }
3225
3226     rcu_read_lock();
3227     if (ram_list.version != rs->last_version) {
3228         ram_state_reset(rs);
3229     }
3230
3231     /* Read version before ram_list.blocks */
3232     smp_rmb();
3233
3234     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3235
3236     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3237     i = 0;
3238     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3239             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3240         int pages;
3241
3242         if (qemu_file_get_error(f)) {
3243             break;
3244         }
3245
3246         pages = ram_find_and_save_block(rs, false);
3247         /* no more pages to sent */
3248         if (pages == 0) {
3249             done = 1;
3250             break;
3251         }
3252
3253         if (pages < 0) {
3254             qemu_file_set_error(f, pages);
3255             break;
3256         }
3257
3258         rs->target_page_count += pages;
3259
3260         /* we want to check in the 1st loop, just in case it was the 1st time
3261            and we had to sync the dirty bitmap.
3262            qemu_get_clock_ns() is a bit expensive, so we only check each some
3263            iterations
3264         */
3265         if ((i & 63) == 0) {
3266             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3267             if (t1 > MAX_WAIT) {
3268                 trace_ram_save_iterate_big_wait(t1, i);
3269                 break;
3270             }
3271         }
3272         i++;
3273     }
3274     rcu_read_unlock();
3275
3276     /*
3277      * Must occur before EOS (or any QEMUFile operation)
3278      * because of RDMA protocol.
3279      */
3280     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3281
3282     multifd_send_sync_main();
3283 out:
3284     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3285     qemu_fflush(f);
3286     ram_counters.transferred += 8;
3287
3288     ret = qemu_file_get_error(f);
3289     if (ret < 0) {
3290         return ret;
3291     }
3292
3293     return done;
3294 }
3295
3296 /**
3297  * ram_save_complete: function called to send the remaining amount of ram
3298  *
3299  * Returns zero to indicate success or negative on error
3300  *
3301  * Called with iothread lock
3302  *
3303  * @f: QEMUFile where to send the data
3304  * @opaque: RAMState pointer
3305  */
3306 static int ram_save_complete(QEMUFile *f, void *opaque)
3307 {
3308     RAMState **temp = opaque;
3309     RAMState *rs = *temp;
3310     int ret = 0;
3311
3312     rcu_read_lock();
3313
3314     if (!migration_in_postcopy()) {
3315         migration_bitmap_sync(rs);
3316     }
3317
3318     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3319
3320     /* try transferring iterative blocks of memory */
3321
3322     /* flush all remaining blocks regardless of rate limiting */
3323     while (true) {
3324         int pages;
3325
3326         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3327         /* no more blocks to sent */
3328         if (pages == 0) {
3329             break;
3330         }
3331         if (pages < 0) {
3332             ret = pages;
3333             break;
3334         }
3335     }
3336
3337     flush_compressed_data(rs);
3338     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3339
3340     rcu_read_unlock();
3341
3342     multifd_send_sync_main();
3343     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3344     qemu_fflush(f);
3345
3346     return ret;
3347 }
3348
3349 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3350                              uint64_t *res_precopy_only,
3351                              uint64_t *res_compatible,
3352                              uint64_t *res_postcopy_only)
3353 {
3354     RAMState **temp = opaque;
3355     RAMState *rs = *temp;
3356     uint64_t remaining_size;
3357
3358     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3359
3360     if (!migration_in_postcopy() &&
3361         remaining_size < max_size) {
3362         qemu_mutex_lock_iothread();
3363         rcu_read_lock();
3364         migration_bitmap_sync(rs);
3365         rcu_read_unlock();
3366         qemu_mutex_unlock_iothread();
3367         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3368     }
3369
3370     if (migrate_postcopy_ram()) {
3371         /* We can do postcopy, and all the data is postcopiable */
3372         *res_compatible += remaining_size;
3373     } else {
3374         *res_precopy_only += remaining_size;
3375     }
3376 }
3377
3378 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3379 {
3380     unsigned int xh_len;
3381     int xh_flags;
3382     uint8_t *loaded_data;
3383
3384     /* extract RLE header */
3385     xh_flags = qemu_get_byte(f);
3386     xh_len = qemu_get_be16(f);
3387
3388     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3389         error_report("Failed to load XBZRLE page - wrong compression!");
3390         return -1;
3391     }
3392
3393     if (xh_len > TARGET_PAGE_SIZE) {
3394         error_report("Failed to load XBZRLE page - len overflow!");
3395         return -1;
3396     }
3397     loaded_data = XBZRLE.decoded_buf;
3398     /* load data and decode */
3399     /* it can change loaded_data to point to an internal buffer */
3400     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3401
3402     /* decode RLE */
3403     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3404                              TARGET_PAGE_SIZE) == -1) {
3405         error_report("Failed to load XBZRLE page - decode error!");
3406         return -1;
3407     }
3408
3409     return 0;
3410 }
3411
3412 /**
3413  * ram_block_from_stream: read a RAMBlock id from the migration stream
3414  *
3415  * Must be called from within a rcu critical section.
3416  *
3417  * Returns a pointer from within the RCU-protected ram_list.
3418  *
3419  * @f: QEMUFile where to read the data from
3420  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3421  */
3422 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3423 {
3424     static RAMBlock *block = NULL;
3425     char id[256];
3426     uint8_t len;
3427
3428     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3429         if (!block) {
3430             error_report("Ack, bad migration stream!");
3431             return NULL;
3432         }
3433         return block;
3434     }
3435
3436     len = qemu_get_byte(f);
3437     qemu_get_buffer(f, (uint8_t *)id, len);
3438     id[len] = 0;
3439
3440     block = qemu_ram_block_by_name(id);
3441     if (!block) {
3442         error_report("Can't find block %s", id);
3443         return NULL;
3444     }
3445
3446     if (!qemu_ram_is_migratable(block)) {
3447         error_report("block %s should not be migrated !", id);
3448         return NULL;
3449     }
3450
3451     return block;
3452 }
3453
3454 static inline void *host_from_ram_block_offset(RAMBlock *block,
3455                                                ram_addr_t offset)
3456 {
3457     if (!offset_in_ramblock(block, offset)) {
3458         return NULL;
3459     }
3460
3461     return block->host + offset;
3462 }
3463
3464 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3465                                                  ram_addr_t offset)
3466 {
3467     if (!offset_in_ramblock(block, offset)) {
3468         return NULL;
3469     }
3470     if (!block->colo_cache) {
3471         error_report("%s: colo_cache is NULL in block :%s",
3472                      __func__, block->idstr);
3473         return NULL;
3474     }
3475
3476     /*
3477     * During colo checkpoint, we need bitmap of these migrated pages.
3478     * It help us to decide which pages in ram cache should be flushed
3479     * into VM's RAM later.
3480     */
3481     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3482         ram_state->migration_dirty_pages++;
3483     }
3484     return block->colo_cache + offset;
3485 }
3486
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  When @ch is zero and the range is
 * already all zero, the memory is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and zero requested: avoid writing to the page */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3503
3504 /* return the size after decompression, or negative value on error */
3505 static int
3506 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3507                      const uint8_t *source, size_t source_len)
3508 {
3509     int err;
3510
3511     err = inflateReset(stream);
3512     if (err != Z_OK) {
3513         return -1;
3514     }
3515
3516     stream->avail_in = source_len;
3517     stream->next_in = (uint8_t *)source;
3518     stream->avail_out = dest_len;
3519     stream->next_out = dest;
3520
3521     err = inflate(stream, Z_NO_FLUSH);
3522     if (err != Z_STREAM_END) {
3523         return -1;
3524     }
3525
3526     return stream->total_out;
3527 }
3528
3529 static void *do_data_decompress(void *opaque)
3530 {
3531     DecompressParam *param = opaque;
3532     unsigned long pagesize;
3533     uint8_t *des;
3534     int len, ret;
3535
3536     qemu_mutex_lock(&param->mutex);
3537     while (!param->quit) {
3538         if (param->des) {
3539             des = param->des;
3540             len = param->len;
3541             param->des = 0;
3542             qemu_mutex_unlock(&param->mutex);
3543
3544             pagesize = TARGET_PAGE_SIZE;
3545
3546             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3547                                        param->compbuf, len);
3548             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3549                 error_report("decompress data failed");
3550                 qemu_file_set_error(decomp_file, ret);
3551             }
3552
3553             qemu_mutex_lock(&decomp_done_lock);
3554             param->done = true;
3555             qemu_cond_signal(&decomp_done_cond);
3556             qemu_mutex_unlock(&decomp_done_lock);
3557
3558             qemu_mutex_lock(&param->mutex);
3559         } else {
3560             qemu_cond_wait(&param->cond, &param->mutex);
3561         }
3562     }
3563     qemu_mutex_unlock(&param->mutex);
3564
3565     return NULL;
3566 }
3567
3568 static int wait_for_decompress_done(void)
3569 {
3570     int idx, thread_count;
3571
3572     if (!migrate_use_compression()) {
3573         return 0;
3574     }
3575
3576     thread_count = migrate_decompress_threads();
3577     qemu_mutex_lock(&decomp_done_lock);
3578     for (idx = 0; idx < thread_count; idx++) {
3579         while (!decomp_param[idx].done) {
3580             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3581         }
3582     }
3583     qemu_mutex_unlock(&decomp_done_lock);
3584     return qemu_file_get_error(decomp_file);
3585 }
3586
3587 static void compress_threads_load_cleanup(void)
3588 {
3589     int i, thread_count;
3590
3591     if (!migrate_use_compression()) {
3592         return;
3593     }
3594     thread_count = migrate_decompress_threads();
3595     for (i = 0; i < thread_count; i++) {
3596         /*
3597          * we use it as a indicator which shows if the thread is
3598          * properly init'd or not
3599          */
3600         if (!decomp_param[i].compbuf) {
3601             break;
3602         }
3603
3604         qemu_mutex_lock(&decomp_param[i].mutex);
3605         decomp_param[i].quit = true;
3606         qemu_cond_signal(&decomp_param[i].cond);
3607         qemu_mutex_unlock(&decomp_param[i].mutex);
3608     }
3609     for (i = 0; i < thread_count; i++) {
3610         if (!decomp_param[i].compbuf) {
3611             break;
3612         }
3613
3614         qemu_thread_join(decompress_threads + i);
3615         qemu_mutex_destroy(&decomp_param[i].mutex);
3616         qemu_cond_destroy(&decomp_param[i].cond);
3617         inflateEnd(&decomp_param[i].stream);
3618         g_free(decomp_param[i].compbuf);
3619         decomp_param[i].compbuf = NULL;
3620     }
3621     g_free(decompress_threads);
3622     g_free(decomp_param);
3623     decompress_threads = NULL;
3624     decomp_param = NULL;
3625     decomp_file = NULL;
3626 }
3627
3628 static int compress_threads_load_setup(QEMUFile *f)
3629 {
3630     int i, thread_count;
3631
3632     if (!migrate_use_compression()) {
3633         return 0;
3634     }
3635
3636     thread_count = migrate_decompress_threads();
3637     decompress_threads = g_new0(QemuThread, thread_count);
3638     decomp_param = g_new0(DecompressParam, thread_count);
3639     qemu_mutex_init(&decomp_done_lock);
3640     qemu_cond_init(&decomp_done_cond);
3641     decomp_file = f;
3642     for (i = 0; i < thread_count; i++) {
3643         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3644             goto exit;
3645         }
3646
3647         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3648         qemu_mutex_init(&decomp_param[i].mutex);
3649         qemu_cond_init(&decomp_param[i].cond);
3650         decomp_param[i].done = true;
3651         decomp_param[i].quit = false;
3652         qemu_thread_create(decompress_threads + i, "decompress",
3653                            do_data_decompress, decomp_param + i,
3654                            QEMU_THREAD_JOINABLE);
3655     }
3656     return 0;
3657 exit:
3658     compress_threads_load_cleanup();
3659     return -1;
3660 }
3661
3662 static void decompress_data_with_multi_threads(QEMUFile *f,
3663                                                void *host, int len)
3664 {
3665     int idx, thread_count;
3666
3667     thread_count = migrate_decompress_threads();
3668     qemu_mutex_lock(&decomp_done_lock);
3669     while (true) {
3670         for (idx = 0; idx < thread_count; idx++) {
3671             if (decomp_param[idx].done) {
3672                 decomp_param[idx].done = false;
3673                 qemu_mutex_lock(&decomp_param[idx].mutex);
3674                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3675                 decomp_param[idx].des = host;
3676                 decomp_param[idx].len = len;
3677                 qemu_cond_signal(&decomp_param[idx].cond);
3678                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3679                 break;
3680             }
3681         }
3682         if (idx < thread_count) {
3683             break;
3684         } else {
3685             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3686         }
3687     }
3688     qemu_mutex_unlock(&decomp_done_lock);
3689 }
3690
3691 /*
3692  * colo cache: this is for secondary VM, we cache the whole
3693  * memory of the secondary VM, it is need to hold the global lock
3694  * to call this helper.
3695  */
3696 int colo_init_ram_cache(void)
3697 {
3698     RAMBlock *block;
3699
3700     rcu_read_lock();
3701     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3702         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3703                                                 NULL,
3704                                                 false);
3705         if (!block->colo_cache) {
3706             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3707                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3708                          block->used_length);
3709             goto out_locked;
3710         }
3711         memcpy(block->colo_cache, block->host, block->used_length);
3712     }
3713     rcu_read_unlock();
3714     /*
3715     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3716     * with to decide which page in cache should be flushed into SVM's RAM. Here
3717     * we use the same name 'ram_bitmap' as for migration.
3718     */
3719     if (ram_bytes_total()) {
3720         RAMBlock *block;
3721
3722         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3723             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3724
3725             block->bmap = bitmap_new(pages);
3726             bitmap_set(block->bmap, 0, pages);
3727         }
3728     }
3729     ram_state = g_new0(RAMState, 1);
3730     ram_state->migration_dirty_pages = 0;
3731     memory_global_dirty_log_start();
3732
3733     return 0;
3734
3735 out_locked:
3736
3737     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3738         if (block->colo_cache) {
3739             qemu_anon_ram_free(block->colo_cache, block->used_length);
3740             block->colo_cache = NULL;
3741         }
3742     }
3743
3744     rcu_read_unlock();
3745     return -errno;
3746 }
3747
3748 /* It is need to hold the global lock to call this helper */
3749 void colo_release_ram_cache(void)
3750 {
3751     RAMBlock *block;
3752
3753     memory_global_dirty_log_stop();
3754     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3755         g_free(block->bmap);
3756         block->bmap = NULL;
3757     }
3758
3759     rcu_read_lock();
3760
3761     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3762         if (block->colo_cache) {
3763             qemu_anon_ram_free(block->colo_cache, block->used_length);
3764             block->colo_cache = NULL;
3765         }
3766     }
3767
3768     rcu_read_unlock();
3769     g_free(ram_state);
3770     ram_state = NULL;
3771 }
3772
3773 /**
3774  * ram_load_setup: Setup RAM for migration incoming side
3775  *
3776  * Returns zero to indicate success and negative for error
3777  *
3778  * @f: QEMUFile where to receive the data
3779  * @opaque: RAMState pointer
3780  */
3781 static int ram_load_setup(QEMUFile *f, void *opaque)
3782 {
3783     if (compress_threads_load_setup(f)) {
3784         return -1;
3785     }
3786
3787     xbzrle_load_setup();
3788     ramblock_recv_map_init();
3789
3790     return 0;
3791 }
3792
3793 static int ram_load_cleanup(void *opaque)
3794 {
3795     RAMBlock *rb;
3796
3797     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3798         if (ramblock_is_pmem(rb)) {
3799             pmem_persist(rb->host, rb->used_length);
3800         }
3801     }
3802
3803     xbzrle_load_cleanup();
3804     compress_threads_load_cleanup();
3805
3806     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3807         g_free(rb->receivedmap);
3808         rb->receivedmap = NULL;
3809     }
3810
3811     return 0;
3812 }
3813
3814 /**
3815  * ram_postcopy_incoming_init: allocate postcopy data structures
3816  *
3817  * Returns 0 for success and negative if there was one error
3818  *
3819  * @mis: current migration incoming state
3820  *
3821  * Allocate data structures etc needed by incoming migration with
3822  * postcopy-ram. postcopy-ram's similarly names
3823  * postcopy_ram_incoming_init does the work.
3824  */
3825 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3826 {
3827     return postcopy_ram_incoming_init(mis);
3828 }
3829
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads records from the stream until an EOS record or an error.  Target
 * pages are collected in a temporary host page and placed into guest
 * memory once the last target page of a host page has been read.
 *
 * @f: QEMUFile where to receive the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    /* Host address of the previous record's page, for the ordering check */
    void *last_host = NULL;
    /* True while every target page of the current host page is zero */
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /*
         * The low (sub-page) bits of the 64-bit word carry the flags,
         * the rest is the page-aligned address.
         */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                /* 1st TP within the HP: start a fresh all-zero run */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        /* NULL for records that carry no page data */
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            /* The page is filled with a single byte value read from the stream */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            /* place_dest is the start of the host page that ends after 'host' */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
3980
3981 static bool postcopy_is_advised(void)
3982 {
3983     PostcopyState ps = postcopy_state_get();
3984     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3985 }
3986
3987 static bool postcopy_is_running(void)
3988 {
3989     PostcopyState ps = postcopy_state_get();
3990     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3991 }
3992
3993 /*
3994  * Flush content of RAM cache into SVM's memory.
3995  * Only flush the pages that be dirtied by PVM or SVM or both.
3996  */
3997 static void colo_flush_ram_cache(void)
3998 {
3999     RAMBlock *block = NULL;
4000     void *dst_host;
4001     void *src_host;
4002     unsigned long offset = 0;
4003
4004     memory_global_dirty_log_sync();
4005     rcu_read_lock();
4006     RAMBLOCK_FOREACH_MIGRATABLE(block) {
4007         migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
4008     }
4009     rcu_read_unlock();
4010
4011     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4012     rcu_read_lock();
4013     block = QLIST_FIRST_RCU(&ram_list.blocks);
4014
4015     while (block) {
4016         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4017
4018         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4019             offset = 0;
4020             block = QLIST_NEXT_RCU(block, next);
4021         } else {
4022             migration_bitmap_clear_dirty(ram_state, block, offset);
4023             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4024             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4025             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4026         }
4027     }
4028
4029     rcu_read_unlock();
4030     trace_colo_flush_ram_cache_end();
4031 }
4032
4033 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4034 {
4035     int flags = 0, ret = 0, invalid_flags = 0;
4036     static uint64_t seq_iter;
4037     int len = 0;
4038     /*
4039      * If system is running in postcopy mode, page inserts to host memory must
4040      * be atomic
4041      */
4042     bool postcopy_running = postcopy_is_running();
4043     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4044     bool postcopy_advised = postcopy_is_advised();
4045
4046     seq_iter++;
4047
4048     if (version_id != 4) {
4049         ret = -EINVAL;
4050     }
4051
4052     if (!migrate_use_compression()) {
4053         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4054     }
4055     /* This RCU critical section can be very long running.
4056      * When RCU reclaims in the code start to become numerous,
4057      * it will be necessary to reduce the granularity of this
4058      * critical section.
4059      */
4060     rcu_read_lock();
4061
4062     if (postcopy_running) {
4063         ret = ram_load_postcopy(f);
4064     }
4065
4066     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4067         ram_addr_t addr, total_ram_bytes;
4068         void *host = NULL;
4069         uint8_t ch;
4070
4071         addr = qemu_get_be64(f);
4072         flags = addr & ~TARGET_PAGE_MASK;
4073         addr &= TARGET_PAGE_MASK;
4074
4075         if (flags & invalid_flags) {
4076             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4077                 error_report("Received an unexpected compressed page");
4078             }
4079
4080             ret = -EINVAL;
4081             break;
4082         }
4083
4084         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4085                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4086             RAMBlock *block = ram_block_from_stream(f, flags);
4087
4088             /*
4089              * After going into COLO, we should load the Page into colo_cache.
4090              */
4091             if (migration_incoming_in_colo_state()) {
4092                 host = colo_cache_from_block_offset(block, addr);
4093             } else {
4094                 host = host_from_ram_block_offset(block, addr);
4095             }
4096             if (!host) {
4097                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4098                 ret = -EINVAL;
4099                 break;
4100             }
4101
4102             if (!migration_incoming_in_colo_state()) {
4103                 ramblock_recv_bitmap_set(block, host);
4104             }
4105
4106             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4107         }
4108
4109         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4110         case RAM_SAVE_FLAG_MEM_SIZE:
4111             /* Synchronize RAM block list */
4112             total_ram_bytes = addr;
4113             while (!ret && total_ram_bytes) {
4114                 RAMBlock *block;
4115                 char id[256];
4116                 ram_addr_t length;
4117
4118                 len = qemu_get_byte(f);
4119                 qemu_get_buffer(f, (uint8_t *)id, len);
4120                 id[len] = 0;
4121                 length = qemu_get_be64(f);
4122
4123                 block = qemu_ram_block_by_name(id);
4124                 if (block && !qemu_ram_is_migratable(block)) {
4125                     error_report("block %s should not be migrated !", id);
4126                     ret = -EINVAL;
4127                 } else if (block) {
4128                     if (length != block->used_length) {
4129                         Error *local_err = NULL;
4130
4131                         ret = qemu_ram_resize(block, length,
4132                                               &local_err);
4133                         if (local_err) {
4134                             error_report_err(local_err);
4135                         }
4136                     }
4137                     /* For postcopy we need to check hugepage sizes match */
4138                     if (postcopy_advised &&
4139                         block->page_size != qemu_host_page_size) {
4140                         uint64_t remote_page_size = qemu_get_be64(f);
4141                         if (remote_page_size != block->page_size) {
4142                             error_report("Mismatched RAM page size %s "
4143                                          "(local) %zd != %" PRId64,
4144                                          id, block->page_size,
4145                                          remote_page_size);
4146                             ret = -EINVAL;
4147                         }
4148                     }
4149                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4150                                           block->idstr);
4151                 } else {
4152                     error_report("Unknown ramblock \"%s\", cannot "
4153                                  "accept migration", id);
4154                     ret = -EINVAL;
4155                 }
4156
4157                 total_ram_bytes -= length;
4158             }
4159             break;
4160
4161         case RAM_SAVE_FLAG_ZERO:
4162             ch = qemu_get_byte(f);
4163             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4164             break;
4165
4166         case RAM_SAVE_FLAG_PAGE:
4167             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4168             break;
4169
4170         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4171             len = qemu_get_be32(f);
4172             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4173                 error_report("Invalid compressed data length: %d", len);
4174                 ret = -EINVAL;
4175                 break;
4176             }
4177             decompress_data_with_multi_threads(f, host, len);
4178             break;
4179
4180         case RAM_SAVE_FLAG_XBZRLE:
4181             if (load_xbzrle(f, addr, host) < 0) {
4182                 error_report("Failed to decompress XBZRLE page at "
4183                              RAM_ADDR_FMT, addr);
4184                 ret = -EINVAL;
4185                 break;
4186             }
4187             break;
4188         case RAM_SAVE_FLAG_EOS:
4189             /* normal exit */
4190             multifd_recv_sync_main();
4191             break;
4192         default:
4193             if (flags & RAM_SAVE_FLAG_HOOK) {
4194                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4195             } else {
4196                 error_report("Unknown combination of migration flags: %#x",
4197                              flags);
4198                 ret = -EINVAL;
4199             }
4200         }
4201         if (!ret) {
4202             ret = qemu_file_get_error(f);
4203         }
4204     }
4205
4206     ret |= wait_for_decompress_done();
4207     rcu_read_unlock();
4208     trace_ram_load_complete(ret, seq_iter);
4209
4210     if (!ret  && migration_incoming_in_colo_state()) {
4211         colo_flush_ram_cache();
4212     }
4213     return ret;
4214 }
4215
4216 static bool ram_has_postcopy(void *opaque)
4217 {
4218     RAMBlock *rb;
4219     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
4220         if (ramblock_is_pmem(rb)) {
4221             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4222                          "is not supported now!", rb->idstr, rb->host);
4223             return false;
4224         }
4225     }
4226
4227     return migrate_postcopy_ram();
4228 }
4229
4230 /* Sync all the dirty bitmap with destination VM.  */
4231 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4232 {
4233     RAMBlock *block;
4234     QEMUFile *file = s->to_dst_file;
4235     int ramblock_count = 0;
4236
4237     trace_ram_dirty_bitmap_sync_start();
4238
4239     RAMBLOCK_FOREACH_MIGRATABLE(block) {
4240         qemu_savevm_send_recv_bitmap(file, block->idstr);
4241         trace_ram_dirty_bitmap_request(block->idstr);
4242         ramblock_count++;
4243     }
4244
4245     trace_ram_dirty_bitmap_sync_wait();
4246
4247     /* Wait until all the ramblocks' dirty bitmap synced */
4248     while (ramblock_count--) {
4249         qemu_sem_wait(&s->rp_state.rp_sem);
4250     }
4251
4252     trace_ram_dirty_bitmap_sync_complete();
4253
4254     return 0;
4255 }
4256
4257 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4258 {
4259     qemu_sem_post(&s->rp_state.rp_sem);
4260 }
4261
4262 /*
4263  * Read the received bitmap, revert it as the initial dirty bitmap.
4264  * This is only used when the postcopy migration is paused but wants
4265  * to resume from a middle point.
4266  */
4267 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4268 {
4269     int ret = -EINVAL;
4270     QEMUFile *file = s->rp_state.from_dst_file;
4271     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4272     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4273     uint64_t size, end_mark;
4274
4275     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4276
4277     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4278         error_report("%s: incorrect state %s", __func__,
4279                      MigrationStatus_str(s->state));
4280         return -EINVAL;
4281     }
4282
4283     /*
4284      * Note: see comments in ramblock_recv_bitmap_send() on why we
4285      * need the endianess convertion, and the paddings.
4286      */
4287     local_size = ROUND_UP(local_size, 8);
4288
4289     /* Add paddings */
4290     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4291
4292     size = qemu_get_be64(file);
4293
4294     /* The size of the bitmap should match with our ramblock */
4295     if (size != local_size) {
4296         error_report("%s: ramblock '%s' bitmap size mismatch "
4297                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4298                      block->idstr, size, local_size);
4299         ret = -EINVAL;
4300         goto out;
4301     }
4302
4303     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4304     end_mark = qemu_get_be64(file);
4305
4306     ret = qemu_file_get_error(file);
4307     if (ret || size != local_size) {
4308         error_report("%s: read bitmap failed for ramblock '%s': %d"
4309                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4310                      __func__, block->idstr, ret, local_size, size);
4311         ret = -EIO;
4312         goto out;
4313     }
4314
4315     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4316         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4317                      __func__, block->idstr, end_mark);
4318         ret = -EINVAL;
4319         goto out;
4320     }
4321
4322     /*
4323      * Endianess convertion. We are during postcopy (though paused).
4324      * The dirty bitmap won't change. We can directly modify it.
4325      */
4326     bitmap_from_le(block->bmap, le_bitmap, nbits);
4327
4328     /*
4329      * What we received is "received bitmap". Revert it as the initial
4330      * dirty bitmap for this ramblock.
4331      */
4332     bitmap_complement(block->bmap, block->bmap, nbits);
4333
4334     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4335
4336     /*
4337      * We succeeded to sync bitmap for current ramblock. If this is
4338      * the last one to sync, we need to notify the main send thread.
4339      */
4340     ram_dirty_bitmap_reload_notify(s);
4341
4342     ret = 0;
4343 out:
4344     g_free(le_bitmap);
4345     return ret;
4346 }
4347
4348 static int ram_resume_prepare(MigrationState *s, void *opaque)
4349 {
4350     RAMState *rs = *(RAMState **)opaque;
4351     int ret;
4352
4353     ret = ram_dirty_bitmap_sync_all(s, rs);
4354     if (ret) {
4355         return ret;
4356     }
4357
4358     ram_state_resume_prepare(rs, s->to_dst_file);
4359
4360     return 0;
4361 }
4362
4363 static SaveVMHandlers savevm_ram_handlers = {
4364     .save_setup = ram_save_setup,
4365     .save_live_iterate = ram_save_iterate,
4366     .save_live_complete_postcopy = ram_save_complete,
4367     .save_live_complete_precopy = ram_save_complete,
4368     .has_postcopy = ram_has_postcopy,
4369     .save_live_pending = ram_save_pending,
4370     .load_state = ram_load,
4371     .save_cleanup = ram_save_cleanup,
4372     .load_setup = ram_load_setup,
4373     .load_cleanup = ram_load_cleanup,
4374     .resume_prepare = ram_resume_prepare,
4375 };
4376
4377 void ram_mig_init(void)
4378 {
4379     qemu_mutex_init(&XBZRLE.lock);
4380     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4381 }