migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE we just renamed it.
 */
  68
/*
 * RAM save/restore flag values.  Every value is a single, distinct bit,
 * so several flags can be OR-ed together into one word.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; new flags start at 0x100 */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
/* XBZRLE cache statistics */
XBZRLECacheStats xbzrle_counters;

/*
 * Global XBZRLE state: the page cache plus the scratch buffers used
 * while encoding/decoding.  The cache is protected by @lock, which is
 * taken through XBZRLE_cache_lock() / XBZRLE_cache_unlock().
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* protects @cache */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
/*
 * Iterate over the RAMBlocks for which qemu_ram_is_migratable() is true.
 * The empty-if/else construct makes the statement that follows the macro
 * the body executed for each migratable block.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/* Presumably removed so the rest of this file only uses the filtered
 * iterator above -- NOTE(review): confirm intent. */
#undef RAMBLOCK_FOREACH
 167
 168 static void ramblock_recv_map_init(void)
 169 {
 170     RAMBlock *rb;
 171
 172     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 173         assert(!rb->receivedmap);
 174         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 175     }
 176 }
 177
 178 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 179 {
 180     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 181                     rb->receivedmap);
 182 }
 183
 184 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 185 {
 186     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 187 }
 188
 189 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 190 {
 191     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 192 }
 193
 194 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 195                                     size_t nr)
 196 {
 197     bitmap_set_atomic(rb->receivedmap,
 198                       ramblock_recv_bitmap_offset(host_addr, rb),
 199                       nr);
 200 }
 201
 202 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 203
 204 /*
 205  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 206  *
 207  * Returns >0 if success with sent bytes, or <0 if error.
 208  */
 209 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 210                                   const char *block_name)
 211 {
 212     RAMBlock *block = qemu_ram_block_by_name(block_name);
 213     unsigned long *le_bitmap, nbits;
 214     uint64_t size;
 215
 216     if (!block) {
 217         error_report("%s: invalid block name: %s", __func__, block_name);
 218         return -1;
 219     }
 220
 221     nbits = block->used_length >> TARGET_PAGE_BITS;
 222
 223     /*
 224      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 225      * machines we may need 4 more bytes for padding (see below
 226      * comment). So extend it a bit before hand.
 227      */
 228     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 229
 230     /*
 231      * Always use little endian when sending the bitmap. This is
 232      * required that when source and destination VMs are not using the
 233      * same endianess. (Note: big endian won't work.)
 234      */
 235     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 236
 237     /* Size of the bitmap, in bytes */
 238     size = DIV_ROUND_UP(nbits, 8);
 239
 240     /*
 241      * size is always aligned to 8 bytes for 64bit machines, but it
 242      * may not be true for 32bit machines. We need this padding to
 243      * make sure the migration can survive even between 32bit and
 244      * 64bit machines.
 245      */
 246     size = ROUND_UP(size, 8);
 247
 248     qemu_put_be64(file, size);
 249     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 250     /*
 251      * Mark as an end, in case the middle part is screwed up due to
 252      * some "misterious" reason.
 253      */
 254     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 255     qemu_fflush(file);
 256
 257     g_free(le_bitmap);
 258
 259     if (qemu_file_get_error(file)) {
 260         return qemu_file_get_error(file);
 261     }
 262
 263     return size + sizeof(size);
 264 }
 265
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* start of the requested range -- presumably relative to @rb; verify */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* linkage in the src_page_requests queue of RAMState */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 277
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap; ram_bytes_remaining() reports
     * this value multiplied by TARGET_PAGE_SIZE */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The single RAMState instance; NULL when no state exists
 * (ram_bytes_remaining() then reports 0) */
static RAMState *ram_state;
 320
 321 uint64_t ram_bytes_remaining(void)
 322 {
 323     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 324                        0;
 325 }
 326
/* Global RAM migration statistics (e.g. transferred, multifd_bytes) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 339
/* Per-thread state of one compression thread (see do_data_compress()) */
struct CompressParam {
    /* set by the thread, under comp_done_lock, once a page is compressed */
    bool done;
    /* tells the thread to leave its loop */
    bool quit;
    /* dummy QEMUFile that buffers the compressed output */
    QEMUFile *file;
    /* protects @quit, @block and @offset */
    QemuMutex mutex;
    /* signalled to wake the thread up */
    QemuCond cond;
    /* block of the page to compress; NULL when there is no work */
    RAMBlock *block;
    /* offset of the page to compress */
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer handed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 354
/*
 * Per-thread state of one decompression thread.
 * NOTE(review): the code using these fields is outside this excerpt;
 * the field notes below are inferred from the names and from the
 * parallel CompressParam -- confirm against the decompress code.
 */
struct DecompressParam {
    /* presumably: work finished */
    bool done;
    /* presumably: thread should exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination of the decompressed data */
    void *des;
    /* presumably: buffer holding the compressed input */
    uint8_t *compbuf;
    /* presumably: length of the data in @compbuf */
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 366
/* Arrays with one entry per compression thread; allocated in
 * compress_threads_save_setup(), freed in compress_threads_save_cleanup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined further down; needed here by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);
 386
 387 static void *do_data_compress(void *opaque)
 388 {
 389     CompressParam *param = opaque;
 390     RAMBlock *block;
 391     ram_addr_t offset;
 392
 393     qemu_mutex_lock(&param->mutex);
 394     while (!param->quit) {
 395         if (param->block) {
 396             block = param->block;
 397             offset = param->offset;
 398             param->block = NULL;
 399             qemu_mutex_unlock(&param->mutex);
 400
 401             do_compress_ram_page(param->file, &param->stream, block, offset,
 402                                  param->originbuf);
 403
 404             qemu_mutex_lock(&comp_done_lock);
 405             param->done = true;
 406             qemu_cond_signal(&comp_done_cond);
 407             qemu_mutex_unlock(&comp_done_lock);
 408
 409             qemu_mutex_lock(&param->mutex);
 410         } else {
 411             qemu_cond_wait(&param->cond, &param->mutex);
 412         }
 413     }
 414     qemu_mutex_unlock(&param->mutex);
 415
 416     return NULL;
 417 }
 418
 419 static inline void terminate_compression_threads(void)
 420 {
 421     int idx, thread_count;
 422
 423     thread_count = migrate_compress_threads();
 424
 425     for (idx = 0; idx < thread_count; idx++) {
 426         qemu_mutex_lock(&comp_param[idx].mutex);
 427         comp_param[idx].quit = true;
 428         qemu_cond_signal(&comp_param[idx].cond);
 429         qemu_mutex_unlock(&comp_param[idx].mutex);
 430     }
 431 }
 432
 433 static void compress_threads_save_cleanup(void)
 434 {
 435     int i, thread_count;
 436
 437     if (!migrate_use_compression()) {
 438         return;
 439     }
 440     terminate_compression_threads();
 441     thread_count = migrate_compress_threads();
 442     for (i = 0; i < thread_count; i++) {
 443         /*
 444          * we use it as a indicator which shows if the thread is
 445          * properly init'd or not
 446          */
 447         if (!comp_param[i].file) {
 448             break;
 449         }
 450         qemu_thread_join(compress_threads + i);
 451         qemu_mutex_destroy(&comp_param[i].mutex);
 452         qemu_cond_destroy(&comp_param[i].cond);
 453         deflateEnd(&comp_param[i].stream);
 454         g_free(comp_param[i].originbuf);
 455         qemu_fclose(comp_param[i].file);
 456         comp_param[i].file = NULL;
 457     }
 458     qemu_mutex_destroy(&comp_done_lock);
 459     qemu_cond_destroy(&comp_done_cond);
 460     g_free(compress_threads);
 461     g_free(comp_param);
 462     compress_threads = NULL;
 463     comp_param = NULL;
 464 }
 465
 466 static int compress_threads_save_setup(void)
 467 {
 468     int i, thread_count;
 469
 470     if (!migrate_use_compression()) {
 471         return 0;
 472     }
 473     thread_count = migrate_compress_threads();
 474     compress_threads = g_new0(QemuThread, thread_count);
 475     comp_param = g_new0(CompressParam, thread_count);
 476     qemu_cond_init(&comp_done_cond);
 477     qemu_mutex_init(&comp_done_lock);
 478     for (i = 0; i < thread_count; i++) {
 479         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 480         if (!comp_param[i].originbuf) {
 481             goto exit;
 482         }
 483
 484         if (deflateInit(&comp_param[i].stream,
 485                         migrate_compress_level()) != Z_OK) {
 486             g_free(comp_param[i].originbuf);
 487             goto exit;
 488         }
 489
 490         /* comp_param[i].file is just used as a dummy buffer to save data,
 491          * set its ops to empty.
 492          */
 493         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 494         comp_param[i].done = true;
 495         comp_param[i].quit = false;
 496         qemu_mutex_init(&comp_param[i].mutex);
 497         qemu_cond_init(&comp_param[i].cond);
 498         qemu_thread_create(compress_threads + i, "compress",
 499                            do_data_compress, comp_param + i,
 500                            QEMU_THREAD_JOINABLE);
 501     }
 502     return 0;
 503
 504 exit:
 505     compress_threads_save_cleanup();
 506     return -1;
 507 }
 508
/* Multiple fd's */

/* Magic and version carried (big endian) by the initial message and by
 * every multifd packet; the receive side rejects other values */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag set by multifd_send_sync_main() when it asks the channels
 * to synchronize */
#define MULTIFD_FLAG_SYNC (1 << 0)
 515
/*
 * Initial message of a multifd channel, written by
 * multifd_send_initial_packet() and checked by
 * multifd_recv_initial_packet().  Packed: it is sent as raw bytes.
 */
typedef struct {
    /* MULTIFD_MAGIC, big endian on the wire */
    uint32_t magic;
    /* MULTIFD_VERSION, big endian on the wire */
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 522
/*
 * Header of a multifd packet, filled by multifd_send_fill_packet() and
 * parsed by multifd_recv_unfill_packet().  Packed; the integer fields
 * are big endian on the wire.
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* maximum number of pages per packet (migrate_multifd_page_count()) */
    uint32_t size;
    /* number of valid entries in @offset */
    uint32_t used;
    /* global packet number */
    uint64_t packet_num;
    /* idstr of the RAMBlock the pages belong to */
    char ramblock[256];
    /* offset of each page inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 533
/* A batch of pages, all from the same RAMBlock, handled as one unit */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 547
/* State of one multifd send channel and of the thread serving it */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 586
/* State of one multifd receive channel and of the thread serving it */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets transferred through this channel (presumably received,
     * since this is the receive side -- TODO confirm) */
    uint64_t num_packets;
    /* pages transferred through this channel (presumably received
     * -- TODO confirm) */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 619
 620 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 621 {
 622     MultiFDInit_t msg;
 623     int ret;
 624
 625     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 626     msg.version = cpu_to_be32(MULTIFD_VERSION);
 627     msg.id = p->id;
 628     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 629
 630     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 631     if (ret != 0) {
 632         return -1;
 633     }
 634     return 0;
 635 }
 636
 637 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 638 {
 639     MultiFDInit_t msg;
 640     int ret;
 641
 642     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 643     if (ret != 0) {
 644         return -1;
 645     }
 646
 647     be32_to_cpus(&msg.magic);
 648     be32_to_cpus(&msg.version);
 649
 650     if (msg.magic != MULTIFD_MAGIC) {
 651         error_setg(errp, "multifd: received packet magic %x "
 652                    "expected %x", msg.magic, MULTIFD_MAGIC);
 653         return -1;
 654     }
 655
 656     if (msg.version != MULTIFD_VERSION) {
 657         error_setg(errp, "multifd: received packet version %d "
 658                    "expected %d", msg.version, MULTIFD_VERSION);
 659         return -1;
 660     }
 661
 662     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 663         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 664         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 665
 666         error_setg(errp, "multifd: received uuid '%s' and expected "
 667                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 668         g_free(uuid);
 669         g_free(msg_uuid);
 670         return -1;
 671     }
 672
 673     if (msg.id > migrate_multifd_channels()) {
 674         error_setg(errp, "multifd: received channel version %d "
 675                    "expected %d", msg.version, MULTIFD_VERSION);
 676         return -1;
 677     }
 678
 679     return msg.id;
 680 }
 681
 682 static MultiFDPages_t *multifd_pages_init(size_t size)
 683 {
 684     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 685
 686     pages->allocated = size;
 687     pages->iov = g_new0(struct iovec, size);
 688     pages->offset = g_new0(ram_addr_t, size);
 689
 690     return pages;
 691 }
 692
 693 static void multifd_pages_clear(MultiFDPages_t *pages)
 694 {
 695     pages->used = 0;
 696     pages->allocated = 0;
 697     pages->packet_num = 0;
 698     pages->block = NULL;
 699     g_free(pages->iov);
 700     pages->iov = NULL;
 701     g_free(pages->offset);
 702     pages->offset = NULL;
 703     g_free(pages);
 704 }
 705
 706 static void multifd_send_fill_packet(MultiFDSendParams *p)
 707 {
 708     MultiFDPacket_t *packet = p->packet;
 709     int i;
 710
 711     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 712     packet->version = cpu_to_be32(MULTIFD_VERSION);
 713     packet->flags = cpu_to_be32(p->flags);
 714     packet->size = cpu_to_be32(migrate_multifd_page_count());
 715     packet->used = cpu_to_be32(p->pages->used);
 716     packet->packet_num = cpu_to_be64(p->packet_num);
 717
 718     if (p->pages->block) {
 719         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 720     }
 721
 722     for (i = 0; i < p->pages->used; i++) {
 723         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 724     }
 725 }
 726
 727 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 728 {
 729     MultiFDPacket_t *packet = p->packet;
 730     RAMBlock *block;
 731     int i;
 732
 733     be32_to_cpus(&packet->magic);
 734     if (packet->magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet "
 736                    "magic %x and expected magic %x",
 737                    packet->magic, MULTIFD_MAGIC);
 738         return -1;
 739     }
 740
 741     be32_to_cpus(&packet->version);
 742     if (packet->version != MULTIFD_VERSION) {
 743         error_setg(errp, "multifd: received packet "
 744                    "version %d and expected version %d",
 745                    packet->version, MULTIFD_VERSION);
 746         return -1;
 747     }
 748
 749     p->flags = be32_to_cpu(packet->flags);
 750
 751     be32_to_cpus(&packet->size);
 752     if (packet->size > migrate_multifd_page_count()) {
 753         error_setg(errp, "multifd: received packet "
 754                    "with size %d and expected maximum size %d",
 755                    packet->size, migrate_multifd_page_count()) ;
 756         return -1;
 757     }
 758
 759     p->pages->used = be32_to_cpu(packet->used);
 760     if (p->pages->used > packet->size) {
 761         error_setg(errp, "multifd: received packet "
 762                    "with size %d and expected maximum size %d",
 763                    p->pages->used, packet->size) ;
 764         return -1;
 765     }
 766
 767     p->packet_num = be64_to_cpu(packet->packet_num);
 768
 769     if (p->pages->used) {
 770         /* make sure that ramblock is 0 terminated */
 771         packet->ramblock[255] = 0;
 772         block = qemu_ram_block_by_name(packet->ramblock);
 773         if (!block) {
 774             error_setg(errp, "multifd: unknown ram block %s",
 775                        packet->ramblock);
 776             return -1;
 777         }
 778     }
 779
 780     for (i = 0; i < p->pages->used; i++) {
 781         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 782
 783         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 784             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 785                        " (max " RAM_ADDR_FMT ")",
 786                        offset, block->max_length);
 787             return -1;
 788         }
 789         p->pages->iov[i].iov_base = block->host + offset;
 790         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 791     }
 792
 793     return 0;
 794 }
 795
/* Global state of the multifd send side; freed and reset to NULL by
 * multifd_save_cleanup() */
struct {
    /* one entry per channel */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 809
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create one "pages" struct for each channel, and a main one.  Each
 * time that we need to send a batch of pages we swap the one held by
 * multifd_send_state with the one of the channel that is going to send
 * it.  There are two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */
 827
 828 static void multifd_send_pages(void)
 829 {
 830     int i;
 831     static int next_channel;
 832     MultiFDSendParams *p = NULL; /* make happy gcc */
 833     MultiFDPages_t *pages = multifd_send_state->pages;
 834     uint64_t transferred;
 835
 836     qemu_sem_wait(&multifd_send_state->channels_ready);
 837     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 838         p = &multifd_send_state->params[i];
 839
 840         qemu_mutex_lock(&p->mutex);
 841         if (!p->pending_job) {
 842             p->pending_job++;
 843             next_channel = (i + 1) % migrate_multifd_channels();
 844             break;
 845         }
 846         qemu_mutex_unlock(&p->mutex);
 847     }
 848     p->pages->used = 0;
 849
 850     p->packet_num = multifd_send_state->packet_num++;
 851     p->pages->block = NULL;
 852     multifd_send_state->pages = p->pages;
 853     p->pages = pages;
 854     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 855     ram_counters.multifd_bytes += transferred;
 856     ram_counters.transferred += transferred;;
 857     qemu_mutex_unlock(&p->mutex);
 858     qemu_sem_post(&p->sem);
 859 }
 860
 861 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 862 {
 863     MultiFDPages_t *pages = multifd_send_state->pages;
 864
 865     if (!pages->block) {
 866         pages->block = block;
 867     }
 868
 869     if (pages->block == block) {
 870         pages->offset[pages->used] = offset;
 871         pages->iov[pages->used].iov_base = block->host + offset;
 872         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 873         pages->used++;
 874
 875         if (pages->used < pages->allocated) {
 876             return;
 877         }
 878     }
 879
 880     multifd_send_pages();
 881
 882     if (pages->block != block) {
 883         multifd_queue_page(block, offset);
 884     }
 885 }
 886
 887 static void multifd_send_terminate_threads(Error *err)
 888 {
 889     int i;
 890
 891     if (err) {
 892         MigrationState *s = migrate_get_current();
 893         migrate_set_error(s, err);
 894         if (s->state == MIGRATION_STATUS_SETUP ||
 895             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 896             s->state == MIGRATION_STATUS_DEVICE ||
 897             s->state == MIGRATION_STATUS_ACTIVE) {
 898             migrate_set_state(&s->state, s->state,
 899                               MIGRATION_STATUS_FAILED);
 900         }
 901     }
 902
 903     for (i = 0; i < migrate_multifd_channels(); i++) {
 904         MultiFDSendParams *p = &multifd_send_state->params[i];
 905
 906         qemu_mutex_lock(&p->mutex);
 907         p->quit = true;
 908         qemu_sem_post(&p->sem);
 909         qemu_mutex_unlock(&p->mutex);
 910     }
 911 }
 912
 913 int multifd_save_cleanup(Error **errp)
 914 {
 915     int i;
 916     int ret = 0;
 917
 918     if (!migrate_use_multifd()) {
 919         return 0;
 920     }
 921     multifd_send_terminate_threads(NULL);
 922     for (i = 0; i < migrate_multifd_channels(); i++) {
 923         MultiFDSendParams *p = &multifd_send_state->params[i];
 924
 925         if (p->running) {
 926             qemu_thread_join(&p->thread);
 927         }
 928         socket_send_channel_destroy(p->c);
 929         p->c = NULL;
 930         qemu_mutex_destroy(&p->mutex);
 931         qemu_sem_destroy(&p->sem);
 932         qemu_sem_destroy(&p->sem_sync);
 933         g_free(p->name);
 934         p->name = NULL;
 935         multifd_pages_clear(p->pages);
 936         p->pages = NULL;
 937         p->packet_len = 0;
 938         g_free(p->packet);
 939         p->packet = NULL;
 940     }
 941     qemu_sem_destroy(&multifd_send_state->channels_ready);
 942     qemu_sem_destroy(&multifd_send_state->sem_sync);
 943     g_free(multifd_send_state->params);
 944     multifd_send_state->params = NULL;
 945     multifd_pages_clear(multifd_send_state->pages);
 946     multifd_send_state->pages = NULL;
 947     g_free(multifd_send_state);
 948     multifd_send_state = NULL;
 949     return ret;
 950 }
 951
 952 static void multifd_send_sync_main(void)
 953 {
 954     int i;
 955
 956     if (!migrate_use_multifd()) {
 957         return;
 958     }
 959     if (multifd_send_state->pages->used) {
 960         multifd_send_pages();
 961     }
 962     for (i = 0; i < migrate_multifd_channels(); i++) {
 963         MultiFDSendParams *p = &multifd_send_state->params[i];
 964
 965         trace_multifd_send_sync_main_signal(p->id);
 966
 967         qemu_mutex_lock(&p->mutex);
 968
 969         p->packet_num = multifd_send_state->packet_num++;
 970         p->flags |= MULTIFD_FLAG_SYNC;
 971         p->pending_job++;
 972         qemu_mutex_unlock(&p->mutex);
 973         qemu_sem_post(&p->sem);
 974     }
 975     for (i = 0; i < migrate_multifd_channels(); i++) {
 976         MultiFDSendParams *p = &multifd_send_state->params[i];
 977
 978         trace_multifd_send_sync_main_wait(p->id);
 979         qemu_sem_wait(&multifd_send_state->sem_sync);
 980     }
 981     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 982 }
 983
 984 static void *multifd_send_thread(void *opaque)
 985 {
 986     MultiFDSendParams *p = opaque;
 987     Error *local_err = NULL;
 988     int ret;
 989
 990     trace_multifd_send_thread_start(p->id);
 991
 992     if (multifd_send_initial_packet(p, &local_err) < 0) {
 993         goto out;
 994     }
 995     /* initial packet */
 996     p->num_packets = 1;
 997
 998     while (true) {
 999         qemu_sem_wait(&p->sem);
1000         qemu_mutex_lock(&p->mutex);
1001
1002         if (p->pending_job) {
1003             uint32_t used = p->pages->used;
1004             uint64_t packet_num = p->packet_num;
1005             uint32_t flags = p->flags;
1006
1007             multifd_send_fill_packet(p);
1008             p->flags = 0;
1009             p->num_packets++;
1010             p->num_pages += used;
1011             p->pages->used = 0;
1012             qemu_mutex_unlock(&p->mutex);
1013
1014             trace_multifd_send(p->id, packet_num, used, flags);
1015
1016             ret = qio_channel_write_all(p->c, (void *)p->packet,
1017                                         p->packet_len, &local_err);
1018             if (ret != 0) {
1019                 break;
1020             }
1021
1022             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1023             if (ret != 0) {
1024                 break;
1025             }
1026
1027             qemu_mutex_lock(&p->mutex);
1028             p->pending_job--;
1029             qemu_mutex_unlock(&p->mutex);
1030
1031             if (flags & MULTIFD_FLAG_SYNC) {
1032                 qemu_sem_post(&multifd_send_state->sem_sync);
1033             }
1034             qemu_sem_post(&multifd_send_state->channels_ready);
1035         } else if (p->quit) {
1036             qemu_mutex_unlock(&p->mutex);
1037             break;
1038         } else {
1039             qemu_mutex_unlock(&p->mutex);
1040             /* sometimes there are spurious wakeups */
1041         }
1042     }
1043
1044 out:
1045     if (local_err) {
1046         multifd_send_terminate_threads(local_err);
1047     }
1048
1049     qemu_mutex_lock(&p->mutex);
1050     p->running = false;
1051     qemu_mutex_unlock(&p->mutex);
1052
1053     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1054
1055     return NULL;
1056 }
1057
1058 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1059 {
1060     MultiFDSendParams *p = opaque;
1061     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1062     Error *local_err = NULL;
1063
1064     if (qio_task_propagate_error(task, &local_err)) {
1065         if (multifd_save_cleanup(&local_err) != 0) {
1066             migrate_set_error(migrate_get_current(), local_err);
1067         }
1068     } else {
1069         p->c = QIO_CHANNEL(sioc);
1070         qio_channel_set_delay(p->c, false);
1071         p->running = true;
1072         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1073                            QEMU_THREAD_JOINABLE);
1074
1075         atomic_inc(&multifd_send_state->count);
1076     }
1077 }
1078
1079 int multifd_save_setup(void)
1080 {
1081     int thread_count;
1082     uint32_t page_count = migrate_multifd_page_count();
1083     uint8_t i;
1084
1085     if (!migrate_use_multifd()) {
1086         return 0;
1087     }
1088     thread_count = migrate_multifd_channels();
1089     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1090     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1091     atomic_set(&multifd_send_state->count, 0);
1092     multifd_send_state->pages = multifd_pages_init(page_count);
1093     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1094     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1095
1096     for (i = 0; i < thread_count; i++) {
1097         MultiFDSendParams *p = &multifd_send_state->params[i];
1098
1099         qemu_mutex_init(&p->mutex);
1100         qemu_sem_init(&p->sem, 0);
1101         qemu_sem_init(&p->sem_sync, 0);
1102         p->quit = false;
1103         p->pending_job = 0;
1104         p->id = i;
1105         p->pages = multifd_pages_init(page_count);
1106         p->packet_len = sizeof(MultiFDPacket_t)
1107                       + sizeof(ram_addr_t) * page_count;
1108         p->packet = g_malloc0(p->packet_len);
1109         p->name = g_strdup_printf("multifdsend_%d", i);
1110         socket_send_channel_create(multifd_new_send_channel_async, p);
1111     }
1112     return 0;
1113 }
1114
/*
 * Global state of the multifd receive side.  It is allocated by
 * multifd_load_setup() and freed by multifd_load_cleanup().
 */
struct {
    /* per-channel parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads; accessed with the atomic_*() helpers */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /*
     * global multifd packet number; multifd_recv_sync_main() raises it
     * to the highest packet number seen on any channel
     */
    uint64_t packet_num;
} *multifd_recv_state;
1124
1125 static void multifd_recv_terminate_threads(Error *err)
1126 {
1127     int i;
1128
1129     if (err) {
1130         MigrationState *s = migrate_get_current();
1131         migrate_set_error(s, err);
1132         if (s->state == MIGRATION_STATUS_SETUP ||
1133             s->state == MIGRATION_STATUS_ACTIVE) {
1134             migrate_set_state(&s->state, s->state,
1135                               MIGRATION_STATUS_FAILED);
1136         }
1137     }
1138
1139     for (i = 0; i < migrate_multifd_channels(); i++) {
1140         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1141
1142         qemu_mutex_lock(&p->mutex);
1143         /* We could arrive here for two reasons:
1144            - normal quit, i.e. everything went fine, just finished
1145            - error quit: We close the channels so the channel threads
1146              finish the qio_channel_read_all_eof() */
1147         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1148         qemu_mutex_unlock(&p->mutex);
1149     }
1150 }
1151
1152 int multifd_load_cleanup(Error **errp)
1153 {
1154     int i;
1155     int ret = 0;
1156
1157     if (!migrate_use_multifd()) {
1158         return 0;
1159     }
1160     multifd_recv_terminate_threads(NULL);
1161     for (i = 0; i < migrate_multifd_channels(); i++) {
1162         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1163
1164         if (p->running) {
1165             qemu_thread_join(&p->thread);
1166         }
1167         object_unref(OBJECT(p->c));
1168         p->c = NULL;
1169         qemu_mutex_destroy(&p->mutex);
1170         qemu_sem_destroy(&p->sem_sync);
1171         g_free(p->name);
1172         p->name = NULL;
1173         multifd_pages_clear(p->pages);
1174         p->pages = NULL;
1175         p->packet_len = 0;
1176         g_free(p->packet);
1177         p->packet = NULL;
1178     }
1179     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1180     g_free(multifd_recv_state->params);
1181     multifd_recv_state->params = NULL;
1182     g_free(multifd_recv_state);
1183     multifd_recv_state = NULL;
1184
1185     return ret;
1186 }
1187
1188 static void multifd_recv_sync_main(void)
1189 {
1190     int i;
1191
1192     if (!migrate_use_multifd()) {
1193         return;
1194     }
1195     for (i = 0; i < migrate_multifd_channels(); i++) {
1196         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1197
1198         trace_multifd_recv_sync_main_wait(p->id);
1199         qemu_sem_wait(&multifd_recv_state->sem_sync);
1200         qemu_mutex_lock(&p->mutex);
1201         if (multifd_recv_state->packet_num < p->packet_num) {
1202             multifd_recv_state->packet_num = p->packet_num;
1203         }
1204         qemu_mutex_unlock(&p->mutex);
1205     }
1206     for (i = 0; i < migrate_multifd_channels(); i++) {
1207         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1208
1209         trace_multifd_recv_sync_main_signal(p->id);
1210         qemu_sem_post(&p->sem_sync);
1211     }
1212     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1213 }
1214
1215 static void *multifd_recv_thread(void *opaque)
1216 {
1217     MultiFDRecvParams *p = opaque;
1218     Error *local_err = NULL;
1219     int ret;
1220
1221     trace_multifd_recv_thread_start(p->id);
1222
1223     while (true) {
1224         uint32_t used;
1225         uint32_t flags;
1226
1227         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1228                                        p->packet_len, &local_err);
1229         if (ret == 0) {   /* EOF */
1230             break;
1231         }
1232         if (ret == -1) {   /* Error */
1233             break;
1234         }
1235
1236         qemu_mutex_lock(&p->mutex);
1237         ret = multifd_recv_unfill_packet(p, &local_err);
1238         if (ret) {
1239             qemu_mutex_unlock(&p->mutex);
1240             break;
1241         }
1242
1243         used = p->pages->used;
1244         flags = p->flags;
1245         trace_multifd_recv(p->id, p->packet_num, used, flags);
1246         p->num_packets++;
1247         p->num_pages += used;
1248         qemu_mutex_unlock(&p->mutex);
1249
1250         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1251         if (ret != 0) {
1252             break;
1253         }
1254
1255         if (flags & MULTIFD_FLAG_SYNC) {
1256             qemu_sem_post(&multifd_recv_state->sem_sync);
1257             qemu_sem_wait(&p->sem_sync);
1258         }
1259     }
1260
1261     if (local_err) {
1262         multifd_recv_terminate_threads(local_err);
1263     }
1264     qemu_mutex_lock(&p->mutex);
1265     p->running = false;
1266     qemu_mutex_unlock(&p->mutex);
1267
1268     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1269
1270     return NULL;
1271 }
1272
1273 int multifd_load_setup(void)
1274 {
1275     int thread_count;
1276     uint32_t page_count = migrate_multifd_page_count();
1277     uint8_t i;
1278
1279     if (!migrate_use_multifd()) {
1280         return 0;
1281     }
1282     thread_count = migrate_multifd_channels();
1283     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1284     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1285     atomic_set(&multifd_recv_state->count, 0);
1286     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1287
1288     for (i = 0; i < thread_count; i++) {
1289         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1290
1291         qemu_mutex_init(&p->mutex);
1292         qemu_sem_init(&p->sem_sync, 0);
1293         p->id = i;
1294         p->pages = multifd_pages_init(page_count);
1295         p->packet_len = sizeof(MultiFDPacket_t)
1296                       + sizeof(ram_addr_t) * page_count;
1297         p->packet = g_malloc0(p->packet_len);
1298         p->name = g_strdup_printf("multifdrecv_%d", i);
1299     }
1300     return 0;
1301 }
1302
1303 bool multifd_recv_all_channels_created(void)
1304 {
1305     int thread_count = migrate_multifd_channels();
1306
1307     if (!migrate_use_multifd()) {
1308         return true;
1309     }
1310
1311     return thread_count == atomic_read(&multifd_recv_state->count);
1312 }
1313
1314 /* Return true if multifd is ready for the migration, otherwise false */
1315 bool multifd_recv_new_channel(QIOChannel *ioc)
1316 {
1317     MultiFDRecvParams *p;
1318     Error *local_err = NULL;
1319     int id;
1320
1321     id = multifd_recv_initial_packet(ioc, &local_err);
1322     if (id < 0) {
1323         multifd_recv_terminate_threads(local_err);
1324         return false;
1325     }
1326
1327     p = &multifd_recv_state->params[id];
1328     if (p->c != NULL) {
1329         error_setg(&local_err, "multifd: received id '%d' already setup'",
1330                    id);
1331         multifd_recv_terminate_threads(local_err);
1332         return false;
1333     }
1334     p->c = ioc;
1335     object_ref(OBJECT(ioc));
1336     /* initial packet */
1337     p->num_packets = 1;
1338
1339     p->running = true;
1340     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1341                        QEMU_THREAD_JOINABLE);
1342     atomic_inc(&multifd_recv_state->count);
1343     return multifd_recv_state->count == migrate_multifd_channels();
1344 }
1345
1346 /**
1347  * save_page_header: write page header to wire
1348  *
1349  * If this is the 1st block, it also writes the block identification
1350  *
1351  * Returns the number of bytes written
1352  *
1353  * @f: QEMUFile where to send the data
1354  * @block: block that contains the page we want to send
1355  * @offset: offset inside the block for the page
1356  *          in the lower bits, it contains flags
1357  */
1358 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1359                                ram_addr_t offset)
1360 {
1361     size_t size, len;
1362
1363     if (block == rs->last_sent_block) {
1364         offset |= RAM_SAVE_FLAG_CONTINUE;
1365     }
1366     qemu_put_be64(f, offset);
1367     size = 8;
1368
1369     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1370         len = strlen(block->idstr);
1371         qemu_put_byte(f, len);
1372         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1373         size += 1 + len;
1374         rs->last_sent_block = block;
1375     }
1376     return size;
1377 }
1378
1379 /**
1380  * mig_throttle_guest_down: throotle down the guest
1381  *
1382  * Reduce amount of guest cpu execution to hopefully slow down memory
1383  * writes. If guest dirty memory rate is reduced below the rate at
1384  * which we can transfer pages to the destination then we should be
1385  * able to complete migration. Some workloads dirty memory way too
1386  * fast and will not effectively converge, even with auto-converge.
1387  */
1388 static void mig_throttle_guest_down(void)
1389 {
1390     MigrationState *s = migrate_get_current();
1391     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1392     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1393
1394     /* We have not started throttling yet. Let's start it. */
1395     if (!cpu_throttle_active()) {
1396         cpu_throttle_set(pct_initial);
1397     } else {
1398         /* Throttling already on, just increase the rate */
1399         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
1400     }
1401 }
1402
1403 /**
1404  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1405  *
1406  * @rs: current RAM state
1407  * @current_addr: address for the zero page
1408  *
1409  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1410  * The important thing is that a stale (not-yet-0'd) page be replaced
1411  * by the new data.
1412  * As a bonus, if the page wasn't in the cache it gets added so that
1413  * when a small write is made into the 0'd page it gets XBZRLE sent.
1414  */
1415 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1416 {
1417     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1418         return;
1419     }
1420
1421     /* We don't care if this fails to allocate a new cache page
1422      * as long as it updated an old one */
1423     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1424                  ram_counters.dirty_sync_count);
1425 }
1426
1427 #define ENCODING_FLAG_XBZRLE 0x1
1428
1429 /**
1430  * save_xbzrle_page: compress and send current page
1431  *
1432  * Returns: 1 means that we wrote the page
1433  *          0 means that page is identical to the one already sent
1434  *          -1 means that xbzrle would be longer than normal
1435  *
1436  * @rs: current RAM state
1437  * @current_data: pointer to the address of the page contents
1438  * @current_addr: addr of the page
1439  * @block: block that contains the page we want to send
1440  * @offset: offset inside the block for the page
1441  * @last_stage: if we are at the completion stage
1442  */
1443 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1444                             ram_addr_t current_addr, RAMBlock *block,
1445                             ram_addr_t offset, bool last_stage)
1446 {
1447     int encoded_len = 0, bytes_xbzrle;
1448     uint8_t *prev_cached_page;
1449
1450     if (!cache_is_cached(XBZRLE.cache, current_addr,
1451                          ram_counters.dirty_sync_count)) {
1452         xbzrle_counters.cache_miss++;
1453         if (!last_stage) {
1454             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1455                              ram_counters.dirty_sync_count) == -1) {
1456                 return -1;
1457             } else {
1458                 /* update *current_data when the page has been
1459                    inserted into cache */
1460                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1461             }
1462         }
1463         return -1;
1464     }
1465
1466     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1467
1468     /* save current buffer into memory */
1469     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1470
1471     /* XBZRLE encoding (if there is no overflow) */
1472     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1473                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1474                                        TARGET_PAGE_SIZE);
1475     if (encoded_len == 0) {
1476         trace_save_xbzrle_page_skipping();
1477         return 0;
1478     } else if (encoded_len == -1) {
1479         trace_save_xbzrle_page_overflow();
1480         xbzrle_counters.overflow++;
1481         /* update data in the cache */
1482         if (!last_stage) {
1483             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1484             *current_data = prev_cached_page;
1485         }
1486         return -1;
1487     }
1488
1489     /* we need to update the data in the cache, in order to get the same data */
1490     if (!last_stage) {
1491         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1492     }
1493
1494     /* Send XBZRLE based compressed page */
1495     bytes_xbzrle = save_page_header(rs, rs->f, block,
1496                                     offset | RAM_SAVE_FLAG_XBZRLE);
1497     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1498     qemu_put_be16(rs->f, encoded_len);
1499     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1500     bytes_xbzrle += encoded_len + 1 + 2;
1501     xbzrle_counters.pages++;
1502     xbzrle_counters.bytes += bytes_xbzrle;
1503     ram_counters.transferred += bytes_xbzrle;
1504
1505     return 1;
1506 }
1507
1508 /**
1509  * migration_bitmap_find_dirty: find the next dirty page from start
1510  *
1511  * Called with rcu_read_lock() to protect migration_bitmap
1512  *
1513  * Returns the byte offset within memory region of the start of a dirty page
1514  *
1515  * @rs: current RAM state
1516  * @rb: RAMBlock where to search for dirty pages
1517  * @start: page where we start the search
1518  */
1519 static inline
1520 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1521                                           unsigned long start)
1522 {
1523     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1524     unsigned long *bitmap = rb->bmap;
1525     unsigned long next;
1526
1527     if (!qemu_ram_is_migratable(rb)) {
1528         return size;
1529     }
1530
1531     if (rs->ram_bulk_stage && start > 0) {
1532         next = start + 1;
1533     } else {
1534         next = find_next_bit(bitmap, size, start);
1535     }
1536
1537     return next;
1538 }
1539
1540 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1541                                                 RAMBlock *rb,
1542                                                 unsigned long page)
1543 {
1544     bool ret;
1545
1546     ret = test_and_clear_bit(page, rb->bmap);
1547
1548     if (ret) {
1549         rs->migration_dirty_pages--;
1550     }
1551     return ret;
1552 }
1553
1554 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1555                                         ram_addr_t start, ram_addr_t length)
1556 {
1557     rs->migration_dirty_pages +=
1558         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1559                                               &rs->num_dirty_pages_period);
1560 }
1561
1562 /**
1563  * ram_pagesize_summary: calculate all the pagesizes of a VM
1564  *
1565  * Returns a summary bitmap of the page sizes of all RAMBlocks
1566  *
1567  * For VMs with just normal pages this is equivalent to the host page
1568  * size. If it's got some huge pages then it's the OR of all the
1569  * different page sizes.
1570  */
1571 uint64_t ram_pagesize_summary(void)
1572 {
1573     RAMBlock *block;
1574     uint64_t summary = 0;
1575
1576     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1577         summary |= block->page_size;
1578     }
1579
1580     return summary;
1581 }
1582
1583 static void migration_update_rates(RAMState *rs, int64_t end_time)
1584 {
1585     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1586
1587     /* calculate period counters */
1588     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1589                 / (end_time - rs->time_last_bitmap_sync);
1590
1591     if (!iter_count) {
1592         return;
1593     }
1594
1595     if (migrate_use_xbzrle()) {
1596         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1597             rs->xbzrle_cache_miss_prev) / iter_count;
1598         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1599     }
1600 }
1601
1602 static void migration_bitmap_sync(RAMState *rs)
1603 {
1604     RAMBlock *block;
1605     int64_t end_time;
1606     uint64_t bytes_xfer_now;
1607
1608     ram_counters.dirty_sync_count++;
1609
1610     if (!rs->time_last_bitmap_sync) {
1611         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1612     }
1613
1614     trace_migration_bitmap_sync_start();
1615     memory_global_dirty_log_sync();
1616
1617     qemu_mutex_lock(&rs->bitmap_mutex);
1618     rcu_read_lock();
1619     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1620         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1621     }
1622     ram_counters.remaining = ram_bytes_remaining();
1623     rcu_read_unlock();
1624     qemu_mutex_unlock(&rs->bitmap_mutex);
1625
1626     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1627
1628     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1629
1630     /* more than 1 second = 1000 millisecons */
1631     if (end_time > rs->time_last_bitmap_sync + 1000) {
1632         bytes_xfer_now = ram_counters.transferred;
1633
1634         /* During block migration the auto-converge logic incorrectly detects
1635          * that ram migration makes no progress. Avoid this by disabling the
1636          * throttling logic during the bulk phase of block migration. */
1637         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1638             /* The following detection logic can be refined later. For now:
1639                Check to see if the dirtied bytes is 50% more than the approx.
1640                amount of bytes that just got transferred since the last time we
1641                were in this routine. If that happens twice, start or increase
1642                throttling */
1643
1644             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1645                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1646                 (++rs->dirty_rate_high_cnt >= 2)) {
1647                     trace_migration_throttle();
1648                     rs->dirty_rate_high_cnt = 0;
1649                     mig_throttle_guest_down();
1650             }
1651         }
1652
1653         migration_update_rates(rs, end_time);
1654
1655         rs->iterations_prev = rs->iterations;
1656
1657         /* reset period counters */
1658         rs->time_last_bitmap_sync = end_time;
1659         rs->num_dirty_pages_period = 0;
1660         rs->bytes_xfer_prev = bytes_xfer_now;
1661     }
1662     if (migrate_use_events()) {
1663         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1664     }
1665 }
1666
1667 /**
1668  * save_zero_page: send the zero page to the stream
1669  *
1670  * Returns the number of pages written.
1671  *
1672  * @rs: current RAM state
1673  * @block: block that contains the page we want to send
1674  * @offset: offset inside the block for the page
1675  */
1676 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1677 {
1678     uint8_t *p = block->host + offset;
1679     int pages = -1;
1680
1681     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1682         ram_counters.duplicate++;
1683         ram_counters.transferred +=
1684             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1685         qemu_put_byte(rs->f, 0);
1686         ram_counters.transferred += 1;
1687         pages = 1;
1688     }
1689
1690     return pages;
1691 }
1692
1693 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1694 {
1695     if (!migrate_release_ram() || !migration_in_postcopy()) {
1696         return;
1697     }
1698
1699     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1700 }
1701
1702 /*
1703  * @pages: the number of pages written by the control path,
1704  *        < 0 - error
1705  *        > 0 - number of pages written
1706  *
1707  * Return true if the pages has been saved, otherwise false is returned.
1708  */
1709 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1710                               int *pages)
1711 {
1712     uint64_t bytes_xmit = 0;
1713     int ret;
1714
1715     *pages = -1;
1716     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1717                                 &bytes_xmit);
1718     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1719         return false;
1720     }
1721
1722     if (bytes_xmit) {
1723         ram_counters.transferred += bytes_xmit;
1724         *pages = 1;
1725     }
1726
1727     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1728         return true;
1729     }
1730
1731     if (bytes_xmit > 0) {
1732         ram_counters.normal++;
1733     } else if (bytes_xmit == 0) {
1734         ram_counters.duplicate++;
1735     }
1736
1737     return true;
1738 }
1739
1740 /*
1741  * directly send the page to the stream
1742  *
1743  * Returns the number of pages written.
1744  *
1745  * @rs: current RAM state
1746  * @block: block that contains the page we want to send
1747  * @offset: offset inside the block for the page
1748  * @buf: the page to be sent
1749  * @async: send to page asyncly
1750  */
1751 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1752                             uint8_t *buf, bool async)
1753 {
1754     ram_counters.transferred += save_page_header(rs, rs->f, block,
1755                                                  offset | RAM_SAVE_FLAG_PAGE);
1756     if (async) {
1757         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1758                               migrate_release_ram() &
1759                               migration_in_postcopy());
1760     } else {
1761         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1762     }
1763     ram_counters.transferred += TARGET_PAGE_SIZE;
1764     ram_counters.normal++;
1765     return 1;
1766 }
1767
1768 /**
1769  * ram_save_page: send the given page to the stream
1770  *
1771  * Returns the number of pages written.
1772  *          < 0 - error
1773  *          >=0 - Number of pages written - this might legally be 0
1774  *                if xbzrle noticed the page was the same.
1775  *
1776  * @rs: current RAM state
1777  * @block: block that contains the page we want to send
1778  * @offset: offset inside the block for the page
1779  * @last_stage: if we are at the completion stage
1780  */
1781 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1782 {
1783     int pages = -1;
1784     uint8_t *p;
1785     bool send_async = true;
1786     RAMBlock *block = pss->block;
1787     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1788     ram_addr_t current_addr = block->offset + offset;
1789
1790     p = block->host + offset;
1791     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1792
1793     XBZRLE_cache_lock();
1794     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1795         migrate_use_xbzrle()) {
1796         pages = save_xbzrle_page(rs, &p, current_addr, block,
1797                                  offset, last_stage);
1798         if (!last_stage) {
1799             /* Can't send this cached data async, since the cache page
1800              * might get updated before it gets to the wire
1801              */
1802             send_async = false;
1803         }
1804     }
1805
1806     /* XBZRLE overflow or normal page */
1807     if (pages == -1) {
1808         pages = save_normal_page(rs, block, offset, p, send_async);
1809     }
1810
1811     XBZRLE_cache_unlock();
1812
1813     return pages;
1814 }
1815
1816 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1817                                  ram_addr_t offset)
1818 {
1819     multifd_queue_page(block, offset);
1820     ram_counters.normal++;
1821
1822     return 1;
1823 }
1824
1825 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1826                                 ram_addr_t offset, uint8_t *source_buf)
1827 {
1828     RAMState *rs = ram_state;
1829     int bytes_sent, blen;
1830     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1831
1832     bytes_sent = save_page_header(rs, f, block, offset |
1833                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1834
1835     /*
1836      * copy it to a internal buffer to avoid it being modified by VM
1837      * so that we can catch up the error during compression and
1838      * decompression
1839      */
1840     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1841     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1842     if (blen < 0) {
1843         bytes_sent = 0;
1844         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1845         error_report("compressed data failed!");
1846     } else {
1847         bytes_sent += blen;
1848         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1849     }
1850
1851     return bytes_sent;
1852 }
1853
1854 static void flush_compressed_data(RAMState *rs)
1855 {
1856     int idx, len, thread_count;
1857
1858     if (!migrate_use_compression()) {
1859         return;
1860     }
1861     thread_count = migrate_compress_threads();
1862
1863     qemu_mutex_lock(&comp_done_lock);
1864     for (idx = 0; idx < thread_count; idx++) {
1865         while (!comp_param[idx].done) {
1866             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1867         }
1868     }
1869     qemu_mutex_unlock(&comp_done_lock);
1870
1871     for (idx = 0; idx < thread_count; idx++) {
1872         qemu_mutex_lock(&comp_param[idx].mutex);
1873         if (!comp_param[idx].quit) {
1874             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1875             ram_counters.transferred += len;
1876         }
1877         qemu_mutex_unlock(&comp_param[idx].mutex);
1878     }
1879 }
1880
1881 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1882                                        ram_addr_t offset)
1883 {
1884     param->block = block;
1885     param->offset = offset;
1886 }
1887
1888 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1889                                            ram_addr_t offset)
1890 {
1891     int idx, thread_count, bytes_xmit = -1, pages = -1;
1892
1893     thread_count = migrate_compress_threads();
1894     qemu_mutex_lock(&comp_done_lock);
1895     while (true) {
1896         for (idx = 0; idx < thread_count; idx++) {
1897             if (comp_param[idx].done) {
1898                 comp_param[idx].done = false;
1899                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1900                 qemu_mutex_lock(&comp_param[idx].mutex);
1901                 set_compress_params(&comp_param[idx], block, offset);
1902                 qemu_cond_signal(&comp_param[idx].cond);
1903                 qemu_mutex_unlock(&comp_param[idx].mutex);
1904                 pages = 1;
1905                 ram_counters.normal++;
1906                 ram_counters.transferred += bytes_xmit;
1907                 break;
1908             }
1909         }
1910         if (pages > 0) {
1911             break;
1912         } else {
1913             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1914         }
1915     }
1916     qemu_mutex_unlock(&comp_done_lock);
1917
1918     return pages;
1919 }
1920
1921 /**
1922  * find_dirty_block: find the next dirty page and update any state
1923  * associated with the search process.
1924  *
1925  * Returns if a page is found
1926  *
1927  * @rs: current RAM state
1928  * @pss: data about the state of the current dirty page scan
1929  * @again: set to false if the search has scanned the whole of RAM
1930  */
1931 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1932 {
1933     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1934     if (pss->complete_round && pss->block == rs->last_seen_block &&
1935         pss->page >= rs->last_page) {
1936         /*
1937          * We've been once around the RAM and haven't found anything.
1938          * Give up.
1939          */
1940         *again = false;
1941         return false;
1942     }
1943     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1944         /* Didn't find anything in this RAM Block */
1945         pss->page = 0;
1946         pss->block = QLIST_NEXT_RCU(pss->block, next);
1947         if (!pss->block) {
1948             /* Hit the end of the list */
1949             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1950             /* Flag that we've looped */
1951             pss->complete_round = true;
1952             rs->ram_bulk_stage = false;
1953             if (migrate_use_xbzrle()) {
1954                 /* If xbzrle is on, stop using the data compression at this
1955                  * point. In theory, xbzrle can do better than compression.
1956                  */
1957                 flush_compressed_data(rs);
1958             }
1959         }
1960         /* Didn't find anything this time, but try again on the new block */
1961         *again = true;
1962         return false;
1963     } else {
1964         /* Can go around again, but... */
1965         *again = true;
1966         /* We've found something so probably don't need to */
1967         return true;
1968     }
1969 }
1970
1971 /**
1972  * unqueue_page: gets a page of the queue
1973  *
1974  * Helper for 'get_queued_page' - gets a page off the queue
1975  *
1976  * Returns the block of the page (or NULL if none available)
1977  *
1978  * @rs: current RAM state
1979  * @offset: used to return the offset within the RAMBlock
1980  */
1981 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1982 {
1983     RAMBlock *block = NULL;
1984
1985     qemu_mutex_lock(&rs->src_page_req_mutex);
1986     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1987         struct RAMSrcPageRequest *entry =
1988                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1989         block = entry->rb;
1990         *offset = entry->offset;
1991
1992         if (entry->len > TARGET_PAGE_SIZE) {
1993             entry->len -= TARGET_PAGE_SIZE;
1994             entry->offset += TARGET_PAGE_SIZE;
1995         } else {
1996             memory_region_unref(block->mr);
1997             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1998             g_free(entry);
1999             migration_consume_urgent_request();
2000         }
2001     }
2002     qemu_mutex_unlock(&rs->src_page_req_mutex);
2003
2004     return block;
2005 }
2006
2007 /**
2008  * get_queued_page: unqueue a page from the postocpy requests
2009  *
2010  * Skips pages that are already sent (!dirty)
2011  *
2012  * Returns if a queued page is found
2013  *
2014  * @rs: current RAM state
2015  * @pss: data about the state of the current dirty page scan
2016  */
2017 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2018 {
2019     RAMBlock  *block;
2020     ram_addr_t offset;
2021     bool dirty;
2022
2023     do {
2024         block = unqueue_page(rs, &offset);
2025         /*
2026          * We're sending this page, and since it's postcopy nothing else
2027          * will dirty it, and we must make sure it doesn't get sent again
2028          * even if this queue request was received after the background
2029          * search already sent it.
2030          */
2031         if (block) {
2032             unsigned long page;
2033
2034             page = offset >> TARGET_PAGE_BITS;
2035             dirty = test_bit(page, block->bmap);
2036             if (!dirty) {
2037                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2038                        page, test_bit(page, block->unsentmap));
2039             } else {
2040                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2041             }
2042         }
2043
2044     } while (block && !dirty);
2045
2046     if (block) {
2047         /*
2048          * As soon as we start servicing pages out of order, then we have
2049          * to kill the bulk stage, since the bulk stage assumes
2050          * in (migration_bitmap_find_and_reset_dirty) that every page is
2051          * dirty, that's no longer true.
2052          */
2053         rs->ram_bulk_stage = false;
2054
2055         /*
2056          * We want the background search to continue from the queued page
2057          * since the guest is likely to want other pages near to the page
2058          * it just requested.
2059          */
2060         pss->block = block;
2061         pss->page = offset >> TARGET_PAGE_BITS;
2062     }
2063
2064     return !!block;
2065 }
2066
2067 /**
2068  * migration_page_queue_free: drop any remaining pages in the ram
2069  * request queue
2070  *
2071  * It should be empty at the end anyway, but in error cases there may
2072  * be some left.  in case that there is any page left, we drop it.
2073  *
2074  */
2075 static void migration_page_queue_free(RAMState *rs)
2076 {
2077     struct RAMSrcPageRequest *mspr, *next_mspr;
2078     /* This queue generally should be empty - but in the case of a failed
2079      * migration might have some droppings in.
2080      */
2081     rcu_read_lock();
2082     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2083         memory_region_unref(mspr->rb->mr);
2084         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2085         g_free(mspr);
2086     }
2087     rcu_read_unlock();
2088 }
2089
2090 /**
2091  * ram_save_queue_pages: queue the page for transmission
2092  *
2093  * A request from postcopy destination for example.
2094  *
2095  * Returns zero on success or negative on error
2096  *
2097  * @rbname: Name of the RAMBLock of the request. NULL means the
2098  *          same that last one.
2099  * @start: starting address from the start of the RAMBlock
2100  * @len: length (in bytes) to send
2101  */
2102 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2103 {
2104     RAMBlock *ramblock;
2105     RAMState *rs = ram_state;
2106
2107     ram_counters.postcopy_requests++;
2108     rcu_read_lock();
2109     if (!rbname) {
2110         /* Reuse last RAMBlock */
2111         ramblock = rs->last_req_rb;
2112
2113         if (!ramblock) {
2114             /*
2115              * Shouldn't happen, we can't reuse the last RAMBlock if
2116              * it's the 1st request.
2117              */
2118             error_report("ram_save_queue_pages no previous block");
2119             goto err;
2120         }
2121     } else {
2122         ramblock = qemu_ram_block_by_name(rbname);
2123
2124         if (!ramblock) {
2125             /* We shouldn't be asked for a non-existent RAMBlock */
2126             error_report("ram_save_queue_pages no block '%s'", rbname);
2127             goto err;
2128         }
2129         rs->last_req_rb = ramblock;
2130     }
2131     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2132     if (start+len > ramblock->used_length) {
2133         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2134                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2135                      __func__, start, len, ramblock->used_length);
2136         goto err;
2137     }
2138
2139     struct RAMSrcPageRequest *new_entry =
2140         g_malloc0(sizeof(struct RAMSrcPageRequest));
2141     new_entry->rb = ramblock;
2142     new_entry->offset = start;
2143     new_entry->len = len;
2144
2145     memory_region_ref(ramblock->mr);
2146     qemu_mutex_lock(&rs->src_page_req_mutex);
2147     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2148     migration_make_urgent_request();
2149     qemu_mutex_unlock(&rs->src_page_req_mutex);
2150     rcu_read_unlock();
2151
2152     return 0;
2153
2154 err:
2155     rcu_read_unlock();
2156     return -1;
2157 }
2158
2159 static bool save_page_use_compression(RAMState *rs)
2160 {
2161     if (!migrate_use_compression()) {
2162         return false;
2163     }
2164
2165     /*
2166      * If xbzrle is on, stop using the data compression after first
2167      * round of migration even if compression is enabled. In theory,
2168      * xbzrle can do better than compression.
2169      */
2170     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2171         return true;
2172     }
2173
2174     return false;
2175 }
2176
2177 /**
2178  * ram_save_target_page: save one target page
2179  *
2180  * Returns the number of pages written
2181  *
2182  * @rs: current RAM state
2183  * @pss: data about the page we want to send
2184  * @last_stage: if we are at the completion stage
2185  */
2186 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2187                                 bool last_stage)
2188 {
2189     RAMBlock *block = pss->block;
2190     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2191     int res;
2192
2193     if (control_save_page(rs, block, offset, &res)) {
2194         return res;
2195     }
2196
2197     /*
2198      * When starting the process of a new block, the first page of
2199      * the block should be sent out before other pages in the same
2200      * block, and all the pages in last block should have been sent
2201      * out, keeping this order is important, because the 'cont' flag
2202      * is used to avoid resending the block name.
2203      */
2204     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2205             flush_compressed_data(rs);
2206     }
2207
2208     res = save_zero_page(rs, block, offset);
2209     if (res > 0) {
2210         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2211          * page would be stale
2212          */
2213         if (!save_page_use_compression(rs)) {
2214             XBZRLE_cache_lock();
2215             xbzrle_cache_zero_page(rs, block->offset + offset);
2216             XBZRLE_cache_unlock();
2217         }
2218         ram_release_pages(block->idstr, offset, res);
2219         return res;
2220     }
2221
2222     /*
2223      * Make sure the first page is sent out before other pages.
2224      *
2225      * we post it as normal page as compression will take much
2226      * CPU resource.
2227      */
2228     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2229         return compress_page_with_multi_thread(rs, block, offset);
2230     } else if (migrate_use_multifd()) {
2231         return ram_save_multifd_page(rs, block, offset);
2232     }
2233
2234     return ram_save_page(rs, pss, last_stage);
2235 }
2236
2237 /**
2238  * ram_save_host_page: save a whole host page
2239  *
2240  * Starting at *offset send pages up to the end of the current host
2241  * page. It's valid for the initial offset to point into the middle of
2242  * a host page in which case the remainder of the hostpage is sent.
2243  * Only dirty target pages are sent. Note that the host page size may
2244  * be a huge page for this block.
2245  * The saving stops at the boundary of the used_length of the block
2246  * if the RAMBlock isn't a multiple of the host page size.
2247  *
2248  * Returns the number of pages written or negative on error
2249  *
2250  * @rs: current RAM state
2251  * @ms: current migration state
2252  * @pss: data about the page we want to send
2253  * @last_stage: if we are at the completion stage
2254  */
2255 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2256                               bool last_stage)
2257 {
2258     int tmppages, pages = 0;
2259     size_t pagesize_bits =
2260         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2261
2262     if (!qemu_ram_is_migratable(pss->block)) {
2263         error_report("block %s should not be migrated !", pss->block->idstr);
2264         return 0;
2265     }
2266
2267     do {
2268         /* Check the pages is dirty and if it is send it */
2269         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2270             pss->page++;
2271             continue;
2272         }
2273
2274         tmppages = ram_save_target_page(rs, pss, last_stage);
2275         if (tmppages < 0) {
2276             return tmppages;
2277         }
2278
2279         pages += tmppages;
2280         if (pss->block->unsentmap) {
2281             clear_bit(pss->page, pss->block->unsentmap);
2282         }
2283
2284         pss->page++;
2285     } while ((pss->page & (pagesize_bits - 1)) &&
2286              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2287
2288     /* The offset we leave with is the last one we looked at */
2289     pss->page--;
2290     return pages;
2291 }
2292
2293 /**
2294  * ram_find_and_save_block: finds a dirty page and sends it to f
2295  *
2296  * Called within an RCU critical section.
2297  *
2298  * Returns the number of pages written where zero means no dirty pages
2299  *
2300  * @rs: current RAM state
2301  * @last_stage: if we are at the completion stage
2302  *
2303  * On systems where host-page-size > target-page-size it will send all the
2304  * pages in a host page that are dirty.
2305  */
2306
2307 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2308 {
2309     PageSearchStatus pss;
2310     int pages = 0;
2311     bool again, found;
2312
2313     /* No dirty page as there is zero RAM */
2314     if (!ram_bytes_total()) {
2315         return pages;
2316     }
2317
2318     pss.block = rs->last_seen_block;
2319     pss.page = rs->last_page;
2320     pss.complete_round = false;
2321
2322     if (!pss.block) {
2323         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2324     }
2325
2326     do {
2327         again = true;
2328         found = get_queued_page(rs, &pss);
2329
2330         if (!found) {
2331             /* priority queue empty, so just search for something dirty */
2332             found = find_dirty_block(rs, &pss, &again);
2333         }
2334
2335         if (found) {
2336             pages = ram_save_host_page(rs, &pss, last_stage);
2337         }
2338     } while (!pages && again);
2339
2340     rs->last_seen_block = pss.block;
2341     rs->last_page = pss.page;
2342
2343     return pages;
2344 }
2345
2346 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2347 {
2348     uint64_t pages = size / TARGET_PAGE_SIZE;
2349
2350     if (zero) {
2351         ram_counters.duplicate += pages;
2352     } else {
2353         ram_counters.normal += pages;
2354         ram_counters.transferred += size;
2355         qemu_update_position(f, size);
2356     }
2357 }
2358
2359 uint64_t ram_bytes_total(void)
2360 {
2361     RAMBlock *block;
2362     uint64_t total = 0;
2363
2364     rcu_read_lock();
2365     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2366         total += block->used_length;
2367     }
2368     rcu_read_unlock();
2369     return total;
2370 }
2371
2372 static void xbzrle_load_setup(void)
2373 {
2374     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2375 }
2376
2377 static void xbzrle_load_cleanup(void)
2378 {
2379     g_free(XBZRLE.decoded_buf);
2380     XBZRLE.decoded_buf = NULL;
2381 }
2382
2383 static void ram_state_cleanup(RAMState **rsp)
2384 {
2385     if (*rsp) {
2386         migration_page_queue_free(*rsp);
2387         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2388         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2389         g_free(*rsp);
2390         *rsp = NULL;
2391     }
2392 }
2393
2394 static void xbzrle_cleanup(void)
2395 {
2396     XBZRLE_cache_lock();
2397     if (XBZRLE.cache) {
2398         cache_fini(XBZRLE.cache);
2399         g_free(XBZRLE.encoded_buf);
2400         g_free(XBZRLE.current_buf);
2401         g_free(XBZRLE.zero_target_page);
2402         XBZRLE.cache = NULL;
2403         XBZRLE.encoded_buf = NULL;
2404         XBZRLE.current_buf = NULL;
2405         XBZRLE.zero_target_page = NULL;
2406     }
2407     XBZRLE_cache_unlock();
2408 }
2409
2410 static void ram_save_cleanup(void *opaque)
2411 {
2412     RAMState **rsp = opaque;
2413     RAMBlock *block;
2414
2415     /* caller have hold iothread lock or is in a bh, so there is
2416      * no writing race against this migration_bitmap
2417      */
2418     memory_global_dirty_log_stop();
2419
2420     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2421         g_free(block->bmap);
2422         block->bmap = NULL;
2423         g_free(block->unsentmap);
2424         block->unsentmap = NULL;
2425     }
2426
2427     xbzrle_cleanup();
2428     compress_threads_save_cleanup();
2429     ram_state_cleanup(rsp);
2430 }
2431
2432 static void ram_state_reset(RAMState *rs)
2433 {
2434     rs->last_seen_block = NULL;
2435     rs->last_sent_block = NULL;
2436     rs->last_page = 0;
2437     rs->last_version = ram_list.version;
2438     rs->ram_bulk_stage = true;
2439 }
2440
2441 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2442
/*
 * Dump the first @pages bits of the bitmap @todump to stderr, 128 bits per
 * line; each line is prefixed with the hex index of its first bit and shows
 * '1' for a set bit and '.' for a clear one.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * There is no fallback bitmap: a NULL 'todump' is ignored and nothing is
 * printed.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; avoid handing a NULL bitmap to test_bit() */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2476
2477 /* **** functions for postcopy ***** */
2478
2479 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2480 {
2481     struct RAMBlock *block;
2482
2483     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2484         unsigned long *bitmap = block->bmap;
2485         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2486         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2487
2488         while (run_start < range) {
2489             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2490             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2491                               (run_end - run_start) << TARGET_PAGE_BITS);
2492             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2493         }
2494     }
2495 }
2496
2497 /**
2498  * postcopy_send_discard_bm_ram: discard a RAMBlock
2499  *
2500  * Returns zero on success
2501  *
2502  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2503  * Note: At this point the 'unsentmap' is the processed bitmap combined
2504  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2505  *
2506  * @ms: current migration state
2507  * @pds: state for postcopy
2508  * @start: RAMBlock starting page
2509  * @length: RAMBlock size
2510  */
2511 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2512                                         PostcopyDiscardState *pds,
2513                                         RAMBlock *block)
2514 {
2515     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2516     unsigned long current;
2517     unsigned long *unsentmap = block->unsentmap;
2518
2519     for (current = 0; current < end; ) {
2520         unsigned long one = find_next_bit(unsentmap, end, current);
2521
2522         if (one <= end) {
2523             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2524             unsigned long discard_length;
2525
2526             if (zero >= end) {
2527                 discard_length = end - one;
2528             } else {
2529                 discard_length = zero - one;
2530             }
2531             if (discard_length) {
2532                 postcopy_discard_send_range(ms, pds, one, discard_length);
2533             }
2534             current = one + discard_length;
2535         } else {
2536             current = one;
2537         }
2538     }
2539
2540     return 0;
2541 }
2542
2543 /**
2544  * postcopy_each_ram_send_discard: discard all RAMBlocks
2545  *
2546  * Returns 0 for success or negative for error
2547  *
2548  * Utility for the outgoing postcopy code.
2549  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2550  *   passing it bitmap indexes and name.
2551  * (qemu_ram_foreach_block ends up passing unscaled lengths
2552  *  which would mean postcopy code would have to deal with target page)
2553  *
2554  * @ms: current migration state
2555  */
2556 static int postcopy_each_ram_send_discard(MigrationState *ms)
2557 {
2558     struct RAMBlock *block;
2559     int ret;
2560
2561     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2562         PostcopyDiscardState *pds =
2563             postcopy_discard_send_init(ms, block->idstr);
2564
2565         /*
2566          * Postcopy sends chunks of bitmap over the wire, but it
2567          * just needs indexes at this point, avoids it having
2568          * target page specific code.
2569          */
2570         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2571         postcopy_discard_send_finish(ms, pds);
2572         if (ret) {
2573             return ret;
2574         }
2575     }
2576
2577     return 0;
2578 }
2579
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * Every host page found to be only partially sent (unsent pass) or only
 * partially dirty (dirty pass) is marked fully unsent and fully dirty,
 * and rs->migration_dirty_pages is increased for each newly dirtied page.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    /* Number of target pages in the block */
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* Despite the name this is a target page index, not an address */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of this host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
2706
2707 /**
2708  * postcopy_chuck_hostpages: discrad any partially sent host page
2709  *
2710  * Utility for the outgoing postcopy code.
2711  *
2712  * Discard any partially sent host-page size chunks, mark any partially
2713  * dirty host-page size chunks as all dirty.  In this case the host-page
2714  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2715  *
2716  * Returns zero on success
2717  *
2718  * @ms: current migration state
2719  * @block: block we want to work with
2720  */
2721 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2722 {
2723     PostcopyDiscardState *pds =
2724         postcopy_discard_send_init(ms, block->idstr);
2725
2726     /* First pass: Discard all partially sent host pages */
2727     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2728     /*
2729      * Second pass: Ensure that all partially dirty host pages are made
2730      * fully dirty.
2731      */
2732     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2733
2734     postcopy_discard_send_finish(ms, pds);
2735     return 0;
2736 }
2737
2738 /**
2739  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2740  *
2741  * Returns zero on success
2742  *
2743  * Transmit the set of pages to be discarded after precopy to the target
2744  * these are pages that:
2745  *     a) Have been previously transmitted but are now dirty again
2746  *     b) Pages that have never been transmitted, this ensures that
2747  *        any pages on the destination that have been mapped by background
2748  *        tasks get discarded (transparent huge pages is the specific concern)
2749  * Hopefully this is pretty sparse
2750  *
2751  * @ms: current migration state
2752  */
2753 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2754 {
2755     RAMState *rs = ram_state;
2756     RAMBlock *block;
2757     int ret;
2758
2759     rcu_read_lock();
2760
2761     /* This should be our last sync, the src is now paused */
2762     migration_bitmap_sync(rs);
2763
2764     /* Easiest way to make sure we don't resume in the middle of a host-page */
2765     rs->last_seen_block = NULL;
2766     rs->last_sent_block = NULL;
2767     rs->last_page = 0;
2768
2769     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2770         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2771         unsigned long *bitmap = block->bmap;
2772         unsigned long *unsentmap = block->unsentmap;
2773
2774         if (!unsentmap) {
2775             /* We don't have a safe way to resize the sentmap, so
2776              * if the bitmap was resized it will be NULL at this
2777              * point.
2778              */
2779             error_report("migration ram resized during precopy phase");
2780             rcu_read_unlock();
2781             return -EINVAL;
2782         }
2783         /* Deal with TPS != HPS and huge pages */
2784         ret = postcopy_chunk_hostpages(ms, block);
2785         if (ret) {
2786             rcu_read_unlock();
2787             return ret;
2788         }
2789
2790         /*
2791          * Update the unsentmap to be unsentmap = unsentmap | dirty
2792          */
2793         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2794 #ifdef DEBUG_POSTCOPY
2795         ram_debug_dump_bitmap(unsentmap, true, pages);
2796 #endif
2797     }
2798     trace_ram_postcopy_send_discard_bitmap();
2799
2800     ret = postcopy_each_ram_send_discard(ms);
2801     rcu_read_unlock();
2802
2803     return ret;
2804 }
2805
2806 /**
2807  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2808  *
2809  * Returns zero on success
2810  *
2811  * @rbname: name of the RAMBlock of the request. NULL means the
2812  *          same that last one.
2813  * @start: RAMBlock starting page
2814  * @length: RAMBlock size
2815  */
2816 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2817 {
2818     int ret = -1;
2819
2820     trace_ram_discard_range(rbname, start, length);
2821
2822     rcu_read_lock();
2823     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2824
2825     if (!rb) {
2826         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2827         goto err;
2828     }
2829
2830     /*
2831      * On source VM, we don't need to update the received bitmap since
2832      * we don't even have one.
2833      */
2834     if (rb->receivedmap) {
2835         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2836                      length >> qemu_target_page_bits());
2837     }
2838
2839     ret = ram_block_discard_range(rb, start, length);
2840
2841 err:
2842     rcu_read_unlock();
2843
2844     return ret;
2845 }
2846
2847 /*
2848  * For every allocation, we will try not to crash the VM if the
2849  * allocation failed.
2850  */
2851 static int xbzrle_init(void)
2852 {
2853     Error *local_err = NULL;
2854
2855     if (!migrate_use_xbzrle()) {
2856         return 0;
2857     }
2858
2859     XBZRLE_cache_lock();
2860
2861     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2862     if (!XBZRLE.zero_target_page) {
2863         error_report("%s: Error allocating zero page", __func__);
2864         goto err_out;
2865     }
2866
2867     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2868                               TARGET_PAGE_SIZE, &local_err);
2869     if (!XBZRLE.cache) {
2870         error_report_err(local_err);
2871         goto free_zero_page;
2872     }
2873
2874     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2875     if (!XBZRLE.encoded_buf) {
2876         error_report("%s: Error allocating encoded_buf", __func__);
2877         goto free_cache;
2878     }
2879
2880     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2881     if (!XBZRLE.current_buf) {
2882         error_report("%s: Error allocating current_buf", __func__);
2883         goto free_encoded_buf;
2884     }
2885
2886     /* We are all good */
2887     XBZRLE_cache_unlock();
2888     return 0;
2889
2890 free_encoded_buf:
2891     g_free(XBZRLE.encoded_buf);
2892     XBZRLE.encoded_buf = NULL;
2893 free_cache:
2894     cache_fini(XBZRLE.cache);
2895     XBZRLE.cache = NULL;
2896 free_zero_page:
2897     g_free(XBZRLE.zero_target_page);
2898     XBZRLE.zero_target_page = NULL;
2899 err_out:
2900     XBZRLE_cache_unlock();
2901     return -ENOMEM;
2902 }
2903
2904 static int ram_state_init(RAMState **rsp)
2905 {
2906     *rsp = g_try_new0(RAMState, 1);
2907
2908     if (!*rsp) {
2909         error_report("%s: Init ramstate fail", __func__);
2910         return -1;
2911     }
2912
2913     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2914     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2915     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2916
2917     /*
2918      * Count the total number of pages used by ram blocks not including any
2919      * gaps due to alignment or unplugs.
2920      */
2921     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2922
2923     ram_state_reset(*rsp);
2924
2925     return 0;
2926 }
2927
2928 static void ram_list_init_bitmaps(void)
2929 {
2930     RAMBlock *block;
2931     unsigned long pages;
2932
2933     /* Skip setting bitmap if there is no RAM */
2934     if (ram_bytes_total()) {
2935         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2936             pages = block->max_length >> TARGET_PAGE_BITS;
2937             block->bmap = bitmap_new(pages);
2938             bitmap_set(block->bmap, 0, pages);
2939             if (migrate_postcopy_ram()) {
2940                 block->unsentmap = bitmap_new(pages);
2941                 bitmap_set(block->unsentmap, 0, pages);
2942             }
2943         }
2944     }
2945 }
2946
/*
 * Create the per-block bitmaps, start global dirty logging and run a
 * first bitmap sync.
 *
 * All three steps run with the iothread lock, the ramlist lock and the
 * RCU read lock held; the locks are released in the reverse order in
 * which they were taken.
 *
 * @rs: RAM state handed to migration_bitmap_sync()
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2962
2963 static int ram_init_all(RAMState **rsp)
2964 {
2965     if (ram_state_init(rsp)) {
2966         return -1;
2967     }
2968
2969     if (xbzrle_init()) {
2970         ram_state_cleanup(rsp);
2971         return -1;
2972     }
2973
2974     ram_init_bitmaps(*rsp);
2975
2976     return 0;
2977 }
2978
2979 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2980 {
2981     RAMBlock *block;
2982     uint64_t pages = 0;
2983
2984     /*
2985      * Postcopy is not using xbzrle/compression, so no need for that.
2986      * Also, since source are already halted, we don't need to care
2987      * about dirty page logging as well.
2988      */
2989
2990     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2991         pages += bitmap_count_one(block->bmap,
2992                                   block->used_length >> TARGET_PAGE_BITS);
2993     }
2994
2995     /* This may not be aligned with current bitmaps. Recalculate. */
2996     rs->migration_dirty_pages = pages;
2997
2998     rs->last_seen_block = NULL;
2999     rs->last_sent_block = NULL;
3000     rs->last_page = 0;
3001     rs->last_version = ram_list.version;
3002     /*
3003      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3004      * matter what we have sent.
3005      */
3006     rs->ram_bulk_stage = false;
3007
3008     /* Update RAMState cache of output QEMUFile */
3009     rs->f = out;
3010
3011     trace_ram_state_resume_prepare(pages);
3012 }
3013
3014 /*
3015  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3016  * long-running RCU critical section.  When rcu-reclaims in the code
3017  * start to become numerous it will be necessary to reduce the
3018  * granularity of these critical sections.
3019  */
3020
3021 /**
3022  * ram_save_setup: Setup RAM for migration
3023  *
3024  * Returns zero to indicate success and negative for error
3025  *
3026  * @f: QEMUFile where to send the data
3027  * @opaque: RAMState pointer
3028  */
3029 static int ram_save_setup(QEMUFile *f, void *opaque)
3030 {
3031     RAMState **rsp = opaque;
3032     RAMBlock *block;
3033
3034     if (compress_threads_save_setup()) {
3035         return -1;
3036     }
3037
3038     /* migration has already setup the bitmap, reuse it. */
3039     if (!migration_in_colo_state()) {
3040         if (ram_init_all(rsp) != 0) {
3041             compress_threads_save_cleanup();
3042             return -1;
3043         }
3044     }
3045     (*rsp)->f = f;
3046
3047     rcu_read_lock();
3048
3049     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3050
3051     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3052         qemu_put_byte(f, strlen(block->idstr));
3053         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3054         qemu_put_be64(f, block->used_length);
3055         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3056             qemu_put_be64(f, block->page_size);
3057         }
3058     }
3059
3060     rcu_read_unlock();
3061
3062     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3063     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3064
3065     multifd_send_sync_main();
3066     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3067     qemu_fflush(f);
3068
3069     return 0;
3070 }
3071
3072 /**
3073  * ram_save_iterate: iterative stage for migration
3074  *
3075  * Returns zero to indicate success and negative for error
3076  *
3077  * @f: QEMUFile where to send the data
3078  * @opaque: RAMState pointer
3079  */
3080 static int ram_save_iterate(QEMUFile *f, void *opaque)
3081 {
3082     RAMState **temp = opaque;
3083     RAMState *rs = *temp;
3084     int ret;
3085     int i;
3086     int64_t t0;
3087     int done = 0;
3088
3089     if (blk_mig_bulk_active()) {
3090         /* Avoid transferring ram during bulk phase of block migration as
3091          * the bulk phase will usually take a long time and transferring
3092          * ram updates during that time is pointless. */
3093         goto out;
3094     }
3095
3096     rcu_read_lock();
3097     if (ram_list.version != rs->last_version) {
3098         ram_state_reset(rs);
3099     }
3100
3101     /* Read version before ram_list.blocks */
3102     smp_rmb();
3103
3104     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3105
3106     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3107     i = 0;
3108     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3109             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3110         int pages;
3111
3112         if (qemu_file_get_error(f)) {
3113             break;
3114         }
3115
3116         pages = ram_find_and_save_block(rs, false);
3117         /* no more pages to sent */
3118         if (pages == 0) {
3119             done = 1;
3120             break;
3121         }
3122         rs->iterations++;
3123
3124         /* we want to check in the 1st loop, just in case it was the 1st time
3125            and we had to sync the dirty bitmap.
3126            qemu_get_clock_ns() is a bit expensive, so we only check each some
3127            iterations
3128         */
3129         if ((i & 63) == 0) {
3130             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3131             if (t1 > MAX_WAIT) {
3132                 trace_ram_save_iterate_big_wait(t1, i);
3133                 break;
3134             }
3135         }
3136         i++;
3137     }
3138     flush_compressed_data(rs);
3139     rcu_read_unlock();
3140
3141     /*
3142      * Must occur before EOS (or any QEMUFile operation)
3143      * because of RDMA protocol.
3144      */
3145     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3146
3147     multifd_send_sync_main();
3148 out:
3149     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3150     qemu_fflush(f);
3151     ram_counters.transferred += 8;
3152
3153     ret = qemu_file_get_error(f);
3154     if (ret < 0) {
3155         return ret;
3156     }
3157
3158     return done;
3159 }
3160
3161 /**
3162  * ram_save_complete: function called to send the remaining amount of ram
3163  *
3164  * Returns zero to indicate success
3165  *
3166  * Called with iothread lock
3167  *
3168  * @f: QEMUFile where to send the data
3169  * @opaque: RAMState pointer
3170  */
3171 static int ram_save_complete(QEMUFile *f, void *opaque)
3172 {
3173     RAMState **temp = opaque;
3174     RAMState *rs = *temp;
3175
3176     rcu_read_lock();
3177
3178     if (!migration_in_postcopy()) {
3179         migration_bitmap_sync(rs);
3180     }
3181
3182     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3183
3184     /* try transferring iterative blocks of memory */
3185
3186     /* flush all remaining blocks regardless of rate limiting */
3187     while (true) {
3188         int pages;
3189
3190         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3191         /* no more blocks to sent */
3192         if (pages == 0) {
3193             break;
3194         }
3195     }
3196
3197     flush_compressed_data(rs);
3198     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3199
3200     rcu_read_unlock();
3201
3202     multifd_send_sync_main();
3203     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3204     qemu_fflush(f);
3205
3206     return 0;
3207 }
3208
3209 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3210                              uint64_t *res_precopy_only,
3211                              uint64_t *res_compatible,
3212                              uint64_t *res_postcopy_only)
3213 {
3214     RAMState **temp = opaque;
3215     RAMState *rs = *temp;
3216     uint64_t remaining_size;
3217
3218     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3219
3220     if (!migration_in_postcopy() &&
3221         remaining_size < max_size) {
3222         qemu_mutex_lock_iothread();
3223         rcu_read_lock();
3224         migration_bitmap_sync(rs);
3225         rcu_read_unlock();
3226         qemu_mutex_unlock_iothread();
3227         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3228     }
3229
3230     if (migrate_postcopy_ram()) {
3231         /* We can do postcopy, and all the data is postcopiable */
3232         *res_compatible += remaining_size;
3233     } else {
3234         *res_precopy_only += remaining_size;
3235     }
3236 }
3237
3238 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3239 {
3240     unsigned int xh_len;
3241     int xh_flags;
3242     uint8_t *loaded_data;
3243
3244     /* extract RLE header */
3245     xh_flags = qemu_get_byte(f);
3246     xh_len = qemu_get_be16(f);
3247
3248     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3249         error_report("Failed to load XBZRLE page - wrong compression!");
3250         return -1;
3251     }
3252
3253     if (xh_len > TARGET_PAGE_SIZE) {
3254         error_report("Failed to load XBZRLE page - len overflow!");
3255         return -1;
3256     }
3257     loaded_data = XBZRLE.decoded_buf;
3258     /* load data and decode */
3259     /* it can change loaded_data to point to an internal buffer */
3260     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3261
3262     /* decode RLE */
3263     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3264                              TARGET_PAGE_SIZE) == -1) {
3265         error_report("Failed to load XBZRLE page - decode error!");
3266         return -1;
3267     }
3268
3269     return 0;
3270 }
3271
3272 /**
3273  * ram_block_from_stream: read a RAMBlock id from the migration stream
3274  *
3275  * Must be called from within a rcu critical section.
3276  *
3277  * Returns a pointer from within the RCU-protected ram_list.
3278  *
3279  * @f: QEMUFile where to read the data from
3280  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3281  */
3282 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3283 {
3284     static RAMBlock *block = NULL;
3285     char id[256];
3286     uint8_t len;
3287
3288     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3289         if (!block) {
3290             error_report("Ack, bad migration stream!");
3291             return NULL;
3292         }
3293         return block;
3294     }
3295
3296     len = qemu_get_byte(f);
3297     qemu_get_buffer(f, (uint8_t *)id, len);
3298     id[len] = 0;
3299
3300     block = qemu_ram_block_by_name(id);
3301     if (!block) {
3302         error_report("Can't find block %s", id);
3303         return NULL;
3304     }
3305
3306     if (!qemu_ram_is_migratable(block)) {
3307         error_report("block %s should not be migrated !", id);
3308         return NULL;
3309     }
3310
3311     return block;
3312 }
3313
3314 static inline void *host_from_ram_block_offset(RAMBlock *block,
3315                                                ram_addr_t offset)
3316 {
3317     if (!offset_in_ramblock(block, offset)) {
3318         return NULL;
3319     }
3320
3321     return block->host + offset;
3322 }
3323
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when @ch is zero and the range already
 * reads as all zeroes; in every other case it is filled with @ch.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* already zero: skip the write */
        return;
    }

    memset(host, ch, size);
}
3340
3341 /* return the size after decompression, or negative value on error */
3342 static int
3343 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3344                      const uint8_t *source, size_t source_len)
3345 {
3346     int err;
3347
3348     err = inflateReset(stream);
3349     if (err != Z_OK) {
3350         return -1;
3351     }
3352
3353     stream->avail_in = source_len;
3354     stream->next_in = (uint8_t *)source;
3355     stream->avail_out = dest_len;
3356     stream->next_out = dest;
3357
3358     err = inflate(stream, Z_NO_FLUSH);
3359     if (err != Z_STREAM_END) {
3360         return -1;
3361     }
3362
3363     return stream->total_out;
3364 }
3365
/*
 * Thread function of a decompression worker.
 *
 * The worker sleeps on param->cond until param->des is set, decompresses
 * param->compbuf into that destination as one target page, then marks
 * itself done and signals decomp_done_cond.  It leaves the loop once
 * param->quit is set.
 *
 * @opaque: the DecompressParam of this worker
 *
 * Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /*
             * Copy the request out and clear the destination while the
             * mutex is held; the decompression itself runs unlocked.
             */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* A failure is only reported when error checking is enabled */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Publish completion under the shared "done" lock */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing to do: wait for a request or for the quit flag */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3404
3405 static int wait_for_decompress_done(void)
3406 {
3407     int idx, thread_count;
3408
3409     if (!migrate_use_compression()) {
3410         return 0;
3411     }
3412
3413     thread_count = migrate_decompress_threads();
3414     qemu_mutex_lock(&decomp_done_lock);
3415     for (idx = 0; idx < thread_count; idx++) {
3416         while (!decomp_param[idx].done) {
3417             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3418         }
3419     }
3420     qemu_mutex_unlock(&decomp_done_lock);
3421     return qemu_file_get_error(decomp_file);
3422 }
3423
3424 static void compress_threads_load_cleanup(void)
3425 {
3426     int i, thread_count;
3427
3428     if (!migrate_use_compression()) {
3429         return;
3430     }
3431     thread_count = migrate_decompress_threads();
3432     for (i = 0; i < thread_count; i++) {
3433         /*
3434          * we use it as a indicator which shows if the thread is
3435          * properly init'd or not
3436          */
3437         if (!decomp_param[i].compbuf) {
3438             break;
3439         }
3440
3441         qemu_mutex_lock(&decomp_param[i].mutex);
3442         decomp_param[i].quit = true;
3443         qemu_cond_signal(&decomp_param[i].cond);
3444         qemu_mutex_unlock(&decomp_param[i].mutex);
3445     }
3446     for (i = 0; i < thread_count; i++) {
3447         if (!decomp_param[i].compbuf) {
3448             break;
3449         }
3450
3451         qemu_thread_join(decompress_threads + i);
3452         qemu_mutex_destroy(&decomp_param[i].mutex);
3453         qemu_cond_destroy(&decomp_param[i].cond);
3454         inflateEnd(&decomp_param[i].stream);
3455         g_free(decomp_param[i].compbuf);
3456         decomp_param[i].compbuf = NULL;
3457     }
3458     g_free(decompress_threads);
3459     g_free(decomp_param);
3460     decompress_threads = NULL;
3461     decomp_param = NULL;
3462     decomp_file = NULL;
3463 }
3464
3465 static int compress_threads_load_setup(QEMUFile *f)
3466 {
3467     int i, thread_count;
3468
3469     if (!migrate_use_compression()) {
3470         return 0;
3471     }
3472
3473     thread_count = migrate_decompress_threads();
3474     decompress_threads = g_new0(QemuThread, thread_count);
3475     decomp_param = g_new0(DecompressParam, thread_count);
3476     qemu_mutex_init(&decomp_done_lock);
3477     qemu_cond_init(&decomp_done_cond);
3478     decomp_file = f;
3479     for (i = 0; i < thread_count; i++) {
3480         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3481             goto exit;
3482         }
3483
3484         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3485         qemu_mutex_init(&decomp_param[i].mutex);
3486         qemu_cond_init(&decomp_param[i].cond);
3487         decomp_param[i].done = true;
3488         decomp_param[i].quit = false;
3489         qemu_thread_create(decompress_threads + i, "decompress",
3490                            do_data_decompress, decomp_param + i,
3491                            QEMU_THREAD_JOINABLE);
3492     }
3493     return 0;
3494 exit:
3495     compress_threads_load_cleanup();
3496     return -1;
3497 }
3498
3499 static void decompress_data_with_multi_threads(QEMUFile *f,
3500                                                void *host, int len)
3501 {
3502     int idx, thread_count;
3503
3504     thread_count = migrate_decompress_threads();
3505     qemu_mutex_lock(&decomp_done_lock);
3506     while (true) {
3507         for (idx = 0; idx < thread_count; idx++) {
3508             if (decomp_param[idx].done) {
3509                 decomp_param[idx].done = false;
3510                 qemu_mutex_lock(&decomp_param[idx].mutex);
3511                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3512                 decomp_param[idx].des = host;
3513                 decomp_param[idx].len = len;
3514                 qemu_cond_signal(&decomp_param[idx].cond);
3515                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3516                 break;
3517             }
3518         }
3519         if (idx < thread_count) {
3520             break;
3521         } else {
3522             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3523         }
3524     }
3525     qemu_mutex_unlock(&decomp_done_lock);
3526 }
3527
3528 /**
3529  * ram_load_setup: Setup RAM for migration incoming side
3530  *
3531  * Returns zero to indicate success and negative for error
3532  *
3533  * @f: QEMUFile where to receive the data
3534  * @opaque: RAMState pointer
3535  */
3536 static int ram_load_setup(QEMUFile *f, void *opaque)
3537 {
3538     if (compress_threads_load_setup(f)) {
3539         return -1;
3540     }
3541
3542     xbzrle_load_setup();
3543     ramblock_recv_map_init();
3544     return 0;
3545 }
3546
3547 static int ram_load_cleanup(void *opaque)
3548 {
3549     RAMBlock *rb;
3550     xbzrle_load_cleanup();
3551     compress_threads_load_cleanup();
3552
3553     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3554         g_free(rb->receivedmap);
3555         rb->receivedmap = NULL;
3556     }
3557     return 0;
3558 }
3559
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error
 *
 * @mis: current migration incoming state
 *
 * Allocate the data structures etc. needed by incoming migration with
 * postcopy-ram.  This is a thin wrapper: postcopy-ram's similarly named
 * postcopy_ram_incoming_init() does the work and its return value is
 * passed straight through.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
3575
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads records until RAM_SAVE_FLAG_EOS is seen or an error occurs.
 * Target pages are collected in a temporary host page and the host page
 * is placed once its last target page has arrived.
 *
 * @f: QEMUFile where to read the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    /* Host address of the previous record, used for the ordering check */
    void *last_host = NULL;
    /* True while every target page of the current host page was zero */
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* The low bits of the word carry the flags, the rest the address */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                /* 1st TP within the HP: start a fresh all-zero assumption */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            /* One fill byte follows; the page is filled with it */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            /* Step back from the last target page to the host page start */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
3726
3727 static bool postcopy_is_advised(void)
3728 {
3729     PostcopyState ps = postcopy_state_get();
3730     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3731 }
3732
3733 static bool postcopy_is_running(void)
3734 {
3735     PostcopyState ps = postcopy_state_get();
3736     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3737 }
3738
3739 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3740 {
3741     int flags = 0, ret = 0, invalid_flags = 0;
3742     static uint64_t seq_iter;
3743     int len = 0;
3744     /*
3745      * If system is running in postcopy mode, page inserts to host memory must
3746      * be atomic
3747      */
3748     bool postcopy_running = postcopy_is_running();
3749     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3750     bool postcopy_advised = postcopy_is_advised();
3751
3752     seq_iter++;
3753
3754     if (version_id != 4) {
3755         ret = -EINVAL;
3756     }
3757
3758     if (!migrate_use_compression()) {
3759         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3760     }
3761     /* This RCU critical section can be very long running.
3762      * When RCU reclaims in the code start to become numerous,
3763      * it will be necessary to reduce the granularity of this
3764      * critical section.
3765      */
3766     rcu_read_lock();
3767
3768     if (postcopy_running) {
3769         ret = ram_load_postcopy(f);
3770     }
3771
3772     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3773         ram_addr_t addr, total_ram_bytes;
3774         void *host = NULL;
3775         uint8_t ch;
3776
3777         addr = qemu_get_be64(f);
3778         flags = addr & ~TARGET_PAGE_MASK;
3779         addr &= TARGET_PAGE_MASK;
3780
3781         if (flags & invalid_flags) {
3782             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3783                 error_report("Received an unexpected compressed page");
3784             }
3785
3786             ret = -EINVAL;
3787             break;
3788         }
3789
3790         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3791                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3792             RAMBlock *block = ram_block_from_stream(f, flags);
3793
3794             host = host_from_ram_block_offset(block, addr);
3795             if (!host) {
3796                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3797                 ret = -EINVAL;
3798                 break;
3799             }
3800             ramblock_recv_bitmap_set(block, host);
3801             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3802         }
3803
3804         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3805         case RAM_SAVE_FLAG_MEM_SIZE:
3806             /* Synchronize RAM block list */
3807             total_ram_bytes = addr;
3808             while (!ret && total_ram_bytes) {
3809                 RAMBlock *block;
3810                 char id[256];
3811                 ram_addr_t length;
3812
3813                 len = qemu_get_byte(f);
3814                 qemu_get_buffer(f, (uint8_t *)id, len);
3815                 id[len] = 0;
3816                 length = qemu_get_be64(f);
3817
3818                 block = qemu_ram_block_by_name(id);
3819                 if (block && !qemu_ram_is_migratable(block)) {
3820                     error_report("block %s should not be migrated !", id);
3821                     ret = -EINVAL;
3822                 } else if (block) {
3823                     if (length != block->used_length) {
3824                         Error *local_err = NULL;
3825
3826                         ret = qemu_ram_resize(block, length,
3827                                               &local_err);
3828                         if (local_err) {
3829                             error_report_err(local_err);
3830                         }
3831                     }
3832                     /* For postcopy we need to check hugepage sizes match */
3833                     if (postcopy_advised &&
3834                         block->page_size != qemu_host_page_size) {
3835                         uint64_t remote_page_size = qemu_get_be64(f);
3836                         if (remote_page_size != block->page_size) {
3837                             error_report("Mismatched RAM page size %s "
3838                                          "(local) %zd != %" PRId64,
3839                                          id, block->page_size,
3840                                          remote_page_size);
3841                             ret = -EINVAL;
3842                         }
3843                     }
3844                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3845                                           block->idstr);
3846                 } else {
3847                     error_report("Unknown ramblock \"%s\", cannot "
3848                                  "accept migration", id);
3849                     ret = -EINVAL;
3850                 }
3851
3852                 total_ram_bytes -= length;
3853             }
3854             break;
3855
3856         case RAM_SAVE_FLAG_ZERO:
3857             ch = qemu_get_byte(f);
3858             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3859             break;
3860
3861         case RAM_SAVE_FLAG_PAGE:
3862             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3863             break;
3864
3865         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3866             len = qemu_get_be32(f);
3867             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3868                 error_report("Invalid compressed data length: %d", len);
3869                 ret = -EINVAL;
3870                 break;
3871             }
3872             decompress_data_with_multi_threads(f, host, len);
3873             break;
3874
3875         case RAM_SAVE_FLAG_XBZRLE:
3876             if (load_xbzrle(f, addr, host) < 0) {
3877                 error_report("Failed to decompress XBZRLE page at "
3878                              RAM_ADDR_FMT, addr);
3879                 ret = -EINVAL;
3880                 break;
3881             }
3882             break;
3883         case RAM_SAVE_FLAG_EOS:
3884             /* normal exit */
3885             multifd_recv_sync_main();
3886             break;
3887         default:
3888             if (flags & RAM_SAVE_FLAG_HOOK) {
3889                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3890             } else {
3891                 error_report("Unknown combination of migration flags: %#x",
3892                              flags);
3893                 ret = -EINVAL;
3894             }
3895         }
3896         if (!ret) {
3897             ret = qemu_file_get_error(f);
3898         }
3899     }
3900
3901     ret |= wait_for_decompress_done();
3902     rcu_read_unlock();
3903     trace_ram_load_complete(ret, seq_iter);
3904     return ret;
3905 }
3906
/**
 * ram_has_postcopy: .has_postcopy handler for the "ram" section
 *
 * Returns the result of migrate_postcopy_ram()
 *
 * @opaque: unused
 */
static bool ram_has_postcopy(void *opaque)
{
    return migrate_postcopy_ram();
}
3911
3912 /* Sync all the dirty bitmap with destination VM.  */
3913 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3914 {
3915     RAMBlock *block;
3916     QEMUFile *file = s->to_dst_file;
3917     int ramblock_count = 0;
3918
3919     trace_ram_dirty_bitmap_sync_start();
3920
3921     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3922         qemu_savevm_send_recv_bitmap(file, block->idstr);
3923         trace_ram_dirty_bitmap_request(block->idstr);
3924         ramblock_count++;
3925     }
3926
3927     trace_ram_dirty_bitmap_sync_wait();
3928
3929     /* Wait until all the ramblocks' dirty bitmap synced */
3930     while (ramblock_count--) {
3931         qemu_sem_wait(&s->rp_state.rp_sem);
3932     }
3933
3934     trace_ram_dirty_bitmap_sync_complete();
3935
3936     return 0;
3937 }
3938
3939 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3940 {
3941     qemu_sem_post(&s->rp_state.rp_sem);
3942 }
3943
3944 /*
3945  * Read the received bitmap, revert it as the initial dirty bitmap.
3946  * This is only used when the postcopy migration is paused but wants
3947  * to resume from a middle point.
3948  */
3949 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3950 {
3951     int ret = -EINVAL;
3952     QEMUFile *file = s->rp_state.from_dst_file;
3953     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3954     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3955     uint64_t size, end_mark;
3956
3957     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3958
3959     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3960         error_report("%s: incorrect state %s", __func__,
3961                      MigrationStatus_str(s->state));
3962         return -EINVAL;
3963     }
3964
3965     /*
3966      * Note: see comments in ramblock_recv_bitmap_send() on why we
3967      * need the endianess convertion, and the paddings.
3968      */
3969     local_size = ROUND_UP(local_size, 8);
3970
3971     /* Add paddings */
3972     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3973
3974     size = qemu_get_be64(file);
3975
3976     /* The size of the bitmap should match with our ramblock */
3977     if (size != local_size) {
3978         error_report("%s: ramblock '%s' bitmap size mismatch "
3979                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3980                      block->idstr, size, local_size);
3981         ret = -EINVAL;
3982         goto out;
3983     }
3984
3985     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3986     end_mark = qemu_get_be64(file);
3987
3988     ret = qemu_file_get_error(file);
3989     if (ret || size != local_size) {
3990         error_report("%s: read bitmap failed for ramblock '%s': %d"
3991                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3992                      __func__, block->idstr, ret, local_size, size);
3993         ret = -EIO;
3994         goto out;
3995     }
3996
3997     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3998         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3999                      __func__, block->idstr, end_mark);
4000         ret = -EINVAL;
4001         goto out;
4002     }
4003
4004     /*
4005      * Endianess convertion. We are during postcopy (though paused).
4006      * The dirty bitmap won't change. We can directly modify it.
4007      */
4008     bitmap_from_le(block->bmap, le_bitmap, nbits);
4009
4010     /*
4011      * What we received is "received bitmap". Revert it as the initial
4012      * dirty bitmap for this ramblock.
4013      */
4014     bitmap_complement(block->bmap, block->bmap, nbits);
4015
4016     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4017
4018     /*
4019      * We succeeded to sync bitmap for current ramblock. If this is
4020      * the last one to sync, we need to notify the main send thread.
4021      */
4022     ram_dirty_bitmap_reload_notify(s);
4023
4024     ret = 0;
4025 out:
4026     g_free(le_bitmap);
4027     return ret;
4028 }
4029
4030 static int ram_resume_prepare(MigrationState *s, void *opaque)
4031 {
4032     RAMState *rs = *(RAMState **)opaque;
4033     int ret;
4034
4035     ret = ram_dirty_bitmap_sync_all(s, rs);
4036     if (ret) {
4037         return ret;
4038     }
4039
4040     ram_state_resume_prepare(rs, s->to_dst_file);
4041
4042     return 0;
4043 }
4044
4045 static SaveVMHandlers savevm_ram_handlers = {
4046     .save_setup = ram_save_setup,
4047     .save_live_iterate = ram_save_iterate,
4048     .save_live_complete_postcopy = ram_save_complete,
4049     .save_live_complete_precopy = ram_save_complete,
4050     .has_postcopy = ram_has_postcopy,
4051     .save_live_pending = ram_save_pending,
4052     .load_state = ram_load,
4053     .save_cleanup = ram_save_cleanup,
4054     .load_setup = ram_load_setup,
4055     .load_cleanup = ram_load_cleanup,
4056     .resume_prepare = ram_resume_prepare,
4057 };
4058
4059 void ram_mig_init(void)
4060 {
4061     qemu_mutex_init(&XBZRLE.lock);
4062     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4063 }