migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
  68
  69 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  70 #define RAM_SAVE_FLAG_ZERO     0x02
  71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  72 #define RAM_SAVE_FLAG_PAGE     0x08
  73 #define RAM_SAVE_FLAG_EOS      0x10
  74 #define RAM_SAVE_FLAG_CONTINUE 0x20
  75 #define RAM_SAVE_FLAG_XBZRLE   0x40
  76 /* 0x80 is reserved in migration.h start with 0x100 next */
  77 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
  84 XBZRLECacheStats xbzrle_counters;
  85
/*
 * Global XBZRLE state: the page cache together with the scratch
 * buffers used while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE; protected by @lock. */
    PageCache *cache;
    /* taken through XBZRLE_cache_lock() / XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* holds a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
 161 /* Should be holding either ram_list.mutex, or the RCU lock. */
 162 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 163     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 164         if (!qemu_ram_is_migratable(block)) {} else
 165
 166 #undef RAMBLOCK_FOREACH
 167
 168 static void ramblock_recv_map_init(void)
 169 {
 170     RAMBlock *rb;
 171
 172     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 173         assert(!rb->receivedmap);
 174         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 175     }
 176 }
 177
 178 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 179 {
 180     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 181                     rb->receivedmap);
 182 }
 183
 184 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 185 {
 186     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 187 }
 188
 189 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 190 {
 191     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 192 }
 193
 194 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 195                                     size_t nr)
 196 {
 197     bitmap_set_atomic(rb->receivedmap,
 198                       ramblock_recv_bitmap_offset(host_addr, rb),
 199                       nr);
 200 }
 201
 202 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 203
 204 /*
 205  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 206  *
 207  * Returns >0 if success with sent bytes, or <0 if error.
 208  */
 209 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 210                                   const char *block_name)
 211 {
 212     RAMBlock *block = qemu_ram_block_by_name(block_name);
 213     unsigned long *le_bitmap, nbits;
 214     uint64_t size;
 215
 216     if (!block) {
 217         error_report("%s: invalid block name: %s", __func__, block_name);
 218         return -1;
 219     }
 220
 221     nbits = block->used_length >> TARGET_PAGE_BITS;
 222
 223     /*
 224      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 225      * machines we may need 4 more bytes for padding (see below
 226      * comment). So extend it a bit before hand.
 227      */
 228     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 229
 230     /*
 231      * Always use little endian when sending the bitmap. This is
 232      * required that when source and destination VMs are not using the
 233      * same endianess. (Note: big endian won't work.)
 234      */
 235     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 236
 237     /* Size of the bitmap, in bytes */
 238     size = nbits / 8;
 239
 240     /*
 241      * size is always aligned to 8 bytes for 64bit machines, but it
 242      * may not be true for 32bit machines. We need this padding to
 243      * make sure the migration can survive even between 32bit and
 244      * 64bit machines.
 245      */
 246     size = ROUND_UP(size, 8);
 247
 248     qemu_put_be64(file, size);
 249     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 250     /*
 251      * Mark as an end, in case the middle part is screwed up due to
 252      * some "misterious" reason.
 253      */
 254     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 255     qemu_fflush(file);
 256
 257     g_free(le_bitmap);
 258
 259     if (qemu_file_get_error(file)) {
 260         return qemu_file_get_error(file);
 261     }
 262
 263     return size + sizeof(size);
 264 }
 265
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked into RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* start of the requested range (presumably relative to @rb - TODO confirm) */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* link to the next queued request */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 277
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* Last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* The following variables are used for bitmap sync */
    /* Last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* Bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* Number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* Number of dirty bits in the bitmap; see ram_bytes_remaining() */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 318
 319 static RAMState *ram_state;
 320
 321 uint64_t ram_bytes_remaining(void)
 322 {
 323     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 324                        0;
 325 }
 326
 327 MigrationStats ram_counters;
 328
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 339
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, after compressing a page */
    bool done;
    /* set to ask the worker to leave its loop */
    bool quit;
    /* dummy QEMUFile (empty ops) that collects the compressed output */
    QEMUFile *file;
    /* protects quit, block and offset; paired with @cond */
    QemuMutex mutex;
    /* the worker waits here while it has no block to compress */
    QemuCond cond;
    /* pending work item: block to compress, NULL when there is none */
    RAMBlock *block;
    /* pending work item: offset of the page, handed on together with @block */
    ram_addr_t offset;

    /* internally used fields */
    /* zlib stream, set up with deflateInit() and torn down with deflateEnd() */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed as source_buf to the compressor */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 354
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using this struct is outside this part of the
 * file; the field notes below are inferred from the CompressParam
 * counterpart and should be confirmed.
 */
struct DecompressParam {
    /* presumably set once the worker has finished its current page */
    bool done;
    /* presumably asks the worker to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input, @len bytes long */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 366
 367 static CompressParam *comp_param;
 368 static QemuThread *compress_threads;
 369 /* comp_done_cond is used to wake up the migration thread when
 370  * one of the compression threads has finished the compression.
 371  * comp_done_lock is used to co-work with comp_done_cond.
 372  */
 373 static QemuMutex comp_done_lock;
 374 static QemuCond comp_done_cond;
 375 /* The empty QEMUFileOps will be used by file in CompressParam */
 376 static const QEMUFileOps empty_ops = { };
 377
 378 static QEMUFile *decomp_file;
 379 static DecompressParam *decomp_param;
 380 static QemuThread *decompress_threads;
 381 static QemuMutex decomp_done_lock;
 382 static QemuCond decomp_done_cond;
 383
 384 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 385                                 ram_addr_t offset, uint8_t *source_buf);
 386
 387 static void *do_data_compress(void *opaque)
 388 {
 389     CompressParam *param = opaque;
 390     RAMBlock *block;
 391     ram_addr_t offset;
 392
 393     qemu_mutex_lock(&param->mutex);
 394     while (!param->quit) {
 395         if (param->block) {
 396             block = param->block;
 397             offset = param->offset;
 398             param->block = NULL;
 399             qemu_mutex_unlock(&param->mutex);
 400
 401             do_compress_ram_page(param->file, &param->stream, block, offset,
 402                                  param->originbuf);
 403
 404             qemu_mutex_lock(&comp_done_lock);
 405             param->done = true;
 406             qemu_cond_signal(&comp_done_cond);
 407             qemu_mutex_unlock(&comp_done_lock);
 408
 409             qemu_mutex_lock(&param->mutex);
 410         } else {
 411             qemu_cond_wait(&param->cond, &param->mutex);
 412         }
 413     }
 414     qemu_mutex_unlock(&param->mutex);
 415
 416     return NULL;
 417 }
 418
 419 static inline void terminate_compression_threads(void)
 420 {
 421     int idx, thread_count;
 422
 423     thread_count = migrate_compress_threads();
 424
 425     for (idx = 0; idx < thread_count; idx++) {
 426         qemu_mutex_lock(&comp_param[idx].mutex);
 427         comp_param[idx].quit = true;
 428         qemu_cond_signal(&comp_param[idx].cond);
 429         qemu_mutex_unlock(&comp_param[idx].mutex);
 430     }
 431 }
 432
 433 static void compress_threads_save_cleanup(void)
 434 {
 435     int i, thread_count;
 436
 437     if (!migrate_use_compression()) {
 438         return;
 439     }
 440     terminate_compression_threads();
 441     thread_count = migrate_compress_threads();
 442     for (i = 0; i < thread_count; i++) {
 443         /*
 444          * we use it as a indicator which shows if the thread is
 445          * properly init'd or not
 446          */
 447         if (!comp_param[i].file) {
 448             break;
 449         }
 450         qemu_thread_join(compress_threads + i);
 451         qemu_mutex_destroy(&comp_param[i].mutex);
 452         qemu_cond_destroy(&comp_param[i].cond);
 453         deflateEnd(&comp_param[i].stream);
 454         g_free(comp_param[i].originbuf);
 455         qemu_fclose(comp_param[i].file);
 456         comp_param[i].file = NULL;
 457     }
 458     qemu_mutex_destroy(&comp_done_lock);
 459     qemu_cond_destroy(&comp_done_cond);
 460     g_free(compress_threads);
 461     g_free(comp_param);
 462     compress_threads = NULL;
 463     comp_param = NULL;
 464 }
 465
 466 static int compress_threads_save_setup(void)
 467 {
 468     int i, thread_count;
 469
 470     if (!migrate_use_compression()) {
 471         return 0;
 472     }
 473     thread_count = migrate_compress_threads();
 474     compress_threads = g_new0(QemuThread, thread_count);
 475     comp_param = g_new0(CompressParam, thread_count);
 476     qemu_cond_init(&comp_done_cond);
 477     qemu_mutex_init(&comp_done_lock);
 478     for (i = 0; i < thread_count; i++) {
 479         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 480         if (!comp_param[i].originbuf) {
 481             goto exit;
 482         }
 483
 484         if (deflateInit(&comp_param[i].stream,
 485                         migrate_compress_level()) != Z_OK) {
 486             g_free(comp_param[i].originbuf);
 487             goto exit;
 488         }
 489
 490         /* comp_param[i].file is just used as a dummy buffer to save data,
 491          * set its ops to empty.
 492          */
 493         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 494         comp_param[i].done = true;
 495         comp_param[i].quit = false;
 496         qemu_mutex_init(&comp_param[i].mutex);
 497         qemu_cond_init(&comp_param[i].cond);
 498         qemu_thread_create(compress_threads + i, "compress",
 499                            do_data_compress, comp_param + i,
 500                            QEMU_THREAD_JOINABLE);
 501     }
 502     return 0;
 503
 504 exit:
 505     compress_threads_save_cleanup();
 506     return -1;
 507 }
 508
 509 /* Multiple fd's */
 510
 511 #define MULTIFD_MAGIC 0x11223344U
 512 #define MULTIFD_VERSION 1
 513
 514 #define MULTIFD_FLAG_SYNC (1 << 0)
 515
/*
 * First message sent on every multifd channel (see
 * multifd_send_initial_packet() / multifd_recv_initial_packet()).
 * magic and version are big endian on the wire.
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 522
/*
 * Header describing one batch of pages on a multifd channel.  Filled
 * by multifd_send_fill_packet() and decoded by
 * multifd_recv_unfill_packet(); all integer fields are big endian on
 * the wire.
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* the sender's migrate_multifd_page_count() */
    uint32_t size;
    /* number of valid entries in offset[] */
    uint32_t used;
    /* packet number assigned by the sender */
    uint64_t packet_num;
    /* idstr of the RAMBlock the pages belong to */
    char ramblock[256];
    /* offset of each page inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 533
/* A batch of pages that all belong to the same RAMBlock */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages, i.e. entries in offset[] and iov[] */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 547
/* Per-channel state of the multifd sending side */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 586
/* Per-channel state of the multifd receiving side */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets that went through this channel */
    uint64_t num_packets;
    /* pages that went through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 619
 620 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 621 {
 622     MultiFDInit_t msg;
 623     int ret;
 624
 625     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 626     msg.version = cpu_to_be32(MULTIFD_VERSION);
 627     msg.id = p->id;
 628     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 629
 630     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 631     if (ret != 0) {
 632         return -1;
 633     }
 634     return 0;
 635 }
 636
 637 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 638 {
 639     MultiFDInit_t msg;
 640     int ret;
 641
 642     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 643     if (ret != 0) {
 644         return -1;
 645     }
 646
 647     be32_to_cpus(&msg.magic);
 648     be32_to_cpus(&msg.version);
 649
 650     if (msg.magic != MULTIFD_MAGIC) {
 651         error_setg(errp, "multifd: received packet magic %x "
 652                    "expected %x", msg.magic, MULTIFD_MAGIC);
 653         return -1;
 654     }
 655
 656     if (msg.version != MULTIFD_VERSION) {
 657         error_setg(errp, "multifd: received packet version %d "
 658                    "expected %d", msg.version, MULTIFD_VERSION);
 659         return -1;
 660     }
 661
 662     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 663         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 664         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 665
 666         error_setg(errp, "multifd: received uuid '%s' and expected "
 667                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 668         g_free(uuid);
 669         g_free(msg_uuid);
 670         return -1;
 671     }
 672
 673     if (msg.id > migrate_multifd_channels()) {
 674         error_setg(errp, "multifd: received channel version %d "
 675                    "expected %d", msg.version, MULTIFD_VERSION);
 676         return -1;
 677     }
 678
 679     return msg.id;
 680 }
 681
 682 static MultiFDPages_t *multifd_pages_init(size_t size)
 683 {
 684     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 685
 686     pages->allocated = size;
 687     pages->iov = g_new0(struct iovec, size);
 688     pages->offset = g_new0(ram_addr_t, size);
 689
 690     return pages;
 691 }
 692
 693 static void multifd_pages_clear(MultiFDPages_t *pages)
 694 {
 695     pages->used = 0;
 696     pages->allocated = 0;
 697     pages->packet_num = 0;
 698     pages->block = NULL;
 699     g_free(pages->iov);
 700     pages->iov = NULL;
 701     g_free(pages->offset);
 702     pages->offset = NULL;
 703     g_free(pages);
 704 }
 705
 706 static void multifd_send_fill_packet(MultiFDSendParams *p)
 707 {
 708     MultiFDPacket_t *packet = p->packet;
 709     int i;
 710
 711     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 712     packet->version = cpu_to_be32(MULTIFD_VERSION);
 713     packet->flags = cpu_to_be32(p->flags);
 714     packet->size = cpu_to_be32(migrate_multifd_page_count());
 715     packet->used = cpu_to_be32(p->pages->used);
 716     packet->packet_num = cpu_to_be64(p->packet_num);
 717
 718     if (p->pages->block) {
 719         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 720     }
 721
 722     for (i = 0; i < p->pages->used; i++) {
 723         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 724     }
 725 }
 726
 727 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 728 {
 729     MultiFDPacket_t *packet = p->packet;
 730     RAMBlock *block;
 731     int i;
 732
 733     be32_to_cpus(&packet->magic);
 734     if (packet->magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet "
 736                    "magic %x and expected magic %x",
 737                    packet->magic, MULTIFD_MAGIC);
 738         return -1;
 739     }
 740
 741     be32_to_cpus(&packet->version);
 742     if (packet->version != MULTIFD_VERSION) {
 743         error_setg(errp, "multifd: received packet "
 744                    "version %d and expected version %d",
 745                    packet->version, MULTIFD_VERSION);
 746         return -1;
 747     }
 748
 749     p->flags = be32_to_cpu(packet->flags);
 750
 751     be32_to_cpus(&packet->size);
 752     if (packet->size > migrate_multifd_page_count()) {
 753         error_setg(errp, "multifd: received packet "
 754                    "with size %d and expected maximum size %d",
 755                    packet->size, migrate_multifd_page_count()) ;
 756         return -1;
 757     }
 758
 759     p->pages->used = be32_to_cpu(packet->used);
 760     if (p->pages->used > packet->size) {
 761         error_setg(errp, "multifd: received packet "
 762                    "with size %d and expected maximum size %d",
 763                    p->pages->used, packet->size) ;
 764         return -1;
 765     }
 766
 767     p->packet_num = be64_to_cpu(packet->packet_num);
 768
 769     if (p->pages->used) {
 770         /* make sure that ramblock is 0 terminated */
 771         packet->ramblock[255] = 0;
 772         block = qemu_ram_block_by_name(packet->ramblock);
 773         if (!block) {
 774             error_setg(errp, "multifd: unknown ram block %s",
 775                        packet->ramblock);
 776             return -1;
 777         }
 778     }
 779
 780     for (i = 0; i < p->pages->used; i++) {
 781         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 782
 783         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 784             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 785                        " (max " RAM_ADDR_FMT ")",
 786                        offset, block->max_length);
 787             return -1;
 788         }
 789         p->pages->iov[i].iov_base = block->host + offset;
 790         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 791     }
 792
 793     return 0;
 794 }
 795
/* Global state of the multifd sending side */
struct {
    /* per-channel parameters, indexed by channel number */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send; the batch currently being filled */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 809
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each
 * time we need to send a batch of pages we swap the one in
 * multifd_send_state with the one of the channel that is sending it.
 * There are two reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * must have finished with its own, otherwise pending_job can't be
 * false.
 */
 827
 828 static void multifd_send_pages(void)
 829 {
 830     int i;
 831     static int next_channel;
 832     MultiFDSendParams *p = NULL; /* make happy gcc */
 833     MultiFDPages_t *pages = multifd_send_state->pages;
 834     uint64_t transferred;
 835
 836     qemu_sem_wait(&multifd_send_state->channels_ready);
 837     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 838         p = &multifd_send_state->params[i];
 839
 840         qemu_mutex_lock(&p->mutex);
 841         if (!p->pending_job) {
 842             p->pending_job++;
 843             next_channel = (i + 1) % migrate_multifd_channels();
 844             break;
 845         }
 846         qemu_mutex_unlock(&p->mutex);
 847     }
 848     p->pages->used = 0;
 849
 850     p->packet_num = multifd_send_state->packet_num++;
 851     p->pages->block = NULL;
 852     multifd_send_state->pages = p->pages;
 853     p->pages = pages;
 854     transferred = pages->used * TARGET_PAGE_SIZE + p->packet_len;
 855     ram_counters.multifd_bytes += transferred;
 856     ram_counters.transferred += transferred;;
 857     qemu_mutex_unlock(&p->mutex);
 858     qemu_sem_post(&p->sem);
 859 }
 860
/*
 * Queue the page at @offset inside @block for sending over multifd.
 *
 * Pages accumulate in multifd_send_state->pages, and a batch only
 * holds pages of a single RAMBlock.  The batch is handed to a channel
 * with multifd_send_pages() once it is full, or when a page of a
 * different block shows up; in the latter case the page is queued
 * again into the fresh batch.
 */
static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
{
    MultiFDPages_t *pages = multifd_send_state->pages;

    /* an empty batch adopts the block of the first page queued */
    if (!pages->block) {
        pages->block = block;
    }

    if (pages->block == block) {
        pages->offset[pages->used] = offset;
        pages->iov[pages->used].iov_base = block->host + offset;
        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
        pages->used++;

        /* still room in the batch: nothing to send yet */
        if (pages->used < pages->allocated) {
            return;
        }
    }

    /* the batch is full, or @block differs from the batch's block */
    multifd_send_pages();

    /*
     * @pages still points at the batch that was just handed over.  If
     * its block differs from @block, the page was not added above, so
     * queue it again; the recursive call picks up the new
     * multifd_send_state->pages.
     */
    if (pages->block != block) {
        multifd_queue_page(block, offset);
    }
}
 886
 887 static void multifd_send_terminate_threads(Error *err)
 888 {
 889     int i;
 890
 891     if (err) {
 892         MigrationState *s = migrate_get_current();
 893         migrate_set_error(s, err);
 894         if (s->state == MIGRATION_STATUS_SETUP ||
 895             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 896             s->state == MIGRATION_STATUS_DEVICE ||
 897             s->state == MIGRATION_STATUS_ACTIVE) {
 898             migrate_set_state(&s->state, s->state,
 899                               MIGRATION_STATUS_FAILED);
 900         }
 901     }
 902
 903     for (i = 0; i < migrate_multifd_channels(); i++) {
 904         MultiFDSendParams *p = &multifd_send_state->params[i];
 905
 906         qemu_mutex_lock(&p->mutex);
 907         p->quit = true;
 908         qemu_sem_post(&p->sem);
 909         qemu_mutex_unlock(&p->mutex);
 910     }
 911 }
 912
 913 int multifd_save_cleanup(Error **errp)
 914 {
 915     int i;
 916     int ret = 0;
 917
 918     if (!migrate_use_multifd()) {
 919         return 0;
 920     }
 921     multifd_send_terminate_threads(NULL);
 922     for (i = 0; i < migrate_multifd_channels(); i++) {
 923         MultiFDSendParams *p = &multifd_send_state->params[i];
 924
 925         if (p->running) {
 926             qemu_thread_join(&p->thread);
 927         }
 928         socket_send_channel_destroy(p->c);
 929         p->c = NULL;
 930         qemu_mutex_destroy(&p->mutex);
 931         qemu_sem_destroy(&p->sem);
 932         qemu_sem_destroy(&p->sem_sync);
 933         g_free(p->name);
 934         p->name = NULL;
 935         multifd_pages_clear(p->pages);
 936         p->pages = NULL;
 937         p->packet_len = 0;
 938         g_free(p->packet);
 939         p->packet = NULL;
 940     }
 941     qemu_sem_destroy(&multifd_send_state->channels_ready);
 942     qemu_sem_destroy(&multifd_send_state->sem_sync);
 943     g_free(multifd_send_state->params);
 944     multifd_send_state->params = NULL;
 945     multifd_pages_clear(multifd_send_state->pages);
 946     multifd_send_state->pages = NULL;
 947     g_free(multifd_send_state);
 948     multifd_send_state = NULL;
 949     return ret;
 950 }
 951
 952 static void multifd_send_sync_main(void)
 953 {
 954     int i;
 955
 956     if (!migrate_use_multifd()) {
 957         return;
 958     }
 959     if (multifd_send_state->pages->used) {
 960         multifd_send_pages();
 961     }
 962     for (i = 0; i < migrate_multifd_channels(); i++) {
 963         MultiFDSendParams *p = &multifd_send_state->params[i];
 964
 965         trace_multifd_send_sync_main_signal(p->id);
 966
 967         qemu_mutex_lock(&p->mutex);
 968
 969         p->packet_num = multifd_send_state->packet_num++;
 970         p->flags |= MULTIFD_FLAG_SYNC;
 971         p->pending_job++;
 972         qemu_mutex_unlock(&p->mutex);
 973         qemu_sem_post(&p->sem);
 974     }
 975     for (i = 0; i < migrate_multifd_channels(); i++) {
 976         MultiFDSendParams *p = &multifd_send_state->params[i];
 977
 978         trace_multifd_send_sync_main_wait(p->id);
 979         qemu_sem_wait(&multifd_send_state->sem_sync);
 980     }
 981     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 982 }
 983
 984 static void *multifd_send_thread(void *opaque)
 985 {
 986     MultiFDSendParams *p = opaque;
 987     Error *local_err = NULL;
 988     int ret;
 989
 990     trace_multifd_send_thread_start(p->id);
 991
 992     if (multifd_send_initial_packet(p, &local_err) < 0) {
 993         goto out;
 994     }
 995     /* initial packet */
 996     p->num_packets = 1;
 997
 998     while (true) {
 999         qemu_sem_wait(&p->sem);
1000         qemu_mutex_lock(&p->mutex);
1001
1002         if (p->pending_job) {
1003             uint32_t used = p->pages->used;
1004             uint64_t packet_num = p->packet_num;
1005             uint32_t flags = p->flags;
1006
1007             multifd_send_fill_packet(p);
1008             p->flags = 0;
1009             p->num_packets++;
1010             p->num_pages += used;
1011             p->pages->used = 0;
1012             qemu_mutex_unlock(&p->mutex);
1013
1014             trace_multifd_send(p->id, packet_num, used, flags);
1015
1016             ret = qio_channel_write_all(p->c, (void *)p->packet,
1017                                         p->packet_len, &local_err);
1018             if (ret != 0) {
1019                 break;
1020             }
1021
1022             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1023             if (ret != 0) {
1024                 break;
1025             }
1026
1027             qemu_mutex_lock(&p->mutex);
1028             p->pending_job--;
1029             qemu_mutex_unlock(&p->mutex);
1030
1031             if (flags & MULTIFD_FLAG_SYNC) {
1032                 qemu_sem_post(&multifd_send_state->sem_sync);
1033             }
1034             qemu_sem_post(&multifd_send_state->channels_ready);
1035         } else if (p->quit) {
1036             qemu_mutex_unlock(&p->mutex);
1037             break;
1038         } else {
1039             qemu_mutex_unlock(&p->mutex);
1040             /* sometimes there are spurious wakeups */
1041         }
1042     }
1043
1044 out:
1045     if (local_err) {
1046         multifd_send_terminate_threads(local_err);
1047     }
1048
1049     qemu_mutex_lock(&p->mutex);
1050     p->running = false;
1051     qemu_mutex_unlock(&p->mutex);
1052
1053     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1054
1055     return NULL;
1056 }
1057
1058 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1059 {
1060     MultiFDSendParams *p = opaque;
1061     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1062     Error *local_err = NULL;
1063
1064     if (qio_task_propagate_error(task, &local_err)) {
1065         if (multifd_save_cleanup(&local_err) != 0) {
1066             migrate_set_error(migrate_get_current(), local_err);
1067         }
1068     } else {
1069         p->c = QIO_CHANNEL(sioc);
1070         qio_channel_set_delay(p->c, false);
1071         p->running = true;
1072         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1073                            QEMU_THREAD_JOINABLE);
1074
1075         atomic_inc(&multifd_send_state->count);
1076     }
1077 }
1078
1079 int multifd_save_setup(void)
1080 {
1081     int thread_count;
1082     uint32_t page_count = migrate_multifd_page_count();
1083     uint8_t i;
1084
1085     if (!migrate_use_multifd()) {
1086         return 0;
1087     }
1088     thread_count = migrate_multifd_channels();
1089     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1090     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1091     atomic_set(&multifd_send_state->count, 0);
1092     multifd_send_state->pages = multifd_pages_init(page_count);
1093     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1094     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1095
1096     for (i = 0; i < thread_count; i++) {
1097         MultiFDSendParams *p = &multifd_send_state->params[i];
1098
1099         qemu_mutex_init(&p->mutex);
1100         qemu_sem_init(&p->sem, 0);
1101         qemu_sem_init(&p->sem_sync, 0);
1102         p->quit = false;
1103         p->pending_job = 0;
1104         p->id = i;
1105         p->pages = multifd_pages_init(page_count);
1106         p->packet_len = sizeof(MultiFDPacket_t)
1107                       + sizeof(ram_addr_t) * page_count;
1108         p->packet = g_malloc0(p->packet_len);
1109         p->name = g_strdup_printf("multifdsend_%d", i);
1110         socket_send_channel_create(multifd_new_send_channel_async, p);
1111     }
1112     return 0;
1113 }
1114
/* Receive-side multifd state shared by the main thread and the channels */
struct {
    /* per-channel parameters, indexed by channel id */
    MultiFDRecvParams *params;
    /* number of created threads; updated and read with atomic helpers */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /*
     * global number of generated multifd packets; on this side it is
     * raised to the largest per-channel packet_num at each sync
     */
    uint64_t packet_num;
} *multifd_recv_state;
1124
1125 static void multifd_recv_terminate_threads(Error *err)
1126 {
1127     int i;
1128
1129     if (err) {
1130         MigrationState *s = migrate_get_current();
1131         migrate_set_error(s, err);
1132         if (s->state == MIGRATION_STATUS_SETUP ||
1133             s->state == MIGRATION_STATUS_ACTIVE) {
1134             migrate_set_state(&s->state, s->state,
1135                               MIGRATION_STATUS_FAILED);
1136         }
1137     }
1138
1139     for (i = 0; i < migrate_multifd_channels(); i++) {
1140         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1141
1142         qemu_mutex_lock(&p->mutex);
1143         /* We could arrive here for two reasons:
1144            - normal quit, i.e. everything went fine, just finished
1145            - error quit: We close the channels so the channel threads
1146              finish the qio_channel_read_all_eof() */
1147         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1148         qemu_mutex_unlock(&p->mutex);
1149     }
1150 }
1151
1152 int multifd_load_cleanup(Error **errp)
1153 {
1154     int i;
1155     int ret = 0;
1156
1157     if (!migrate_use_multifd()) {
1158         return 0;
1159     }
1160     multifd_recv_terminate_threads(NULL);
1161     for (i = 0; i < migrate_multifd_channels(); i++) {
1162         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1163
1164         if (p->running) {
1165             qemu_thread_join(&p->thread);
1166         }
1167         object_unref(OBJECT(p->c));
1168         p->c = NULL;
1169         qemu_mutex_destroy(&p->mutex);
1170         qemu_sem_destroy(&p->sem_sync);
1171         g_free(p->name);
1172         p->name = NULL;
1173         multifd_pages_clear(p->pages);
1174         p->pages = NULL;
1175         p->packet_len = 0;
1176         g_free(p->packet);
1177         p->packet = NULL;
1178     }
1179     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1180     g_free(multifd_recv_state->params);
1181     multifd_recv_state->params = NULL;
1182     g_free(multifd_recv_state);
1183     multifd_recv_state = NULL;
1184
1185     return ret;
1186 }
1187
1188 static void multifd_recv_sync_main(void)
1189 {
1190     int i;
1191
1192     if (!migrate_use_multifd()) {
1193         return;
1194     }
1195     for (i = 0; i < migrate_multifd_channels(); i++) {
1196         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1197
1198         trace_multifd_recv_sync_main_wait(p->id);
1199         qemu_sem_wait(&multifd_recv_state->sem_sync);
1200         qemu_mutex_lock(&p->mutex);
1201         if (multifd_recv_state->packet_num < p->packet_num) {
1202             multifd_recv_state->packet_num = p->packet_num;
1203         }
1204         qemu_mutex_unlock(&p->mutex);
1205     }
1206     for (i = 0; i < migrate_multifd_channels(); i++) {
1207         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1208
1209         trace_multifd_recv_sync_main_signal(p->id);
1210         qemu_sem_post(&p->sem_sync);
1211     }
1212     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1213 }
1214
1215 static void *multifd_recv_thread(void *opaque)
1216 {
1217     MultiFDRecvParams *p = opaque;
1218     Error *local_err = NULL;
1219     int ret;
1220
1221     trace_multifd_recv_thread_start(p->id);
1222
1223     while (true) {
1224         uint32_t used;
1225         uint32_t flags;
1226
1227         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1228                                        p->packet_len, &local_err);
1229         if (ret == 0) {   /* EOF */
1230             break;
1231         }
1232         if (ret == -1) {   /* Error */
1233             break;
1234         }
1235
1236         qemu_mutex_lock(&p->mutex);
1237         ret = multifd_recv_unfill_packet(p, &local_err);
1238         if (ret) {
1239             qemu_mutex_unlock(&p->mutex);
1240             break;
1241         }
1242
1243         used = p->pages->used;
1244         flags = p->flags;
1245         trace_multifd_recv(p->id, p->packet_num, used, flags);
1246         p->num_packets++;
1247         p->num_pages += used;
1248         qemu_mutex_unlock(&p->mutex);
1249
1250         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1251         if (ret != 0) {
1252             break;
1253         }
1254
1255         if (flags & MULTIFD_FLAG_SYNC) {
1256             qemu_sem_post(&multifd_recv_state->sem_sync);
1257             qemu_sem_wait(&p->sem_sync);
1258         }
1259     }
1260
1261     if (local_err) {
1262         multifd_recv_terminate_threads(local_err);
1263     }
1264     qemu_mutex_lock(&p->mutex);
1265     p->running = false;
1266     qemu_mutex_unlock(&p->mutex);
1267
1268     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1269
1270     return NULL;
1271 }
1272
1273 int multifd_load_setup(void)
1274 {
1275     int thread_count;
1276     uint32_t page_count = migrate_multifd_page_count();
1277     uint8_t i;
1278
1279     if (!migrate_use_multifd()) {
1280         return 0;
1281     }
1282     thread_count = migrate_multifd_channels();
1283     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1284     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1285     atomic_set(&multifd_recv_state->count, 0);
1286     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1287
1288     for (i = 0; i < thread_count; i++) {
1289         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1290
1291         qemu_mutex_init(&p->mutex);
1292         qemu_sem_init(&p->sem_sync, 0);
1293         p->id = i;
1294         p->pages = multifd_pages_init(page_count);
1295         p->packet_len = sizeof(MultiFDPacket_t)
1296                       + sizeof(ram_addr_t) * page_count;
1297         p->packet = g_malloc0(p->packet_len);
1298         p->name = g_strdup_printf("multifdrecv_%d", i);
1299     }
1300     return 0;
1301 }
1302
1303 bool multifd_recv_all_channels_created(void)
1304 {
1305     int thread_count = migrate_multifd_channels();
1306
1307     if (!migrate_use_multifd()) {
1308         return true;
1309     }
1310
1311     return thread_count == atomic_read(&multifd_recv_state->count);
1312 }
1313
1314 void multifd_recv_new_channel(QIOChannel *ioc)
1315 {
1316     MultiFDRecvParams *p;
1317     Error *local_err = NULL;
1318     int id;
1319
1320     id = multifd_recv_initial_packet(ioc, &local_err);
1321     if (id < 0) {
1322         multifd_recv_terminate_threads(local_err);
1323         return;
1324     }
1325
1326     p = &multifd_recv_state->params[id];
1327     if (p->c != NULL) {
1328         error_setg(&local_err, "multifd: received id '%d' already setup'",
1329                    id);
1330         multifd_recv_terminate_threads(local_err);
1331         return;
1332     }
1333     p->c = ioc;
1334     object_ref(OBJECT(ioc));
1335     /* initial packet */
1336     p->num_packets = 1;
1337
1338     p->running = true;
1339     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1340                        QEMU_THREAD_JOINABLE);
1341     atomic_inc(&multifd_recv_state->count);
1342     if (multifd_recv_state->count == migrate_multifd_channels()) {
1343         migration_incoming_process();
1344     }
1345 }
1346
1347 /**
1348  * save_page_header: write page header to wire
1349  *
1350  * If this is the 1st block, it also writes the block identification
1351  *
1352  * Returns the number of bytes written
1353  *
1354  * @f: QEMUFile where to send the data
1355  * @block: block that contains the page we want to send
1356  * @offset: offset inside the block for the page
1357  *          in the lower bits, it contains flags
1358  */
1359 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1360                                ram_addr_t offset)
1361 {
1362     size_t size, len;
1363
1364     if (block == rs->last_sent_block) {
1365         offset |= RAM_SAVE_FLAG_CONTINUE;
1366     }
1367     qemu_put_be64(f, offset);
1368     size = 8;
1369
1370     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1371         len = strlen(block->idstr);
1372         qemu_put_byte(f, len);
1373         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1374         size += 1 + len;
1375         rs->last_sent_block = block;
1376     }
1377     return size;
1378 }
1379
1380 /**
1381  * mig_throttle_guest_down: throotle down the guest
1382  *
1383  * Reduce amount of guest cpu execution to hopefully slow down memory
1384  * writes. If guest dirty memory rate is reduced below the rate at
1385  * which we can transfer pages to the destination then we should be
1386  * able to complete migration. Some workloads dirty memory way too
1387  * fast and will not effectively converge, even with auto-converge.
1388  */
1389 static void mig_throttle_guest_down(void)
1390 {
1391     MigrationState *s = migrate_get_current();
1392     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1393     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1394
1395     /* We have not started throttling yet. Let's start it. */
1396     if (!cpu_throttle_active()) {
1397         cpu_throttle_set(pct_initial);
1398     } else {
1399         /* Throttling already on, just increase the rate */
1400         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
1401     }
1402 }
1403
1404 /**
1405  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1406  *
1407  * @rs: current RAM state
1408  * @current_addr: address for the zero page
1409  *
1410  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1411  * The important thing is that a stale (not-yet-0'd) page be replaced
1412  * by the new data.
1413  * As a bonus, if the page wasn't in the cache it gets added so that
1414  * when a small write is made into the 0'd page it gets XBZRLE sent.
1415  */
1416 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1417 {
1418     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1419         return;
1420     }
1421
1422     /* We don't care if this fails to allocate a new cache page
1423      * as long as it updated an old one */
1424     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1425                  ram_counters.dirty_sync_count);
1426 }
1427
1428 #define ENCODING_FLAG_XBZRLE 0x1
1429
1430 /**
1431  * save_xbzrle_page: compress and send current page
1432  *
1433  * Returns: 1 means that we wrote the page
1434  *          0 means that page is identical to the one already sent
1435  *          -1 means that xbzrle would be longer than normal
1436  *
1437  * @rs: current RAM state
1438  * @current_data: pointer to the address of the page contents
1439  * @current_addr: addr of the page
1440  * @block: block that contains the page we want to send
1441  * @offset: offset inside the block for the page
1442  * @last_stage: if we are at the completion stage
1443  */
1444 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1445                             ram_addr_t current_addr, RAMBlock *block,
1446                             ram_addr_t offset, bool last_stage)
1447 {
1448     int encoded_len = 0, bytes_xbzrle;
1449     uint8_t *prev_cached_page;
1450
1451     if (!cache_is_cached(XBZRLE.cache, current_addr,
1452                          ram_counters.dirty_sync_count)) {
1453         xbzrle_counters.cache_miss++;
1454         if (!last_stage) {
1455             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1456                              ram_counters.dirty_sync_count) == -1) {
1457                 return -1;
1458             } else {
1459                 /* update *current_data when the page has been
1460                    inserted into cache */
1461                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1462             }
1463         }
1464         return -1;
1465     }
1466
1467     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1468
1469     /* save current buffer into memory */
1470     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1471
1472     /* XBZRLE encoding (if there is no overflow) */
1473     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1474                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1475                                        TARGET_PAGE_SIZE);
1476     if (encoded_len == 0) {
1477         trace_save_xbzrle_page_skipping();
1478         return 0;
1479     } else if (encoded_len == -1) {
1480         trace_save_xbzrle_page_overflow();
1481         xbzrle_counters.overflow++;
1482         /* update data in the cache */
1483         if (!last_stage) {
1484             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1485             *current_data = prev_cached_page;
1486         }
1487         return -1;
1488     }
1489
1490     /* we need to update the data in the cache, in order to get the same data */
1491     if (!last_stage) {
1492         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1493     }
1494
1495     /* Send XBZRLE based compressed page */
1496     bytes_xbzrle = save_page_header(rs, rs->f, block,
1497                                     offset | RAM_SAVE_FLAG_XBZRLE);
1498     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1499     qemu_put_be16(rs->f, encoded_len);
1500     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1501     bytes_xbzrle += encoded_len + 1 + 2;
1502     xbzrle_counters.pages++;
1503     xbzrle_counters.bytes += bytes_xbzrle;
1504     ram_counters.transferred += bytes_xbzrle;
1505
1506     return 1;
1507 }
1508
1509 /**
1510  * migration_bitmap_find_dirty: find the next dirty page from start
1511  *
1512  * Called with rcu_read_lock() to protect migration_bitmap
1513  *
1514  * Returns the byte offset within memory region of the start of a dirty page
1515  *
1516  * @rs: current RAM state
1517  * @rb: RAMBlock where to search for dirty pages
1518  * @start: page where we start the search
1519  */
1520 static inline
1521 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1522                                           unsigned long start)
1523 {
1524     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1525     unsigned long *bitmap = rb->bmap;
1526     unsigned long next;
1527
1528     if (!qemu_ram_is_migratable(rb)) {
1529         return size;
1530     }
1531
1532     if (rs->ram_bulk_stage && start > 0) {
1533         next = start + 1;
1534     } else {
1535         next = find_next_bit(bitmap, size, start);
1536     }
1537
1538     return next;
1539 }
1540
1541 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1542                                                 RAMBlock *rb,
1543                                                 unsigned long page)
1544 {
1545     bool ret;
1546
1547     ret = test_and_clear_bit(page, rb->bmap);
1548
1549     if (ret) {
1550         rs->migration_dirty_pages--;
1551     }
1552     return ret;
1553 }
1554
1555 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1556                                         ram_addr_t start, ram_addr_t length)
1557 {
1558     rs->migration_dirty_pages +=
1559         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1560                                               &rs->num_dirty_pages_period);
1561 }
1562
1563 /**
1564  * ram_pagesize_summary: calculate all the pagesizes of a VM
1565  *
1566  * Returns a summary bitmap of the page sizes of all RAMBlocks
1567  *
1568  * For VMs with just normal pages this is equivalent to the host page
1569  * size. If it's got some huge pages then it's the OR of all the
1570  * different page sizes.
1571  */
1572 uint64_t ram_pagesize_summary(void)
1573 {
1574     RAMBlock *block;
1575     uint64_t summary = 0;
1576
1577     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1578         summary |= block->page_size;
1579     }
1580
1581     return summary;
1582 }
1583
1584 static void migration_update_rates(RAMState *rs, int64_t end_time)
1585 {
1586     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1587
1588     /* calculate period counters */
1589     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1590                 / (end_time - rs->time_last_bitmap_sync);
1591
1592     if (!iter_count) {
1593         return;
1594     }
1595
1596     if (migrate_use_xbzrle()) {
1597         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1598             rs->xbzrle_cache_miss_prev) / iter_count;
1599         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1600     }
1601 }
1602
/*
 * Sync the dirty log into the per-block migration bitmaps.
 *
 * Bumps the sync count, syncs the global dirty log, folds it into every
 * migratable block's bitmap and refreshes the remaining-bytes counter.
 * At most once per second it also runs the auto-converge check, updates
 * the rate counters and resets the period counters.  Finally it emits
 * the migration-pass event when events are enabled.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* first sync: the measuring period starts now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* the per-block sync runs under the bitmap mutex and the RCU lock */
    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    ram_counters.remaining = ram_bytes_remaining();
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes exceed 50% of the approx.
               amount of bytes that just got transferred since the last time
               we were in this routine. If that happens twice, start or
               increase throttling. Note that the counter is only incremented
               when the first half of the condition holds. */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        /* must run before time_last_bitmap_sync is reset below */
        migration_update_rates(rs, end_time);

        rs->iterations_prev = rs->iterations;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
1667
1668 /**
1669  * save_zero_page: send the zero page to the stream
1670  *
1671  * Returns the number of pages written.
1672  *
1673  * @rs: current RAM state
1674  * @block: block that contains the page we want to send
1675  * @offset: offset inside the block for the page
1676  */
1677 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1678 {
1679     uint8_t *p = block->host + offset;
1680     int pages = -1;
1681
1682     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1683         ram_counters.duplicate++;
1684         ram_counters.transferred +=
1685             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1686         qemu_put_byte(rs->f, 0);
1687         ram_counters.transferred += 1;
1688         pages = 1;
1689     }
1690
1691     return pages;
1692 }
1693
1694 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1695 {
1696     if (!migrate_release_ram() || !migration_in_postcopy()) {
1697         return;
1698     }
1699
1700     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1701 }
1702
1703 /*
1704  * @pages: the number of pages written by the control path,
1705  *        < 0 - error
1706  *        > 0 - number of pages written
1707  *
1708  * Return true if the pages has been saved, otherwise false is returned.
1709  */
1710 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1711                               int *pages)
1712 {
1713     uint64_t bytes_xmit = 0;
1714     int ret;
1715
1716     *pages = -1;
1717     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1718                                 &bytes_xmit);
1719     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1720         return false;
1721     }
1722
1723     if (bytes_xmit) {
1724         ram_counters.transferred += bytes_xmit;
1725         *pages = 1;
1726     }
1727
1728     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1729         return true;
1730     }
1731
1732     if (bytes_xmit > 0) {
1733         ram_counters.normal++;
1734     } else if (bytes_xmit == 0) {
1735         ram_counters.duplicate++;
1736     }
1737
1738     return true;
1739 }
1740
1741 /*
1742  * directly send the page to the stream
1743  *
1744  * Returns the number of pages written.
1745  *
1746  * @rs: current RAM state
1747  * @block: block that contains the page we want to send
1748  * @offset: offset inside the block for the page
1749  * @buf: the page to be sent
1750  * @async: send to page asyncly
1751  */
1752 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1753                             uint8_t *buf, bool async)
1754 {
1755     ram_counters.transferred += save_page_header(rs, rs->f, block,
1756                                                  offset | RAM_SAVE_FLAG_PAGE);
1757     if (async) {
1758         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1759                               migrate_release_ram() &
1760                               migration_in_postcopy());
1761     } else {
1762         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1763     }
1764     ram_counters.transferred += TARGET_PAGE_SIZE;
1765     ram_counters.normal++;
1766     return 1;
1767 }
1768
1769 /**
1770  * ram_save_page: send the given page to the stream
1771  *
1772  * Returns the number of pages written.
1773  *          < 0 - error
1774  *          >=0 - Number of pages written - this might legally be 0
1775  *                if xbzrle noticed the page was the same.
1776  *
1777  * @rs: current RAM state
1778  * @block: block that contains the page we want to send
1779  * @offset: offset inside the block for the page
1780  * @last_stage: if we are at the completion stage
1781  */
1782 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1783 {
1784     int pages = -1;
1785     uint8_t *p;
1786     bool send_async = true;
1787     RAMBlock *block = pss->block;
1788     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1789     ram_addr_t current_addr = block->offset + offset;
1790
1791     p = block->host + offset;
1792     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1793
1794     XBZRLE_cache_lock();
1795     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1796         migrate_use_xbzrle()) {
1797         pages = save_xbzrle_page(rs, &p, current_addr, block,
1798                                  offset, last_stage);
1799         if (!last_stage) {
1800             /* Can't send this cached data async, since the cache page
1801              * might get updated before it gets to the wire
1802              */
1803             send_async = false;
1804         }
1805     }
1806
1807     /* XBZRLE overflow or normal page */
1808     if (pages == -1) {
1809         pages = save_normal_page(rs, block, offset, p, send_async);
1810     }
1811
1812     XBZRLE_cache_unlock();
1813
1814     return pages;
1815 }
1816
1817 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1818                                  ram_addr_t offset)
1819 {
1820     multifd_queue_page(block, offset);
1821     ram_counters.normal++;
1822
1823     return 1;
1824 }
1825
1826 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1827                                 ram_addr_t offset, uint8_t *source_buf)
1828 {
1829     RAMState *rs = ram_state;
1830     int bytes_sent, blen;
1831     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1832
1833     bytes_sent = save_page_header(rs, f, block, offset |
1834                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1835
1836     /*
1837      * copy it to a internal buffer to avoid it being modified by VM
1838      * so that we can catch up the error during compression and
1839      * decompression
1840      */
1841     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1842     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1843     if (blen < 0) {
1844         bytes_sent = 0;
1845         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1846         error_report("compressed data failed!");
1847     } else {
1848         bytes_sent += blen;
1849         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1850     }
1851
1852     return bytes_sent;
1853 }
1854
/**
 * flush_compressed_data: drain the output of every compression thread
 *
 * Does nothing unless migrate_use_compression() is true.  Otherwise it
 * waits until every comp_param[] slot reports done, then passes each
 * slot's file to qemu_put_qemu_file() together with rs->f and adds the
 * returned length to ram_counters.transferred.
 *
 * @rs: current RAM state
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Phase 1: wait, under comp_done_lock, for every slot to be done */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Phase 2: collect each slot's output while holding that slot's mutex */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        /* Slots flagged quit are skipped */
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            ram_counters.transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
1881
/**
 * set_compress_params: record which page a compression slot should work on
 *
 * Only stores the two values; any locking is left to the caller.
 *
 * @param: compression slot to fill in
 * @block: block that contains the page
 * @offset: offset of the page inside the block
 */
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
1888
/**
 * compress_page_with_multi_thread: give a page to an idle compression slot
 *
 * Scans comp_param[] for a slot whose done flag is set.  When one is found
 * its buffered file is passed to qemu_put_qemu_file() together with rs->f,
 * the slot is pointed at the new page and its condition variable is
 * signalled.  If no slot is done, the function waits on comp_done_cond
 * and scans again.
 *
 * Returns the number of pages handed over, which is 1 once the loop exits
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                /* Claim the slot before handing it new work */
                comp_param[idx].done = false;
                /* Collect what the slot produced for its previous page */
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
                /* The new job is published under the slot's own mutex */
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                ram_counters.normal++;
                ram_counters.transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            /* No slot is done yet: wait to be signalled, then rescan */
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1921
1922 /**
1923  * find_dirty_block: find the next dirty page and update any state
1924  * associated with the search process.
1925  *
1926  * Returns if a page is found
1927  *
1928  * @rs: current RAM state
1929  * @pss: data about the state of the current dirty page scan
1930  * @again: set to false if the search has scanned the whole of RAM
1931  */
1932 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1933 {
1934     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1935     if (pss->complete_round && pss->block == rs->last_seen_block &&
1936         pss->page >= rs->last_page) {
1937         /*
1938          * We've been once around the RAM and haven't found anything.
1939          * Give up.
1940          */
1941         *again = false;
1942         return false;
1943     }
1944     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1945         /* Didn't find anything in this RAM Block */
1946         pss->page = 0;
1947         pss->block = QLIST_NEXT_RCU(pss->block, next);
1948         if (!pss->block) {
1949             /* Hit the end of the list */
1950             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1951             /* Flag that we've looped */
1952             pss->complete_round = true;
1953             rs->ram_bulk_stage = false;
1954             if (migrate_use_xbzrle()) {
1955                 /* If xbzrle is on, stop using the data compression at this
1956                  * point. In theory, xbzrle can do better than compression.
1957                  */
1958                 flush_compressed_data(rs);
1959             }
1960         }
1961         /* Didn't find anything this time, but try again on the new block */
1962         *again = true;
1963         return false;
1964     } else {
1965         /* Can go around again, but... */
1966         *again = true;
1967         /* We've found something so probably don't need to */
1968         return true;
1969     }
1970 }
1971
1972 /**
1973  * unqueue_page: gets a page of the queue
1974  *
1975  * Helper for 'get_queued_page' - gets a page off the queue
1976  *
1977  * Returns the block of the page (or NULL if none available)
1978  *
1979  * @rs: current RAM state
1980  * @offset: used to return the offset within the RAMBlock
1981  */
1982 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1983 {
1984     RAMBlock *block = NULL;
1985
1986     qemu_mutex_lock(&rs->src_page_req_mutex);
1987     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1988         struct RAMSrcPageRequest *entry =
1989                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1990         block = entry->rb;
1991         *offset = entry->offset;
1992
1993         if (entry->len > TARGET_PAGE_SIZE) {
1994             entry->len -= TARGET_PAGE_SIZE;
1995             entry->offset += TARGET_PAGE_SIZE;
1996         } else {
1997             memory_region_unref(block->mr);
1998             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1999             g_free(entry);
2000             migration_consume_urgent_request();
2001         }
2002     }
2003     qemu_mutex_unlock(&rs->src_page_req_mutex);
2004
2005     return block;
2006 }
2007
2008 /**
2009  * get_queued_page: unqueue a page from the postocpy requests
2010  *
2011  * Skips pages that are already sent (!dirty)
2012  *
2013  * Returns if a queued page is found
2014  *
2015  * @rs: current RAM state
2016  * @pss: data about the state of the current dirty page scan
2017  */
2018 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2019 {
2020     RAMBlock  *block;
2021     ram_addr_t offset;
2022     bool dirty;
2023
2024     do {
2025         block = unqueue_page(rs, &offset);
2026         /*
2027          * We're sending this page, and since it's postcopy nothing else
2028          * will dirty it, and we must make sure it doesn't get sent again
2029          * even if this queue request was received after the background
2030          * search already sent it.
2031          */
2032         if (block) {
2033             unsigned long page;
2034
2035             page = offset >> TARGET_PAGE_BITS;
2036             dirty = test_bit(page, block->bmap);
2037             if (!dirty) {
2038                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2039                        page, test_bit(page, block->unsentmap));
2040             } else {
2041                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2042             }
2043         }
2044
2045     } while (block && !dirty);
2046
2047     if (block) {
2048         /*
2049          * As soon as we start servicing pages out of order, then we have
2050          * to kill the bulk stage, since the bulk stage assumes
2051          * in (migration_bitmap_find_and_reset_dirty) that every page is
2052          * dirty, that's no longer true.
2053          */
2054         rs->ram_bulk_stage = false;
2055
2056         /*
2057          * We want the background search to continue from the queued page
2058          * since the guest is likely to want other pages near to the page
2059          * it just requested.
2060          */
2061         pss->block = block;
2062         pss->page = offset >> TARGET_PAGE_BITS;
2063     }
2064
2065     return !!block;
2066 }
2067
2068 /**
2069  * migration_page_queue_free: drop any remaining pages in the ram
2070  * request queue
2071  *
2072  * It should be empty at the end anyway, but in error cases there may
2073  * be some left.  in case that there is any page left, we drop it.
2074  *
2075  */
2076 static void migration_page_queue_free(RAMState *rs)
2077 {
2078     struct RAMSrcPageRequest *mspr, *next_mspr;
2079     /* This queue generally should be empty - but in the case of a failed
2080      * migration might have some droppings in.
2081      */
2082     rcu_read_lock();
2083     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2084         memory_region_unref(mspr->rb->mr);
2085         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2086         g_free(mspr);
2087     }
2088     rcu_read_unlock();
2089 }
2090
2091 /**
2092  * ram_save_queue_pages: queue the page for transmission
2093  *
2094  * A request from postcopy destination for example.
2095  *
2096  * Returns zero on success or negative on error
2097  *
2098  * @rbname: Name of the RAMBLock of the request. NULL means the
2099  *          same that last one.
2100  * @start: starting address from the start of the RAMBlock
2101  * @len: length (in bytes) to send
2102  */
2103 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2104 {
2105     RAMBlock *ramblock;
2106     RAMState *rs = ram_state;
2107
2108     ram_counters.postcopy_requests++;
2109     rcu_read_lock();
2110     if (!rbname) {
2111         /* Reuse last RAMBlock */
2112         ramblock = rs->last_req_rb;
2113
2114         if (!ramblock) {
2115             /*
2116              * Shouldn't happen, we can't reuse the last RAMBlock if
2117              * it's the 1st request.
2118              */
2119             error_report("ram_save_queue_pages no previous block");
2120             goto err;
2121         }
2122     } else {
2123         ramblock = qemu_ram_block_by_name(rbname);
2124
2125         if (!ramblock) {
2126             /* We shouldn't be asked for a non-existent RAMBlock */
2127             error_report("ram_save_queue_pages no block '%s'", rbname);
2128             goto err;
2129         }
2130         rs->last_req_rb = ramblock;
2131     }
2132     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2133     if (start+len > ramblock->used_length) {
2134         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2135                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2136                      __func__, start, len, ramblock->used_length);
2137         goto err;
2138     }
2139
2140     struct RAMSrcPageRequest *new_entry =
2141         g_malloc0(sizeof(struct RAMSrcPageRequest));
2142     new_entry->rb = ramblock;
2143     new_entry->offset = start;
2144     new_entry->len = len;
2145
2146     memory_region_ref(ramblock->mr);
2147     qemu_mutex_lock(&rs->src_page_req_mutex);
2148     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2149     migration_make_urgent_request();
2150     qemu_mutex_unlock(&rs->src_page_req_mutex);
2151     rcu_read_unlock();
2152
2153     return 0;
2154
2155 err:
2156     rcu_read_unlock();
2157     return -1;
2158 }
2159
2160 static bool save_page_use_compression(RAMState *rs)
2161 {
2162     if (!migrate_use_compression()) {
2163         return false;
2164     }
2165
2166     /*
2167      * If xbzrle is on, stop using the data compression after first
2168      * round of migration even if compression is enabled. In theory,
2169      * xbzrle can do better than compression.
2170      */
2171     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2172         return true;
2173     }
2174
2175     return false;
2176 }
2177
/**
 * ram_save_target_page: save one target page
 *
 * Tries, in order: control_save_page(), the zero-page path, the
 * compression threads, multifd, and finally ram_save_page().
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
    int res;

    /* When control_save_page() handles the page, res is its result */
    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     */
    if (block != rs->last_sent_block && save_page_use_compression(rs)) {
            flush_compressed_data(rs);
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Make sure the first page is sent out before other pages.
     *
     * we post it as normal page as compression will take much
     * CPU resource.
     */
    if (block == rs->last_sent_block && save_page_use_compression(rs)) {
        return compress_page_with_multi_thread(rs, block, offset);
    } else if (migrate_use_multifd()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    /* Neither compression nor multifd took the page */
    return ram_save_page(rs, pss, last_stage);
}
2237
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error.  A block
 * that is not migratable is reported and yields 0.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /* Number of target pages in one host page of this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    if (!qemu_ram_is_migratable(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            /* 'continue' goes to the loop condition below */
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
        }

        pss->page++;
    /* Stop at a host page boundary or at the end of the block */
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
2293
/**
 * ram_find_and_save_block: finds a dirty page and sends it
 *
 * Called within an RCU critical section.
 *
 * Queued page requests are served first; only when the queue yields
 * nothing does the search fall back to scanning for dirty pages.  The
 * scan position is stored back into @rs on return.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    /* Resume from where the previous call stopped */
    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    /* Any non-zero result, including a negative one, ends the loop */
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
2346
2347 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2348 {
2349     uint64_t pages = size / TARGET_PAGE_SIZE;
2350
2351     if (zero) {
2352         ram_counters.duplicate += pages;
2353     } else {
2354         ram_counters.normal += pages;
2355         ram_counters.transferred += size;
2356         qemu_update_position(f, size);
2357     }
2358 }
2359
2360 uint64_t ram_bytes_total(void)
2361 {
2362     RAMBlock *block;
2363     uint64_t total = 0;
2364
2365     rcu_read_lock();
2366     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2367         total += block->used_length;
2368     }
2369     rcu_read_unlock();
2370     return total;
2371 }
2372
/**
 * xbzrle_load_setup: allocate XBZRLE.decoded_buf, one target page in size
 */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2377
/**
 * xbzrle_load_cleanup: free XBZRLE.decoded_buf and clear the pointer
 */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2383
2384 static void ram_state_cleanup(RAMState **rsp)
2385 {
2386     if (*rsp) {
2387         migration_page_queue_free(*rsp);
2388         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2389         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2390         g_free(*rsp);
2391         *rsp = NULL;
2392     }
2393 }
2394
2395 static void xbzrle_cleanup(void)
2396 {
2397     XBZRLE_cache_lock();
2398     if (XBZRLE.cache) {
2399         cache_fini(XBZRLE.cache);
2400         g_free(XBZRLE.encoded_buf);
2401         g_free(XBZRLE.current_buf);
2402         g_free(XBZRLE.zero_target_page);
2403         XBZRLE.cache = NULL;
2404         XBZRLE.encoded_buf = NULL;
2405         XBZRLE.current_buf = NULL;
2406         XBZRLE.zero_target_page = NULL;
2407     }
2408     XBZRLE_cache_unlock();
2409 }
2410
/**
 * ram_save_cleanup: release everything the RAM save side allocated
 *
 * Stops dirty logging, frees the per-block bmap and unsentmap bitmaps,
 * and then tears down xbzrle, the compression threads and the RAMState.
 *
 * @opaque: address of the pointer to the RAMState (RAMState **)
 */
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* caller have hold iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap
     */
    memory_global_dirty_log_stop();

    /* Free both bitmaps of every migratable block */
    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        g_free(block->bmap);
        block->bmap = NULL;
        g_free(block->unsentmap);
        block->unsentmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
2432
/**
 * ram_state_reset: put the scan state back to its starting values
 *
 * Forgets the last seen and last sent blocks, restarts at page 0, records
 * the current ram_list.version and re-enables the bulk stage.
 *
 * @rs: current RAM state
 */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
}
2441
2442 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2443
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, up to 128 bits per line
 *
 * Each printed line starts with the index of its first bit in hex,
 * followed by one character per bit: '1' for a set bit, '.' for a
 * clear one.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' must point to a bitmap holding at least 'pages' bits.  This
 * function has no other bitmap to fall back on, so a NULL 'todump'
 * dumps nothing.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to read from; avoid handing NULL to test_bit() */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen here, which is at most 128 */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2477
2478 /* **** functions for postcopy ***** */
2479
2480 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2481 {
2482     struct RAMBlock *block;
2483
2484     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2485         unsigned long *bitmap = block->bmap;
2486         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2487         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2488
2489         while (run_start < range) {
2490             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2491             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2492                               (run_end - run_start) << TARGET_PAGE_BITS);
2493             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2494         }
2495     }
2496 }
2497
2498 /**
2499  * postcopy_send_discard_bm_ram: discard a RAMBlock
2500  *
2501  * Returns zero on success
2502  *
2503  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2504  * Note: At this point the 'unsentmap' is the processed bitmap combined
2505  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2506  *
2507  * @ms: current migration state
2508  * @pds: state for postcopy
2509  * @start: RAMBlock starting page
2510  * @length: RAMBlock size
2511  */
2512 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2513                                         PostcopyDiscardState *pds,
2514                                         RAMBlock *block)
2515 {
2516     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2517     unsigned long current;
2518     unsigned long *unsentmap = block->unsentmap;
2519
2520     for (current = 0; current < end; ) {
2521         unsigned long one = find_next_bit(unsentmap, end, current);
2522
2523         if (one <= end) {
2524             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2525             unsigned long discard_length;
2526
2527             if (zero >= end) {
2528                 discard_length = end - one;
2529             } else {
2530                 discard_length = zero - one;
2531             }
2532             if (discard_length) {
2533                 postcopy_discard_send_range(ms, pds, one, discard_length);
2534             }
2535             current = one + discard_length;
2536         } else {
2537             current = one;
2538         }
2539     }
2540
2541     return 0;
2542 }
2543
2544 /**
2545  * postcopy_each_ram_send_discard: discard all RAMBlocks
2546  *
2547  * Returns 0 for success or negative for error
2548  *
2549  * Utility for the outgoing postcopy code.
2550  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2551  *   passing it bitmap indexes and name.
2552  * (qemu_ram_foreach_block ends up passing unscaled lengths
2553  *  which would mean postcopy code would have to deal with target page)
2554  *
2555  * @ms: current migration state
2556  */
2557 static int postcopy_each_ram_send_discard(MigrationState *ms)
2558 {
2559     struct RAMBlock *block;
2560     int ret;
2561
2562     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2563         PostcopyDiscardState *pds =
2564             postcopy_discard_send_init(ms, block->idstr);
2565
2566         /*
2567          * Postcopy sends chunks of bitmap over the wire, but it
2568          * just needs indexes at this point, avoids it having
2569          * target page specific code.
2570          */
2571         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2572         postcopy_discard_send_finish(ms, pds);
2573         if (ret) {
2574             return ret;
2575         }
2576     }
2577
2578     return 0;
2579 }
2580
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * A host page that needs fixing up ends with every one of its target
 * pages marked both unsent and dirty.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages in one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        /* fixup_start_addr is only valid when do_fixup was set above */
        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
2707
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success (no other value is returned)
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    /* Both passes share one discard state, finished after the second */
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}
2738
2739 /**
2740  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2741  *
2742  * Returns zero on success
2743  *
2744  * Transmit the set of pages to be discarded after precopy to the target
2745  * these are pages that:
2746  *     a) Have been previously transmitted but are now dirty again
2747  *     b) Pages that have never been transmitted, this ensures that
2748  *        any pages on the destination that have been mapped by background
2749  *        tasks get discarded (transparent huge pages is the specific concern)
2750  * Hopefully this is pretty sparse
2751  *
2752  * @ms: current migration state
2753  */
2754 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2755 {
2756     RAMState *rs = ram_state;
2757     RAMBlock *block;
2758     int ret;
2759
2760     rcu_read_lock();
2761
2762     /* This should be our last sync, the src is now paused */
2763     migration_bitmap_sync(rs);
2764
2765     /* Easiest way to make sure we don't resume in the middle of a host-page */
2766     rs->last_seen_block = NULL;
2767     rs->last_sent_block = NULL;
2768     rs->last_page = 0;
2769
2770     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2771         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2772         unsigned long *bitmap = block->bmap;
2773         unsigned long *unsentmap = block->unsentmap;
2774
2775         if (!unsentmap) {
2776             /* We don't have a safe way to resize the sentmap, so
2777              * if the bitmap was resized it will be NULL at this
2778              * point.
2779              */
2780             error_report("migration ram resized during precopy phase");
2781             rcu_read_unlock();
2782             return -EINVAL;
2783         }
2784         /* Deal with TPS != HPS and huge pages */
2785         ret = postcopy_chunk_hostpages(ms, block);
2786         if (ret) {
2787             rcu_read_unlock();
2788             return ret;
2789         }
2790
2791         /*
2792          * Update the unsentmap to be unsentmap = unsentmap | dirty
2793          */
2794         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2795 #ifdef DEBUG_POSTCOPY
2796         ram_debug_dump_bitmap(unsentmap, true, pages);
2797 #endif
2798     }
2799     trace_ram_postcopy_send_discard_bitmap();
2800
2801     ret = postcopy_each_ram_send_discard(ms);
2802     rcu_read_unlock();
2803
2804     return ret;
2805 }
2806
2807 /**
2808  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2809  *
2810  * Returns zero on success
2811  *
2812  * @rbname: name of the RAMBlock of the request. NULL means the
2813  *          same that last one.
2814  * @start: RAMBlock starting page
2815  * @length: RAMBlock size
2816  */
2817 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2818 {
2819     int ret = -1;
2820
2821     trace_ram_discard_range(rbname, start, length);
2822
2823     rcu_read_lock();
2824     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2825
2826     if (!rb) {
2827         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2828         goto err;
2829     }
2830
2831     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2832                  length >> qemu_target_page_bits());
2833     ret = ram_block_discard_range(rb, start, length);
2834
2835 err:
2836     rcu_read_unlock();
2837
2838     return ret;
2839 }
2840
2841 /*
2842  * For every allocation, we will try not to crash the VM if the
2843  * allocation failed.
2844  */
2845 static int xbzrle_init(void)
2846 {
2847     Error *local_err = NULL;
2848
2849     if (!migrate_use_xbzrle()) {
2850         return 0;
2851     }
2852
2853     XBZRLE_cache_lock();
2854
2855     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2856     if (!XBZRLE.zero_target_page) {
2857         error_report("%s: Error allocating zero page", __func__);
2858         goto err_out;
2859     }
2860
2861     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2862                               TARGET_PAGE_SIZE, &local_err);
2863     if (!XBZRLE.cache) {
2864         error_report_err(local_err);
2865         goto free_zero_page;
2866     }
2867
2868     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2869     if (!XBZRLE.encoded_buf) {
2870         error_report("%s: Error allocating encoded_buf", __func__);
2871         goto free_cache;
2872     }
2873
2874     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2875     if (!XBZRLE.current_buf) {
2876         error_report("%s: Error allocating current_buf", __func__);
2877         goto free_encoded_buf;
2878     }
2879
2880     /* We are all good */
2881     XBZRLE_cache_unlock();
2882     return 0;
2883
2884 free_encoded_buf:
2885     g_free(XBZRLE.encoded_buf);
2886     XBZRLE.encoded_buf = NULL;
2887 free_cache:
2888     cache_fini(XBZRLE.cache);
2889     XBZRLE.cache = NULL;
2890 free_zero_page:
2891     g_free(XBZRLE.zero_target_page);
2892     XBZRLE.zero_target_page = NULL;
2893 err_out:
2894     XBZRLE_cache_unlock();
2895     return -ENOMEM;
2896 }
2897
2898 static int ram_state_init(RAMState **rsp)
2899 {
2900     *rsp = g_try_new0(RAMState, 1);
2901
2902     if (!*rsp) {
2903         error_report("%s: Init ramstate fail", __func__);
2904         return -1;
2905     }
2906
2907     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2908     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2909     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2910
2911     /*
2912      * Count the total number of pages used by ram blocks not including any
2913      * gaps due to alignment or unplugs.
2914      */
2915     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2916
2917     ram_state_reset(*rsp);
2918
2919     return 0;
2920 }
2921
2922 static void ram_list_init_bitmaps(void)
2923 {
2924     RAMBlock *block;
2925     unsigned long pages;
2926
2927     /* Skip setting bitmap if there is no RAM */
2928     if (ram_bytes_total()) {
2929         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2930             pages = block->max_length >> TARGET_PAGE_BITS;
2931             block->bmap = bitmap_new(pages);
2932             bitmap_set(block->bmap, 0, pages);
2933             if (migrate_postcopy_ram()) {
2934                 block->unsentmap = bitmap_new(pages);
2935                 bitmap_set(block->unsentmap, 0, pages);
2936             }
2937         }
2938     }
2939 }
2940
/*
 * Create the migration bitmaps, start global dirty logging and perform a
 * first bitmap sync for @rs.
 *
 * The whole sequence runs with the iothread lock, the ramlist lock and an
 * RCU read-side critical section held; they are released in the reverse
 * order of acquisition.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    /* Allocate the bitmaps before logging starts and before the sync */
    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2956
2957 static int ram_init_all(RAMState **rsp)
2958 {
2959     if (ram_state_init(rsp)) {
2960         return -1;
2961     }
2962
2963     if (xbzrle_init()) {
2964         ram_state_cleanup(rsp);
2965         return -1;
2966     }
2967
2968     ram_init_bitmaps(*rsp);
2969
2970     return 0;
2971 }
2972
2973 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2974 {
2975     RAMBlock *block;
2976     uint64_t pages = 0;
2977
2978     /*
2979      * Postcopy is not using xbzrle/compression, so no need for that.
2980      * Also, since source are already halted, we don't need to care
2981      * about dirty page logging as well.
2982      */
2983
2984     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2985         pages += bitmap_count_one(block->bmap,
2986                                   block->used_length >> TARGET_PAGE_BITS);
2987     }
2988
2989     /* This may not be aligned with current bitmaps. Recalculate. */
2990     rs->migration_dirty_pages = pages;
2991
2992     rs->last_seen_block = NULL;
2993     rs->last_sent_block = NULL;
2994     rs->last_page = 0;
2995     rs->last_version = ram_list.version;
2996     /*
2997      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2998      * matter what we have sent.
2999      */
3000     rs->ram_bulk_stage = false;
3001
3002     /* Update RAMState cache of output QEMUFile */
3003     rs->f = out;
3004
3005     trace_ram_state_resume_prepare(pages);
3006 }
3007
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3014
3015 /**
3016  * ram_save_setup: Setup RAM for migration
3017  *
3018  * Returns zero to indicate success and negative for error
3019  *
3020  * @f: QEMUFile where to send the data
3021  * @opaque: RAMState pointer
3022  */
3023 static int ram_save_setup(QEMUFile *f, void *opaque)
3024 {
3025     RAMState **rsp = opaque;
3026     RAMBlock *block;
3027
3028     if (compress_threads_save_setup()) {
3029         return -1;
3030     }
3031
3032     /* migration has already setup the bitmap, reuse it. */
3033     if (!migration_in_colo_state()) {
3034         if (ram_init_all(rsp) != 0) {
3035             compress_threads_save_cleanup();
3036             return -1;
3037         }
3038     }
3039     (*rsp)->f = f;
3040
3041     rcu_read_lock();
3042
3043     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3044
3045     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3046         qemu_put_byte(f, strlen(block->idstr));
3047         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3048         qemu_put_be64(f, block->used_length);
3049         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3050             qemu_put_be64(f, block->page_size);
3051         }
3052     }
3053
3054     rcu_read_unlock();
3055
3056     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3057     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3058
3059     multifd_send_sync_main();
3060     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3061     qemu_fflush(f);
3062
3063     return 0;
3064 }
3065
3066 /**
3067  * ram_save_iterate: iterative stage for migration
3068  *
3069  * Returns zero to indicate success and negative for error
3070  *
3071  * @f: QEMUFile where to send the data
3072  * @opaque: RAMState pointer
3073  */
3074 static int ram_save_iterate(QEMUFile *f, void *opaque)
3075 {
3076     RAMState **temp = opaque;
3077     RAMState *rs = *temp;
3078     int ret;
3079     int i;
3080     int64_t t0;
3081     int done = 0;
3082
3083     if (blk_mig_bulk_active()) {
3084         /* Avoid transferring ram during bulk phase of block migration as
3085          * the bulk phase will usually take a long time and transferring
3086          * ram updates during that time is pointless. */
3087         goto out;
3088     }
3089
3090     rcu_read_lock();
3091     if (ram_list.version != rs->last_version) {
3092         ram_state_reset(rs);
3093     }
3094
3095     /* Read version before ram_list.blocks */
3096     smp_rmb();
3097
3098     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3099
3100     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3101     i = 0;
3102     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3103             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3104         int pages;
3105
3106         if (qemu_file_get_error(f)) {
3107             break;
3108         }
3109
3110         pages = ram_find_and_save_block(rs, false);
3111         /* no more pages to sent */
3112         if (pages == 0) {
3113             done = 1;
3114             break;
3115         }
3116         rs->iterations++;
3117
3118         /* we want to check in the 1st loop, just in case it was the 1st time
3119            and we had to sync the dirty bitmap.
3120            qemu_get_clock_ns() is a bit expensive, so we only check each some
3121            iterations
3122         */
3123         if ((i & 63) == 0) {
3124             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3125             if (t1 > MAX_WAIT) {
3126                 trace_ram_save_iterate_big_wait(t1, i);
3127                 break;
3128             }
3129         }
3130         i++;
3131     }
3132     flush_compressed_data(rs);
3133     rcu_read_unlock();
3134
3135     /*
3136      * Must occur before EOS (or any QEMUFile operation)
3137      * because of RDMA protocol.
3138      */
3139     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3140
3141     multifd_send_sync_main();
3142 out:
3143     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3144     qemu_fflush(f);
3145     ram_counters.transferred += 8;
3146
3147     ret = qemu_file_get_error(f);
3148     if (ret < 0) {
3149         return ret;
3150     }
3151
3152     return done;
3153 }
3154
3155 /**
3156  * ram_save_complete: function called to send the remaining amount of ram
3157  *
3158  * Returns zero to indicate success
3159  *
3160  * Called with iothread lock
3161  *
3162  * @f: QEMUFile where to send the data
3163  * @opaque: RAMState pointer
3164  */
3165 static int ram_save_complete(QEMUFile *f, void *opaque)
3166 {
3167     RAMState **temp = opaque;
3168     RAMState *rs = *temp;
3169
3170     rcu_read_lock();
3171
3172     if (!migration_in_postcopy()) {
3173         migration_bitmap_sync(rs);
3174     }
3175
3176     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3177
3178     /* try transferring iterative blocks of memory */
3179
3180     /* flush all remaining blocks regardless of rate limiting */
3181     while (true) {
3182         int pages;
3183
3184         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3185         /* no more blocks to sent */
3186         if (pages == 0) {
3187             break;
3188         }
3189     }
3190
3191     flush_compressed_data(rs);
3192     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3193
3194     rcu_read_unlock();
3195
3196     multifd_send_sync_main();
3197     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3198     qemu_fflush(f);
3199
3200     return 0;
3201 }
3202
3203 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3204                              uint64_t *res_precopy_only,
3205                              uint64_t *res_compatible,
3206                              uint64_t *res_postcopy_only)
3207 {
3208     RAMState **temp = opaque;
3209     RAMState *rs = *temp;
3210     uint64_t remaining_size;
3211
3212     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3213
3214     if (!migration_in_postcopy() &&
3215         remaining_size < max_size) {
3216         qemu_mutex_lock_iothread();
3217         rcu_read_lock();
3218         migration_bitmap_sync(rs);
3219         rcu_read_unlock();
3220         qemu_mutex_unlock_iothread();
3221         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3222     }
3223
3224     if (migrate_postcopy_ram()) {
3225         /* We can do postcopy, and all the data is postcopiable */
3226         *res_compatible += remaining_size;
3227     } else {
3228         *res_precopy_only += remaining_size;
3229     }
3230 }
3231
3232 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3233 {
3234     unsigned int xh_len;
3235     int xh_flags;
3236     uint8_t *loaded_data;
3237
3238     /* extract RLE header */
3239     xh_flags = qemu_get_byte(f);
3240     xh_len = qemu_get_be16(f);
3241
3242     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3243         error_report("Failed to load XBZRLE page - wrong compression!");
3244         return -1;
3245     }
3246
3247     if (xh_len > TARGET_PAGE_SIZE) {
3248         error_report("Failed to load XBZRLE page - len overflow!");
3249         return -1;
3250     }
3251     loaded_data = XBZRLE.decoded_buf;
3252     /* load data and decode */
3253     /* it can change loaded_data to point to an internal buffer */
3254     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3255
3256     /* decode RLE */
3257     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3258                              TARGET_PAGE_SIZE) == -1) {
3259         error_report("Failed to load XBZRLE page - decode error!");
3260         return -1;
3261     }
3262
3263     return 0;
3264 }
3265
3266 /**
3267  * ram_block_from_stream: read a RAMBlock id from the migration stream
3268  *
3269  * Must be called from within a rcu critical section.
3270  *
3271  * Returns a pointer from within the RCU-protected ram_list.
3272  *
3273  * @f: QEMUFile where to read the data from
3274  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3275  */
3276 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3277 {
3278     static RAMBlock *block = NULL;
3279     char id[256];
3280     uint8_t len;
3281
3282     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3283         if (!block) {
3284             error_report("Ack, bad migration stream!");
3285             return NULL;
3286         }
3287         return block;
3288     }
3289
3290     len = qemu_get_byte(f);
3291     qemu_get_buffer(f, (uint8_t *)id, len);
3292     id[len] = 0;
3293
3294     block = qemu_ram_block_by_name(id);
3295     if (!block) {
3296         error_report("Can't find block %s", id);
3297         return NULL;
3298     }
3299
3300     if (!qemu_ram_is_migratable(block)) {
3301         error_report("block %s should not be migrated !", id);
3302         return NULL;
3303     }
3304
3305     return block;
3306 }
3307
3308 static inline void *host_from_ram_block_offset(RAMBlock *block,
3309                                                ram_addr_t offset)
3310 {
3311     if (!offset_in_ramblock(block, offset)) {
3312         return NULL;
3313     }
3314
3315     return block->host + offset;
3316 }
3317
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  The write is skipped when @ch is
 * zero and the range is already all zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Zero fill over an already-zero range: nothing to write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3334
3335 /* return the size after decompression, or negative value on error */
3336 static int
3337 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3338                      const uint8_t *source, size_t source_len)
3339 {
3340     int err;
3341
3342     err = inflateReset(stream);
3343     if (err != Z_OK) {
3344         return -1;
3345     }
3346
3347     stream->avail_in = source_len;
3348     stream->next_in = (uint8_t *)source;
3349     stream->avail_out = dest_len;
3350     stream->next_out = dest;
3351
3352     err = inflate(stream, Z_NO_FLUSH);
3353     if (err != Z_STREAM_END) {
3354         return -1;
3355     }
3356
3357     return stream->total_out;
3358 }
3359
3360 static void *do_data_decompress(void *opaque)
3361 {
3362     DecompressParam *param = opaque;
3363     unsigned long pagesize;
3364     uint8_t *des;
3365     int len, ret;
3366
3367     qemu_mutex_lock(&param->mutex);
3368     while (!param->quit) {
3369         if (param->des) {
3370             des = param->des;
3371             len = param->len;
3372             param->des = 0;
3373             qemu_mutex_unlock(&param->mutex);
3374
3375             pagesize = TARGET_PAGE_SIZE;
3376
3377             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3378                                        param->compbuf, len);
3379             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3380                 error_report("decompress data failed");
3381                 qemu_file_set_error(decomp_file, ret);
3382             }
3383
3384             qemu_mutex_lock(&decomp_done_lock);
3385             param->done = true;
3386             qemu_cond_signal(&decomp_done_cond);
3387             qemu_mutex_unlock(&decomp_done_lock);
3388
3389             qemu_mutex_lock(&param->mutex);
3390         } else {
3391             qemu_cond_wait(&param->cond, &param->mutex);
3392         }
3393     }
3394     qemu_mutex_unlock(&param->mutex);
3395
3396     return NULL;
3397 }
3398
3399 static int wait_for_decompress_done(void)
3400 {
3401     int idx, thread_count;
3402
3403     if (!migrate_use_compression()) {
3404         return 0;
3405     }
3406
3407     thread_count = migrate_decompress_threads();
3408     qemu_mutex_lock(&decomp_done_lock);
3409     for (idx = 0; idx < thread_count; idx++) {
3410         while (!decomp_param[idx].done) {
3411             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3412         }
3413     }
3414     qemu_mutex_unlock(&decomp_done_lock);
3415     return qemu_file_get_error(decomp_file);
3416 }
3417
3418 static void compress_threads_load_cleanup(void)
3419 {
3420     int i, thread_count;
3421
3422     if (!migrate_use_compression()) {
3423         return;
3424     }
3425     thread_count = migrate_decompress_threads();
3426     for (i = 0; i < thread_count; i++) {
3427         /*
3428          * we use it as a indicator which shows if the thread is
3429          * properly init'd or not
3430          */
3431         if (!decomp_param[i].compbuf) {
3432             break;
3433         }
3434
3435         qemu_mutex_lock(&decomp_param[i].mutex);
3436         decomp_param[i].quit = true;
3437         qemu_cond_signal(&decomp_param[i].cond);
3438         qemu_mutex_unlock(&decomp_param[i].mutex);
3439     }
3440     for (i = 0; i < thread_count; i++) {
3441         if (!decomp_param[i].compbuf) {
3442             break;
3443         }
3444
3445         qemu_thread_join(decompress_threads + i);
3446         qemu_mutex_destroy(&decomp_param[i].mutex);
3447         qemu_cond_destroy(&decomp_param[i].cond);
3448         inflateEnd(&decomp_param[i].stream);
3449         g_free(decomp_param[i].compbuf);
3450         decomp_param[i].compbuf = NULL;
3451     }
3452     g_free(decompress_threads);
3453     g_free(decomp_param);
3454     decompress_threads = NULL;
3455     decomp_param = NULL;
3456     decomp_file = NULL;
3457 }
3458
3459 static int compress_threads_load_setup(QEMUFile *f)
3460 {
3461     int i, thread_count;
3462
3463     if (!migrate_use_compression()) {
3464         return 0;
3465     }
3466
3467     thread_count = migrate_decompress_threads();
3468     decompress_threads = g_new0(QemuThread, thread_count);
3469     decomp_param = g_new0(DecompressParam, thread_count);
3470     qemu_mutex_init(&decomp_done_lock);
3471     qemu_cond_init(&decomp_done_cond);
3472     decomp_file = f;
3473     for (i = 0; i < thread_count; i++) {
3474         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3475             goto exit;
3476         }
3477
3478         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3479         qemu_mutex_init(&decomp_param[i].mutex);
3480         qemu_cond_init(&decomp_param[i].cond);
3481         decomp_param[i].done = true;
3482         decomp_param[i].quit = false;
3483         qemu_thread_create(decompress_threads + i, "decompress",
3484                            do_data_decompress, decomp_param + i,
3485                            QEMU_THREAD_JOINABLE);
3486     }
3487     return 0;
3488 exit:
3489     compress_threads_load_cleanup();
3490     return -1;
3491 }
3492
3493 static void decompress_data_with_multi_threads(QEMUFile *f,
3494                                                void *host, int len)
3495 {
3496     int idx, thread_count;
3497
3498     thread_count = migrate_decompress_threads();
3499     qemu_mutex_lock(&decomp_done_lock);
3500     while (true) {
3501         for (idx = 0; idx < thread_count; idx++) {
3502             if (decomp_param[idx].done) {
3503                 decomp_param[idx].done = false;
3504                 qemu_mutex_lock(&decomp_param[idx].mutex);
3505                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3506                 decomp_param[idx].des = host;
3507                 decomp_param[idx].len = len;
3508                 qemu_cond_signal(&decomp_param[idx].cond);
3509                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3510                 break;
3511             }
3512         }
3513         if (idx < thread_count) {
3514             break;
3515         } else {
3516             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3517         }
3518     }
3519     qemu_mutex_unlock(&decomp_done_lock);
3520 }
3521
3522 /**
3523  * ram_load_setup: Setup RAM for migration incoming side
3524  *
3525  * Returns zero to indicate success and negative for error
3526  *
3527  * @f: QEMUFile where to receive the data
3528  * @opaque: RAMState pointer
3529  */
3530 static int ram_load_setup(QEMUFile *f, void *opaque)
3531 {
3532     if (compress_threads_load_setup(f)) {
3533         return -1;
3534     }
3535
3536     xbzrle_load_setup();
3537     ramblock_recv_map_init();
3538     return 0;
3539 }
3540
3541 static int ram_load_cleanup(void *opaque)
3542 {
3543     RAMBlock *rb;
3544     xbzrle_load_cleanup();
3545     compress_threads_load_cleanup();
3546
3547     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3548         g_free(rb->receivedmap);
3549         rb->receivedmap = NULL;
3550     }
3551     return 0;
3552 }
3553
3554 /**
3555  * ram_postcopy_incoming_init: allocate postcopy data structures
3556  *
3557  * Returns 0 for success and negative if there was one error
3558  *
3559  * @mis: current migration incoming state
3560  *
3561  * Allocate data structures etc needed by incoming migration with
3562  * postcopy-ram. postcopy-ram's similarly names
3563  * postcopy_ram_incoming_init does the work.
3564  */
3565 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3566 {
3567     return postcopy_ram_incoming_init(mis);
3568 }
3569
3570 /**
3571  * ram_load_postcopy: load a page in postcopy case
3572  *
3573  * Returns 0 for success or -errno in case of error
3574  *
3575  * Called in postcopy mode by ram_load().
3576  * rcu_read_lock is taken prior to this being called.
3577  *
3578  * @f: QEMUFile where to send the data
3579  */
3580 static int ram_load_postcopy(QEMUFile *f)
3581 {
3582     int flags = 0, ret = 0;
3583     bool place_needed = false;
3584     bool matching_page_sizes = false;
3585     MigrationIncomingState *mis = migration_incoming_get_current();
3586     /* Temporary page that is later 'placed' */
3587     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3588     void *last_host = NULL;
3589     bool all_zero = false;
3590
3591     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3592         ram_addr_t addr;
3593         void *host = NULL;
3594         void *page_buffer = NULL;
3595         void *place_source = NULL;
3596         RAMBlock *block = NULL;
3597         uint8_t ch;
3598
3599         addr = qemu_get_be64(f);
3600
3601         /*
3602          * If qemu file error, we should stop here, and then "addr"
3603          * may be invalid
3604          */
3605         ret = qemu_file_get_error(f);
3606         if (ret) {
3607             break;
3608         }
3609
3610         flags = addr & ~TARGET_PAGE_MASK;
3611         addr &= TARGET_PAGE_MASK;
3612
3613         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3614         place_needed = false;
3615         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3616             block = ram_block_from_stream(f, flags);
3617
3618             host = host_from_ram_block_offset(block, addr);
3619             if (!host) {
3620                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3621                 ret = -EINVAL;
3622                 break;
3623             }
3624             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3625             /*
3626              * Postcopy requires that we place whole host pages atomically;
3627              * these may be huge pages for RAMBlocks that are backed by
3628              * hugetlbfs.
3629              * To make it atomic, the data is read into a temporary page
3630              * that's moved into place later.
3631              * The migration protocol uses,  possibly smaller, target-pages
3632              * however the source ensures it always sends all the components
3633              * of a host page in order.
3634              */
3635             page_buffer = postcopy_host_page +
3636                           ((uintptr_t)host & (block->page_size - 1));
3637             /* If all TP are zero then we can optimise the place */
3638             if (!((uintptr_t)host & (block->page_size - 1))) {
3639                 all_zero = true;
3640             } else {
3641                 /* not the 1st TP within the HP */
3642                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3643                     error_report("Non-sequential target page %p/%p",
3644                                   host, last_host);
3645                     ret = -EINVAL;
3646                     break;
3647                 }
3648             }
3649
3650
3651             /*
3652              * If it's the last part of a host page then we place the host
3653              * page
3654              */
3655             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3656                                      (block->page_size - 1)) == 0;
3657             place_source = postcopy_host_page;
3658         }
3659         last_host = host;
3660
3661         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3662         case RAM_SAVE_FLAG_ZERO:
3663             ch = qemu_get_byte(f);
3664             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3665             if (ch) {
3666                 all_zero = false;
3667             }
3668             break;
3669
3670         case RAM_SAVE_FLAG_PAGE:
3671             all_zero = false;
3672             if (!place_needed || !matching_page_sizes) {
3673                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3674             } else {
3675                 /* Avoids the qemu_file copy during postcopy, which is
3676                  * going to do a copy later; can only do it when we
3677                  * do this read in one go (matching page sizes)
3678                  */
3679                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3680                                          TARGET_PAGE_SIZE);
3681             }
3682             break;
3683         case RAM_SAVE_FLAG_EOS:
3684             /* normal exit */
3685             multifd_recv_sync_main();
3686             break;
3687         default:
3688             error_report("Unknown combination of migration flags: %#x"
3689                          " (postcopy mode)", flags);
3690             ret = -EINVAL;
3691             break;
3692         }
3693
3694         /* Detect for any possible file errors */
3695         if (!ret && qemu_file_get_error(f)) {
3696             ret = qemu_file_get_error(f);
3697         }
3698
3699         if (!ret && place_needed) {
3700             /* This gets called at the last target page in the host page */
3701             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3702
3703             if (all_zero) {
3704                 ret = postcopy_place_page_zero(mis, place_dest,
3705                                                block);
3706             } else {
3707                 ret = postcopy_place_page(mis, place_dest,
3708                                           place_source, block);
3709             }
3710         }
3711     }
3712
3713     return ret;
3714 }
3715
3716 static bool postcopy_is_advised(void)
3717 {
3718     PostcopyState ps = postcopy_state_get();
3719     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3720 }
3721
3722 static bool postcopy_is_running(void)
3723 {
3724     PostcopyState ps = postcopy_state_get();
3725     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3726 }
3727
3728 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3729 {
3730     int flags = 0, ret = 0, invalid_flags = 0;
3731     static uint64_t seq_iter;
3732     int len = 0;
3733     /*
3734      * If system is running in postcopy mode, page inserts to host memory must
3735      * be atomic
3736      */
3737     bool postcopy_running = postcopy_is_running();
3738     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3739     bool postcopy_advised = postcopy_is_advised();
3740
3741     seq_iter++;
3742
3743     if (version_id != 4) {
3744         ret = -EINVAL;
3745     }
3746
3747     if (!migrate_use_compression()) {
3748         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3749     }
3750     /* This RCU critical section can be very long running.
3751      * When RCU reclaims in the code start to become numerous,
3752      * it will be necessary to reduce the granularity of this
3753      * critical section.
3754      */
3755     rcu_read_lock();
3756
3757     if (postcopy_running) {
3758         ret = ram_load_postcopy(f);
3759     }
3760
3761     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3762         ram_addr_t addr, total_ram_bytes;
3763         void *host = NULL;
3764         uint8_t ch;
3765
3766         addr = qemu_get_be64(f);
3767         flags = addr & ~TARGET_PAGE_MASK;
3768         addr &= TARGET_PAGE_MASK;
3769
3770         if (flags & invalid_flags) {
3771             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3772                 error_report("Received an unexpected compressed page");
3773             }
3774
3775             ret = -EINVAL;
3776             break;
3777         }
3778
3779         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3780                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3781             RAMBlock *block = ram_block_from_stream(f, flags);
3782
3783             host = host_from_ram_block_offset(block, addr);
3784             if (!host) {
3785                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3786                 ret = -EINVAL;
3787                 break;
3788             }
3789             ramblock_recv_bitmap_set(block, host);
3790             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3791         }
3792
3793         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3794         case RAM_SAVE_FLAG_MEM_SIZE:
3795             /* Synchronize RAM block list */
3796             total_ram_bytes = addr;
3797             while (!ret && total_ram_bytes) {
3798                 RAMBlock *block;
3799                 char id[256];
3800                 ram_addr_t length;
3801
3802                 len = qemu_get_byte(f);
3803                 qemu_get_buffer(f, (uint8_t *)id, len);
3804                 id[len] = 0;
3805                 length = qemu_get_be64(f);
3806
3807                 block = qemu_ram_block_by_name(id);
3808                 if (block && !qemu_ram_is_migratable(block)) {
3809                     error_report("block %s should not be migrated !", id);
3810                     ret = -EINVAL;
3811                 } else if (block) {
3812                     if (length != block->used_length) {
3813                         Error *local_err = NULL;
3814
3815                         ret = qemu_ram_resize(block, length,
3816                                               &local_err);
3817                         if (local_err) {
3818                             error_report_err(local_err);
3819                         }
3820                     }
3821                     /* For postcopy we need to check hugepage sizes match */
3822                     if (postcopy_advised &&
3823                         block->page_size != qemu_host_page_size) {
3824                         uint64_t remote_page_size = qemu_get_be64(f);
3825                         if (remote_page_size != block->page_size) {
3826                             error_report("Mismatched RAM page size %s "
3827                                          "(local) %zd != %" PRId64,
3828                                          id, block->page_size,
3829                                          remote_page_size);
3830                             ret = -EINVAL;
3831                         }
3832                     }
3833                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3834                                           block->idstr);
3835                 } else {
3836                     error_report("Unknown ramblock \"%s\", cannot "
3837                                  "accept migration", id);
3838                     ret = -EINVAL;
3839                 }
3840
3841                 total_ram_bytes -= length;
3842             }
3843             break;
3844
3845         case RAM_SAVE_FLAG_ZERO:
3846             ch = qemu_get_byte(f);
3847             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3848             break;
3849
3850         case RAM_SAVE_FLAG_PAGE:
3851             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3852             break;
3853
3854         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3855             len = qemu_get_be32(f);
3856             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3857                 error_report("Invalid compressed data length: %d", len);
3858                 ret = -EINVAL;
3859                 break;
3860             }
3861             decompress_data_with_multi_threads(f, host, len);
3862             break;
3863
3864         case RAM_SAVE_FLAG_XBZRLE:
3865             if (load_xbzrle(f, addr, host) < 0) {
3866                 error_report("Failed to decompress XBZRLE page at "
3867                              RAM_ADDR_FMT, addr);
3868                 ret = -EINVAL;
3869                 break;
3870             }
3871             break;
3872         case RAM_SAVE_FLAG_EOS:
3873             /* normal exit */
3874             multifd_recv_sync_main();
3875             break;
3876         default:
3877             if (flags & RAM_SAVE_FLAG_HOOK) {
3878                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3879             } else {
3880                 error_report("Unknown combination of migration flags: %#x",
3881                              flags);
3882                 ret = -EINVAL;
3883             }
3884         }
3885         if (!ret) {
3886             ret = qemu_file_get_error(f);
3887         }
3888     }
3889
3890     ret |= wait_for_decompress_done();
3891     rcu_read_unlock();
3892     trace_ram_load_complete(ret, seq_iter);
3893     return ret;
3894 }
3895
/*
 * has_postcopy handler for RAM: simply forwards the answer of
 * migrate_postcopy_ram().  @opaque is not used.
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_ram = migrate_postcopy_ram();

    return postcopy_ram;
}
3900
3901 /* Sync all the dirty bitmap with destination VM.  */
3902 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3903 {
3904     RAMBlock *block;
3905     QEMUFile *file = s->to_dst_file;
3906     int ramblock_count = 0;
3907
3908     trace_ram_dirty_bitmap_sync_start();
3909
3910     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3911         qemu_savevm_send_recv_bitmap(file, block->idstr);
3912         trace_ram_dirty_bitmap_request(block->idstr);
3913         ramblock_count++;
3914     }
3915
3916     trace_ram_dirty_bitmap_sync_wait();
3917
3918     /* Wait until all the ramblocks' dirty bitmap synced */
3919     while (ramblock_count--) {
3920         qemu_sem_wait(&s->rp_state.rp_sem);
3921     }
3922
3923     trace_ram_dirty_bitmap_sync_complete();
3924
3925     return 0;
3926 }
3927
3928 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3929 {
3930     qemu_sem_post(&s->rp_state.rp_sem);
3931 }
3932
3933 /*
3934  * Read the received bitmap, revert it as the initial dirty bitmap.
3935  * This is only used when the postcopy migration is paused but wants
3936  * to resume from a middle point.
3937  */
3938 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3939 {
3940     int ret = -EINVAL;
3941     QEMUFile *file = s->rp_state.from_dst_file;
3942     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3943     uint64_t local_size = nbits / 8;
3944     uint64_t size, end_mark;
3945
3946     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3947
3948     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3949         error_report("%s: incorrect state %s", __func__,
3950                      MigrationStatus_str(s->state));
3951         return -EINVAL;
3952     }
3953
3954     /*
3955      * Note: see comments in ramblock_recv_bitmap_send() on why we
3956      * need the endianess convertion, and the paddings.
3957      */
3958     local_size = ROUND_UP(local_size, 8);
3959
3960     /* Add paddings */
3961     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3962
3963     size = qemu_get_be64(file);
3964
3965     /* The size of the bitmap should match with our ramblock */
3966     if (size != local_size) {
3967         error_report("%s: ramblock '%s' bitmap size mismatch "
3968                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3969                      block->idstr, size, local_size);
3970         ret = -EINVAL;
3971         goto out;
3972     }
3973
3974     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3975     end_mark = qemu_get_be64(file);
3976
3977     ret = qemu_file_get_error(file);
3978     if (ret || size != local_size) {
3979         error_report("%s: read bitmap failed for ramblock '%s': %d"
3980                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3981                      __func__, block->idstr, ret, local_size, size);
3982         ret = -EIO;
3983         goto out;
3984     }
3985
3986     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3987         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3988                      __func__, block->idstr, end_mark);
3989         ret = -EINVAL;
3990         goto out;
3991     }
3992
3993     /*
3994      * Endianess convertion. We are during postcopy (though paused).
3995      * The dirty bitmap won't change. We can directly modify it.
3996      */
3997     bitmap_from_le(block->bmap, le_bitmap, nbits);
3998
3999     /*
4000      * What we received is "received bitmap". Revert it as the initial
4001      * dirty bitmap for this ramblock.
4002      */
4003     bitmap_complement(block->bmap, block->bmap, nbits);
4004
4005     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4006
4007     /*
4008      * We succeeded to sync bitmap for current ramblock. If this is
4009      * the last one to sync, we need to notify the main send thread.
4010      */
4011     ram_dirty_bitmap_reload_notify(s);
4012
4013     ret = 0;
4014 out:
4015     g_free(le_bitmap);
4016     return ret;
4017 }
4018
4019 static int ram_resume_prepare(MigrationState *s, void *opaque)
4020 {
4021     RAMState *rs = *(RAMState **)opaque;
4022     int ret;
4023
4024     ret = ram_dirty_bitmap_sync_all(s, rs);
4025     if (ret) {
4026         return ret;
4027     }
4028
4029     ram_state_resume_prepare(rs, s->to_dst_file);
4030
4031     return 0;
4032 }
4033
4034 static SaveVMHandlers savevm_ram_handlers = {
4035     .save_setup = ram_save_setup,
4036     .save_live_iterate = ram_save_iterate,
4037     .save_live_complete_postcopy = ram_save_complete,
4038     .save_live_complete_precopy = ram_save_complete,
4039     .has_postcopy = ram_has_postcopy,
4040     .save_live_pending = ram_save_pending,
4041     .load_state = ram_load,
4042     .save_cleanup = ram_save_cleanup,
4043     .load_setup = ram_load_setup,
4044     .load_cleanup = ram_load_cleanup,
4045     .resume_prepare = ram_resume_prepare,
4046 };
4047
4048 void ram_mig_init(void)
4049 {
4050     qemu_mutex_init(&XBZRLE.lock);
4051     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4052 }