[qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "migration/block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
57 #include "savevm.h"
59 /***********************************************************/
60 /* ram save/restore */
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
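/*
 * Example (sketch): the flags above live in the low bits of the 64-bit
 * page address written by save_page_header(), below TARGET_PAGE_SIZE.
 * A normal page at block offset 0x200000 is announced on the wire as
 *
 *     qemu_put_be64(f, 0x200000 | RAM_SAVE_FLAG_PAGE);    => 0x200008
 *
 * and the load side can recover the two halves again with
 *
 *     flags = addr & ~TARGET_PAGE_MASK;
 *     addr &= TARGET_PAGE_MASK;
 */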
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 return buffer_is_zero(p, size);
83 XBZRLECacheStats xbzrle_counters;
85 /* This struct contains the XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
101 static void XBZRLE_cache_lock(void)
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
107 static void XBZRLE_cache_unlock(void)
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
114 * xbzrle_cache_resize: resize the xbzrle cache
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by the XBZRLE.lock mutex.
121 * Returns 0 for success or -1 for error
123 * @new_size: new cache size
124 * @errp: set *errp if the check failed, with the reason
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 PageCache *new_cache;
129 int64_t ret = 0;
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
143 XBZRLE_cache_lock();
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
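/*
 * Worked example (sketch): on a 32-bit host, size_t is 32 bits wide, so a
 * request of 5 GiB (0x140000000) truncates to 0x40000000 in the
 * (size_t)new_size cast above and the function fails with "cache size
 * exceeding address space". Resizing to the value already reported by
 * migrate_xbzrle_cache_size() is a no-op and returns 0.
 */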
160 static void ramblock_recv_map_init(void)
162 RAMBlock *rb;
164 RAMBLOCK_FOREACH(rb) {
165 assert(!rb->receivedmap);
166 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
170 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
172 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
173 rb->receivedmap);
176 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
178 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
181 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
183 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
186 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
187 size_t nr)
189 bitmap_set_atomic(rb->receivedmap,
190 ramblock_recv_bitmap_offset(host_addr, rb),
191 nr);
194 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
197 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
199 * Returns the number of bytes sent (>0) on success, or <0 on error.
201 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
202 const char *block_name)
204 RAMBlock *block = qemu_ram_block_by_name(block_name);
205 unsigned long *le_bitmap, nbits;
206 uint64_t size;
208 if (!block) {
209 error_report("%s: invalid block name: %s", __func__, block_name);
210 return -1;
213 nbits = block->used_length >> TARGET_PAGE_BITS;
216 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
217 * machines we may need 4 more bytes for padding (see below
218 * comment). So extend it a bit beforehand.
220 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
223 * Always use little endian when sending the bitmap. This is
224 * required when the source and destination VMs are not using the
225 * same endianness. (Note: big endian won't work.)
227 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
229 /* Size of the bitmap, in bytes */
230 size = nbits / 8;
233 * size is always aligned to 8 bytes for 64bit machines, but it
234 * may not be true for 32bit machines. We need this padding to
235 * make sure the migration can survive even between 32bit and
236 * 64bit machines.
238 size = ROUND_UP(size, 8);
240 qemu_put_be64(file, size);
241 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
243 * Mark the end, in case the middle part is corrupted for some
244 * "mysterious" reason.
246 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
247 qemu_fflush(file);
249 free(le_bitmap);
251 if (qemu_file_get_error(file)) {
252 return qemu_file_get_error(file);
255 return size + sizeof(size);
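/*
 * Wire layout example (sketch): for a block with 1 MiB of used_length and
 * 4 KiB target pages, nbits = 256 and size = 32 bytes (already a multiple
 * of 8), so the stream carries
 *
 *     be64: 32                      bitmap size in bytes
 *     32 bytes of little-endian bitmap
 *     be64: 0x0123456789abcdef      RAMBLOCK_RECV_BITMAP_ENDING
 *
 * and the function returns 32 + 8 = 40.
 */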
259 * An outstanding page request, on the source, having been received
260 * and queued
262 struct RAMSrcPageRequest {
263 RAMBlock *rb;
264 hwaddr offset;
265 hwaddr len;
267 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
270 /* State of RAM for migration */
271 struct RAMState {
272 /* QEMUFile used for this migration */
273 QEMUFile *f;
274 /* Last block that we have visited searching for dirty pages */
275 RAMBlock *last_seen_block;
276 /* Last block from where we have sent data */
277 RAMBlock *last_sent_block;
278 /* Last dirty target page we have sent */
279 ram_addr_t last_page;
280 /* last ram version we have seen */
281 uint32_t last_version;
282 /* We are in the first round */
283 bool ram_bulk_stage;
284 /* How many times we have dirty too many pages */
285 int dirty_rate_high_cnt;
286 /* these variables are used for bitmap sync */
287 /* last time we did a full bitmap_sync */
288 int64_t time_last_bitmap_sync;
289 /* bytes transferred at start_time */
290 uint64_t bytes_xfer_prev;
291 /* number of dirty pages since start_time */
292 uint64_t num_dirty_pages_period;
293 /* xbzrle misses since the beginning of the period */
294 uint64_t xbzrle_cache_miss_prev;
295 /* number of iterations at the beginning of period */
296 uint64_t iterations_prev;
297 /* Iterations since start */
298 uint64_t iterations;
299 /* number of dirty bits in the bitmap */
300 uint64_t migration_dirty_pages;
301 /* protects modification of the bitmap */
302 QemuMutex bitmap_mutex;
303 /* The RAMBlock used in the last src_page_requests */
304 RAMBlock *last_req_rb;
305 /* Queue of outstanding page requests from the destination */
306 QemuMutex src_page_req_mutex;
307 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
309 typedef struct RAMState RAMState;
311 static RAMState *ram_state;
313 uint64_t ram_bytes_remaining(void)
315 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
316 0;
319 MigrationStats ram_counters;
321 /* used by the search for pages to send */
322 struct PageSearchStatus {
323 /* Current block being searched */
324 RAMBlock *block;
325 /* Current page to search from */
326 unsigned long page;
327 /* Set once we wrap around */
328 bool complete_round;
330 typedef struct PageSearchStatus PageSearchStatus;
332 struct CompressParam {
333 bool done;
334 bool quit;
335 QEMUFile *file;
336 QemuMutex mutex;
337 QemuCond cond;
338 RAMBlock *block;
339 ram_addr_t offset;
341 /* internally used fields */
342 z_stream stream;
343 uint8_t *originbuf;
345 typedef struct CompressParam CompressParam;
347 struct DecompressParam {
348 bool done;
349 bool quit;
350 QemuMutex mutex;
351 QemuCond cond;
352 void *des;
353 uint8_t *compbuf;
354 int len;
355 z_stream stream;
357 typedef struct DecompressParam DecompressParam;
359 static CompressParam *comp_param;
360 static QemuThread *compress_threads;
361 /* comp_done_cond is used to wake up the migration thread when
362 * one of the compression threads has finished the compression.
363 * comp_done_lock is used together with comp_done_cond.
365 static QemuMutex comp_done_lock;
366 static QemuCond comp_done_cond;
367 /* The empty QEMUFileOps will be used by file in CompressParam */
368 static const QEMUFileOps empty_ops = { };
370 static QEMUFile *decomp_file;
371 static DecompressParam *decomp_param;
372 static QemuThread *decompress_threads;
373 static QemuMutex decomp_done_lock;
374 static QemuCond decomp_done_cond;
376 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
377 ram_addr_t offset, uint8_t *source_buf);
379 static void *do_data_compress(void *opaque)
381 CompressParam *param = opaque;
382 RAMBlock *block;
383 ram_addr_t offset;
385 qemu_mutex_lock(&param->mutex);
386 while (!param->quit) {
387 if (param->block) {
388 block = param->block;
389 offset = param->offset;
390 param->block = NULL;
391 qemu_mutex_unlock(&param->mutex);
393 do_compress_ram_page(param->file, &param->stream, block, offset,
394 param->originbuf);
396 qemu_mutex_lock(&comp_done_lock);
397 param->done = true;
398 qemu_cond_signal(&comp_done_cond);
399 qemu_mutex_unlock(&comp_done_lock);
401 qemu_mutex_lock(&param->mutex);
402 } else {
403 qemu_cond_wait(&param->cond, &param->mutex);
406 qemu_mutex_unlock(&param->mutex);
408 return NULL;
411 static inline void terminate_compression_threads(void)
413 int idx, thread_count;
415 thread_count = migrate_compress_threads();
417 for (idx = 0; idx < thread_count; idx++) {
418 qemu_mutex_lock(&comp_param[idx].mutex);
419 comp_param[idx].quit = true;
420 qemu_cond_signal(&comp_param[idx].cond);
421 qemu_mutex_unlock(&comp_param[idx].mutex);
425 static void compress_threads_save_cleanup(void)
427 int i, thread_count;
429 if (!migrate_use_compression()) {
430 return;
432 terminate_compression_threads();
433 thread_count = migrate_compress_threads();
434 for (i = 0; i < thread_count; i++) {
436 * we use it as an indicator of whether the thread is
437 * properly initialized or not
439 if (!comp_param[i].file) {
440 break;
442 qemu_thread_join(compress_threads + i);
443 qemu_mutex_destroy(&comp_param[i].mutex);
444 qemu_cond_destroy(&comp_param[i].cond);
445 deflateEnd(&comp_param[i].stream);
446 g_free(comp_param[i].originbuf);
447 qemu_fclose(comp_param[i].file);
448 comp_param[i].file = NULL;
450 qemu_mutex_destroy(&comp_done_lock);
451 qemu_cond_destroy(&comp_done_cond);
452 g_free(compress_threads);
453 g_free(comp_param);
454 compress_threads = NULL;
455 comp_param = NULL;
458 static int compress_threads_save_setup(void)
460 int i, thread_count;
462 if (!migrate_use_compression()) {
463 return 0;
465 thread_count = migrate_compress_threads();
466 compress_threads = g_new0(QemuThread, thread_count);
467 comp_param = g_new0(CompressParam, thread_count);
468 qemu_cond_init(&comp_done_cond);
469 qemu_mutex_init(&comp_done_lock);
470 for (i = 0; i < thread_count; i++) {
471 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
472 if (!comp_param[i].originbuf) {
473 goto exit;
476 if (deflateInit(&comp_param[i].stream,
477 migrate_compress_level()) != Z_OK) {
478 g_free(comp_param[i].originbuf);
479 goto exit;
482 /* comp_param[i].file is just used as a dummy buffer to save data,
483 * set its ops to empty.
485 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
486 comp_param[i].done = true;
487 comp_param[i].quit = false;
488 qemu_mutex_init(&comp_param[i].mutex);
489 qemu_cond_init(&comp_param[i].cond);
490 qemu_thread_create(compress_threads + i, "compress",
491 do_data_compress, comp_param + i,
492 QEMU_THREAD_JOINABLE);
494 return 0;
496 exit:
497 compress_threads_save_cleanup();
498 return -1;
501 /* Multiple fd's */
503 #define MULTIFD_MAGIC 0x11223344U
504 #define MULTIFD_VERSION 1
506 typedef struct {
507 uint32_t magic;
508 uint32_t version;
509 unsigned char uuid[16]; /* QemuUUID */
510 uint8_t id;
511 } __attribute__((packed)) MultiFDInit_t;
513 typedef struct {
514 /* these fields are not changed once the thread is created */
515 /* channel number */
516 uint8_t id;
517 /* channel thread name */
518 char *name;
519 /* channel thread id */
520 QemuThread thread;
521 /* communication channel */
522 QIOChannel *c;
523 /* sem where to wait for more work */
524 QemuSemaphore sem;
525 /* this mutex protects the following parameters */
526 QemuMutex mutex;
527 /* is this channel thread running */
528 bool running;
529 /* should this thread finish */
530 bool quit;
531 } MultiFDSendParams;
533 typedef struct {
534 /* these fields are not changed once the thread is created */
535 /* channel number */
536 uint8_t id;
537 /* channel thread name */
538 char *name;
539 /* channel thread id */
540 QemuThread thread;
541 /* communication channel */
542 QIOChannel *c;
543 /* sem where to wait for more work */
544 QemuSemaphore sem;
545 /* this mutex protects the following parameters */
546 QemuMutex mutex;
547 /* is this channel thread running */
548 bool running;
549 /* should this thread finish */
550 bool quit;
551 } MultiFDRecvParams;
553 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
555 MultiFDInit_t msg;
556 int ret;
558 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
559 msg.version = cpu_to_be32(MULTIFD_VERSION);
560 msg.id = p->id;
561 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
563 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
564 if (ret != 0) {
565 return -1;
567 return 0;
570 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
572 MultiFDInit_t msg;
573 int ret;
575 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
576 if (ret != 0) {
577 return -1;
580 be32_to_cpus(&msg.magic);
581 be32_to_cpus(&msg.version);
583 if (msg.magic != MULTIFD_MAGIC) {
584 error_setg(errp, "multifd: received packet magic %x "
585 "expected %x", msg.magic, MULTIFD_MAGIC);
586 return -1;
589 if (msg.version != MULTIFD_VERSION) {
590 error_setg(errp, "multifd: received packet version %d "
591 "expected %d", msg.version, MULTIFD_VERSION);
592 return -1;
595 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
596 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
597 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
599 error_setg(errp, "multifd: received uuid '%s' and expected "
600 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
601 g_free(uuid);
602 g_free(msg_uuid);
603 return -1;
606 if (msg.id > migrate_multifd_channels()) {
607 error_setg(errp, "multifd: received channel id %d "
608 "expected at most %d", msg.id, migrate_multifd_channels());
609 return -1;
612 return msg.id;
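/*
 * Example (sketch): because MultiFDInit_t is packed, the initial packet is
 * 25 bytes on the wire:
 *
 *     4 bytes   magic   0x11223344, big endian
 *     4 bytes   version 0x00000001, big endian
 *    16 bytes   uuid of the source VM
 *     1 byte    channel id
 *
 * The receiving side checks magic, version and uuid, and returns the
 * channel id on success.
 */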
615 struct {
616 MultiFDSendParams *params;
617 /* number of created threads */
618 int count;
619 } *multifd_send_state;
621 static void multifd_send_terminate_threads(Error *err)
623 int i;
625 if (err) {
626 MigrationState *s = migrate_get_current();
627 migrate_set_error(s, err);
628 if (s->state == MIGRATION_STATUS_SETUP ||
629 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
630 s->state == MIGRATION_STATUS_DEVICE ||
631 s->state == MIGRATION_STATUS_ACTIVE) {
632 migrate_set_state(&s->state, s->state,
633 MIGRATION_STATUS_FAILED);
637 for (i = 0; i < migrate_multifd_channels(); i++) {
638 MultiFDSendParams *p = &multifd_send_state->params[i];
640 qemu_mutex_lock(&p->mutex);
641 p->quit = true;
642 qemu_sem_post(&p->sem);
643 qemu_mutex_unlock(&p->mutex);
647 int multifd_save_cleanup(Error **errp)
649 int i;
650 int ret = 0;
652 if (!migrate_use_multifd()) {
653 return 0;
655 multifd_send_terminate_threads(NULL);
656 for (i = 0; i < migrate_multifd_channels(); i++) {
657 MultiFDSendParams *p = &multifd_send_state->params[i];
659 if (p->running) {
660 qemu_thread_join(&p->thread);
662 socket_send_channel_destroy(p->c);
663 p->c = NULL;
664 qemu_mutex_destroy(&p->mutex);
665 qemu_sem_destroy(&p->sem);
666 g_free(p->name);
667 p->name = NULL;
669 g_free(multifd_send_state->params);
670 multifd_send_state->params = NULL;
671 g_free(multifd_send_state);
672 multifd_send_state = NULL;
673 return ret;
676 static void *multifd_send_thread(void *opaque)
678 MultiFDSendParams *p = opaque;
679 Error *local_err = NULL;
681 if (multifd_send_initial_packet(p, &local_err) < 0) {
682 goto out;
685 while (true) {
686 qemu_mutex_lock(&p->mutex);
687 if (p->quit) {
688 qemu_mutex_unlock(&p->mutex);
689 break;
691 qemu_mutex_unlock(&p->mutex);
692 qemu_sem_wait(&p->sem);
695 out:
696 if (local_err) {
697 multifd_send_terminate_threads(local_err);
700 qemu_mutex_lock(&p->mutex);
701 p->running = false;
702 qemu_mutex_unlock(&p->mutex);
704 return NULL;
707 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
709 MultiFDSendParams *p = opaque;
710 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
711 Error *local_err = NULL;
713 if (qio_task_propagate_error(task, &local_err)) {
714 if (multifd_save_cleanup(&local_err) != 0) {
715 migrate_set_error(migrate_get_current(), local_err);
717 } else {
718 p->c = QIO_CHANNEL(sioc);
719 qio_channel_set_delay(p->c, false);
720 p->running = true;
721 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
722 QEMU_THREAD_JOINABLE);
724 atomic_inc(&multifd_send_state->count);
728 int multifd_save_setup(void)
730 int thread_count;
731 uint8_t i;
733 if (!migrate_use_multifd()) {
734 return 0;
736 thread_count = migrate_multifd_channels();
737 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
738 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
739 atomic_set(&multifd_send_state->count, 0);
740 for (i = 0; i < thread_count; i++) {
741 MultiFDSendParams *p = &multifd_send_state->params[i];
743 qemu_mutex_init(&p->mutex);
744 qemu_sem_init(&p->sem, 0);
745 p->quit = false;
746 p->id = i;
747 p->name = g_strdup_printf("multifdsend_%d", i);
748 socket_send_channel_create(multifd_new_send_channel_async, p);
750 return 0;
753 struct {
754 MultiFDRecvParams *params;
755 /* number of created threads */
756 int count;
757 } *multifd_recv_state;
759 static void multifd_recv_terminate_threads(Error *err)
761 int i;
763 if (err) {
764 MigrationState *s = migrate_get_current();
765 migrate_set_error(s, err);
766 if (s->state == MIGRATION_STATUS_SETUP ||
767 s->state == MIGRATION_STATUS_ACTIVE) {
768 migrate_set_state(&s->state, s->state,
769 MIGRATION_STATUS_FAILED);
773 for (i = 0; i < migrate_multifd_channels(); i++) {
774 MultiFDRecvParams *p = &multifd_recv_state->params[i];
776 qemu_mutex_lock(&p->mutex);
777 p->quit = true;
778 qemu_sem_post(&p->sem);
779 qemu_mutex_unlock(&p->mutex);
783 int multifd_load_cleanup(Error **errp)
785 int i;
786 int ret = 0;
788 if (!migrate_use_multifd()) {
789 return 0;
791 multifd_recv_terminate_threads(NULL);
792 for (i = 0; i < migrate_multifd_channels(); i++) {
793 MultiFDRecvParams *p = &multifd_recv_state->params[i];
795 if (p->running) {
796 qemu_thread_join(&p->thread);
798 object_unref(OBJECT(p->c));
799 p->c = NULL;
800 qemu_mutex_destroy(&p->mutex);
801 qemu_sem_destroy(&p->sem);
802 g_free(p->name);
803 p->name = NULL;
805 g_free(multifd_recv_state->params);
806 multifd_recv_state->params = NULL;
807 g_free(multifd_recv_state);
808 multifd_recv_state = NULL;
810 return ret;
813 static void *multifd_recv_thread(void *opaque)
815 MultiFDRecvParams *p = opaque;
817 while (true) {
818 qemu_mutex_lock(&p->mutex);
819 if (p->quit) {
820 qemu_mutex_unlock(&p->mutex);
821 break;
823 qemu_mutex_unlock(&p->mutex);
824 qemu_sem_wait(&p->sem);
827 qemu_mutex_lock(&p->mutex);
828 p->running = false;
829 qemu_mutex_unlock(&p->mutex);
831 return NULL;
834 int multifd_load_setup(void)
836 int thread_count;
837 uint8_t i;
839 if (!migrate_use_multifd()) {
840 return 0;
842 thread_count = migrate_multifd_channels();
843 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
844 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
845 atomic_set(&multifd_recv_state->count, 0);
846 for (i = 0; i < thread_count; i++) {
847 MultiFDRecvParams *p = &multifd_recv_state->params[i];
849 qemu_mutex_init(&p->mutex);
850 qemu_sem_init(&p->sem, 0);
851 p->quit = false;
852 p->id = i;
853 p->name = g_strdup_printf("multifdrecv_%d", i);
855 return 0;
858 bool multifd_recv_all_channels_created(void)
860 int thread_count = migrate_multifd_channels();
862 if (!migrate_use_multifd()) {
863 return true;
866 return thread_count == atomic_read(&multifd_recv_state->count);
869 void multifd_recv_new_channel(QIOChannel *ioc)
871 MultiFDRecvParams *p;
872 Error *local_err = NULL;
873 int id;
875 id = multifd_recv_initial_packet(ioc, &local_err);
876 if (id < 0) {
877 multifd_recv_terminate_threads(local_err);
878 return;
881 p = &multifd_recv_state->params[id];
882 if (p->c != NULL) {
883 error_setg(&local_err, "multifd: received id '%d' already setup'",
884 id);
885 multifd_recv_terminate_threads(local_err);
886 return;
888 p->c = ioc;
889 object_ref(OBJECT(ioc));
891 p->running = true;
892 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
893 QEMU_THREAD_JOINABLE);
894 atomic_inc(&multifd_recv_state->count);
895 if (multifd_recv_state->count == migrate_multifd_channels()) {
896 migration_incoming_process();
901 * save_page_header: write page header to wire
903 * If the page is not a continuation of the last sent block, it also writes the block identification
905 * Returns the number of bytes written
907 * @f: QEMUFile where to send the data
908 * @block: block that contains the page we want to send
909 * @offset: offset inside the block for the page;
910 * the lower bits contain flags
912 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
913 ram_addr_t offset)
915 size_t size, len;
917 if (block == rs->last_sent_block) {
918 offset |= RAM_SAVE_FLAG_CONTINUE;
920 qemu_put_be64(f, offset);
921 size = 8;
923 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
924 len = strlen(block->idstr);
925 qemu_put_byte(f, len);
926 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
927 size += 1 + len;
928 rs->last_sent_block = block;
930 return size;
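/*
 * Cost example (sketch): the first page sent from a block named "pc.ram"
 * takes 8 bytes (be64 offset + flags) + 1 byte (idstr length) + 6 bytes
 * (idstr) = 15 bytes of header; later pages from the same block carry
 * RAM_SAVE_FLAG_CONTINUE and need only the 8-byte header.
 */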
934 * mig_throttle_guest_down: throttle down the guest
936 * Reduce amount of guest cpu execution to hopefully slow down memory
937 * writes. If guest dirty memory rate is reduced below the rate at
938 * which we can transfer pages to the destination then we should be
939 * able to complete migration. Some workloads dirty memory way too
940 * fast and will not effectively converge, even with auto-converge.
942 static void mig_throttle_guest_down(void)
944 MigrationState *s = migrate_get_current();
945 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
946 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
948 /* We have not started throttling yet. Let's start it. */
949 if (!cpu_throttle_active()) {
950 cpu_throttle_set(pct_initial);
951 } else {
952 /* Throttling already on, just increase the rate */
953 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
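/*
 * Worked example (sketch, assuming cpu_throttle_initial=20 and
 * cpu_throttle_increment=10): the first call throttles the guest to 20%,
 * and each further call while throttling is active raises it to 30%, 40%
 * and so on, until the dirty rate drops below the transfer rate.
 */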
958 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
960 * @rs: current RAM state
961 * @current_addr: address for the zero page
963 * Update the xbzrle cache to reflect a page that's been sent as all 0.
964 * The important thing is that a stale (not-yet-0'd) page be replaced
965 * by the new data.
966 * As a bonus, if the page wasn't in the cache it gets added so that
967 * when a small write is made into the 0'd page it gets XBZRLE sent.
969 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
971 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
972 return;
975 /* We don't care if this fails to allocate a new cache page
976 * as long as it updated an old one */
977 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
978 ram_counters.dirty_sync_count);
981 #define ENCODING_FLAG_XBZRLE 0x1
984 * save_xbzrle_page: compress and send current page
986 * Returns: 1 means that we wrote the page
987 * 0 means that page is identical to the one already sent
988 * -1 means that xbzrle would be longer than normal
990 * @rs: current RAM state
991 * @current_data: pointer to the address of the page contents
992 * @current_addr: addr of the page
993 * @block: block that contains the page we want to send
994 * @offset: offset inside the block for the page
995 * @last_stage: if we are at the completion stage
997 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
998 ram_addr_t current_addr, RAMBlock *block,
999 ram_addr_t offset, bool last_stage)
1001 int encoded_len = 0, bytes_xbzrle;
1002 uint8_t *prev_cached_page;
1004 if (!cache_is_cached(XBZRLE.cache, current_addr,
1005 ram_counters.dirty_sync_count)) {
1006 xbzrle_counters.cache_miss++;
1007 if (!last_stage) {
1008 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1009 ram_counters.dirty_sync_count) == -1) {
1010 return -1;
1011 } else {
1012 /* update *current_data when the page has been
1013 inserted into cache */
1014 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1017 return -1;
1020 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1022 /* save current buffer into memory */
1023 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1025 /* XBZRLE encoding (if there is no overflow) */
1026 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1027 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1028 TARGET_PAGE_SIZE);
1029 if (encoded_len == 0) {
1030 trace_save_xbzrle_page_skipping();
1031 return 0;
1032 } else if (encoded_len == -1) {
1033 trace_save_xbzrle_page_overflow();
1034 xbzrle_counters.overflow++;
1035 /* update data in the cache */
1036 if (!last_stage) {
1037 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1038 *current_data = prev_cached_page;
1040 return -1;
1043 /* we need to update the data in the cache, so it matches what was sent */
1044 if (!last_stage) {
1045 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1048 /* Send XBZRLE based compressed page */
1049 bytes_xbzrle = save_page_header(rs, rs->f, block,
1050 offset | RAM_SAVE_FLAG_XBZRLE);
1051 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1052 qemu_put_be16(rs->f, encoded_len);
1053 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1054 bytes_xbzrle += encoded_len + 1 + 2;
1055 xbzrle_counters.pages++;
1056 xbzrle_counters.bytes += bytes_xbzrle;
1057 ram_counters.transferred += bytes_xbzrle;
1059 return 1;
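/*
 * Accounting example (sketch): bytes_xbzrle is the page header plus
 * 1 byte (ENCODING_FLAG_XBZRLE) + 2 bytes (be16 encoded_len) + encoded_len.
 * A continuing page whose delta encodes to 100 bytes therefore costs
 * 8 + 1 + 2 + 100 = 111 bytes instead of a full TARGET_PAGE_SIZE page.
 */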
1063 * migration_bitmap_find_dirty: find the next dirty page from start
1065 * Called with rcu_read_lock() to protect migration_bitmap
1067 * Returns the byte offset within memory region of the start of a dirty page
1069 * @rs: current RAM state
1070 * @rb: RAMBlock where to search for dirty pages
1071 * @start: page where we start the search
1073 static inline
1074 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1075 unsigned long start)
1077 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1078 unsigned long *bitmap = rb->bmap;
1079 unsigned long next;
1081 if (rs->ram_bulk_stage && start > 0) {
1082 next = start + 1;
1083 } else {
1084 next = find_next_bit(bitmap, size, start);
1087 return next;
1090 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1091 RAMBlock *rb,
1092 unsigned long page)
1094 bool ret;
1096 ret = test_and_clear_bit(page, rb->bmap);
1098 if (ret) {
1099 rs->migration_dirty_pages--;
1101 return ret;
1104 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1105 ram_addr_t start, ram_addr_t length)
1107 rs->migration_dirty_pages +=
1108 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1109 &rs->num_dirty_pages_period);
1113 * ram_pagesize_summary: calculate all the pagesizes of a VM
1115 * Returns a summary bitmap of the page sizes of all RAMBlocks
1117 * For VMs with just normal pages this is equivalent to the host page
1118 * size. If it's got some huge pages then it's the OR of all the
1119 * different page sizes.
1121 uint64_t ram_pagesize_summary(void)
1123 RAMBlock *block;
1124 uint64_t summary = 0;
1126 RAMBLOCK_FOREACH(block) {
1127 summary |= block->page_size;
1130 return summary;
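/*
 * Example (sketch): a guest whose RAM is split between normal 4 KiB pages
 * and a 2 MiB hugepage-backed region gets a summary of
 * 0x1000 | 0x200000 = 0x201000.
 */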
1133 static void migration_bitmap_sync(RAMState *rs)
1135 RAMBlock *block;
1136 int64_t end_time;
1137 uint64_t bytes_xfer_now;
1139 ram_counters.dirty_sync_count++;
1141 if (!rs->time_last_bitmap_sync) {
1142 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1145 trace_migration_bitmap_sync_start();
1146 memory_global_dirty_log_sync();
1148 qemu_mutex_lock(&rs->bitmap_mutex);
1149 rcu_read_lock();
1150 RAMBLOCK_FOREACH(block) {
1151 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1153 rcu_read_unlock();
1154 qemu_mutex_unlock(&rs->bitmap_mutex);
1156 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1158 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1160 /* more than 1 second = 1000 milliseconds */
1161 if (end_time > rs->time_last_bitmap_sync + 1000) {
1162 /* calculate period counters */
1163 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1164 / (end_time - rs->time_last_bitmap_sync);
1165 bytes_xfer_now = ram_counters.transferred;
1167 /* During block migration the auto-converge logic incorrectly detects
1168 * that ram migration makes no progress. Avoid this by disabling the
1169 * throttling logic during the bulk phase of block migration. */
1170 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1171 /* The following detection logic can be refined later. For now:
1172 Check to see if the dirtied bytes are 50% more than the approx.
1173 amount of bytes that just got transferred since the last time we
1174 were in this routine. If that happens twice, start or increase
1175 throttling */
1177 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1178 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1179 (++rs->dirty_rate_high_cnt >= 2)) {
1180 trace_migration_throttle();
1181 rs->dirty_rate_high_cnt = 0;
1182 mig_throttle_guest_down();
1186 if (migrate_use_xbzrle()) {
1187 if (rs->iterations_prev != rs->iterations) {
1188 xbzrle_counters.cache_miss_rate =
1189 (double)(xbzrle_counters.cache_miss -
1190 rs->xbzrle_cache_miss_prev) /
1191 (rs->iterations - rs->iterations_prev);
1193 rs->iterations_prev = rs->iterations;
1194 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1197 /* reset period counters */
1198 rs->time_last_bitmap_sync = end_time;
1199 rs->num_dirty_pages_period = 0;
1200 rs->bytes_xfer_prev = bytes_xfer_now;
1202 if (migrate_use_events()) {
1203 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
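/*
 * Rate example (sketch): if 50,000 pages were dirtied during a 2,000 ms
 * period, dirty_pages_rate becomes 50000 * 1000 / 2000 = 25,000 pages/s,
 * i.e. roughly 100 MB/s of newly dirtied memory with 4 KiB target pages.
 */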
1208 * save_zero_page: send the zero page to the stream
1210 * Returns the number of pages written.
1212 * @rs: current RAM state
1213 * @block: block that contains the page we want to send
1214 * @offset: offset inside the block for the page
1216 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1218 uint8_t *p = block->host + offset;
1219 int pages = -1;
1221 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1222 ram_counters.duplicate++;
1223 ram_counters.transferred +=
1224 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1225 qemu_put_byte(rs->f, 0);
1226 ram_counters.transferred += 1;
1227 pages = 1;
1230 return pages;
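/*
 * Cost example (sketch): a zero page costs its save_page_header() plus a
 * single zero byte, i.e. 9 bytes for a page that continues the current
 * block, regardless of TARGET_PAGE_SIZE.
 */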
1233 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1235 if (!migrate_release_ram() || !migration_in_postcopy()) {
1236 return;
1239 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1243 * @pages: the number of pages written by the control path,
1244 * < 0 - error
1245 * > 0 - number of pages written
1247 * Returns true if the page has been saved, otherwise false.
1249 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1250 int *pages)
1252 uint64_t bytes_xmit = 0;
1253 int ret;
1255 *pages = -1;
1256 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1257 &bytes_xmit);
1258 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1259 return false;
1262 if (bytes_xmit) {
1263 ram_counters.transferred += bytes_xmit;
1264 *pages = 1;
1267 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1268 return true;
1271 if (bytes_xmit > 0) {
1272 ram_counters.normal++;
1273 } else if (bytes_xmit == 0) {
1274 ram_counters.duplicate++;
1277 return true;
1281 * directly send the page to the stream
1283 * Returns the number of pages written.
1285 * @rs: current RAM state
1286 * @block: block that contains the page we want to send
1287 * @offset: offset inside the block for the page
1288 * @buf: the page to be sent
1289 * @async: send the page asynchronously
1291 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1292 uint8_t *buf, bool async)
1294 ram_counters.transferred += save_page_header(rs, rs->f, block,
1295 offset | RAM_SAVE_FLAG_PAGE);
1296 if (async) {
1297 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1298 migrate_release_ram() &
1299 migration_in_postcopy());
1300 } else {
1301 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1303 ram_counters.transferred += TARGET_PAGE_SIZE;
1304 ram_counters.normal++;
1305 return 1;
1309 * ram_save_page: send the given page to the stream
1311 * Returns the number of pages written.
1312 * < 0 - error
1313 * >=0 - Number of pages written - this might legally be 0
1314 * if xbzrle noticed the page was the same.
1316 * @rs: current RAM state
1317 * @block: block that contains the page we want to send
1318 * @offset: offset inside the block for the page
1319 * @last_stage: if we are at the completion stage
1321 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1323 int pages = -1;
1324 uint8_t *p;
1325 bool send_async = true;
1326 RAMBlock *block = pss->block;
1327 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1328 ram_addr_t current_addr = block->offset + offset;
1330 p = block->host + offset;
1331 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1333 XBZRLE_cache_lock();
1334 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1335 migrate_use_xbzrle()) {
1336 pages = save_xbzrle_page(rs, &p, current_addr, block,
1337 offset, last_stage);
1338 if (!last_stage) {
1339 /* Can't send this cached data async, since the cache page
1340 * might get updated before it gets to the wire
1342 send_async = false;
1346 /* XBZRLE overflow or normal page */
1347 if (pages == -1) {
1348 pages = save_normal_page(rs, block, offset, p, send_async);
1351 XBZRLE_cache_unlock();
1353 return pages;
1356 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1357 ram_addr_t offset, uint8_t *source_buf)
1359 RAMState *rs = ram_state;
1360 int bytes_sent, blen;
1361 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1363 bytes_sent = save_page_header(rs, f, block, offset |
1364 RAM_SAVE_FLAG_COMPRESS_PAGE);
1367 * copy it to an internal buffer to avoid it being modified by the VM,
1368 * so that we can catch any error during compression and
1369 * decompression
1371 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1372 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1373 if (blen < 0) {
1374 bytes_sent = 0;
1375 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1376 error_report("compressed data failed!");
1377 } else {
1378 bytes_sent += blen;
1379 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1382 return bytes_sent;
1385 static void flush_compressed_data(RAMState *rs)
1387 int idx, len, thread_count;
1389 if (!migrate_use_compression()) {
1390 return;
1392 thread_count = migrate_compress_threads();
1394 qemu_mutex_lock(&comp_done_lock);
1395 for (idx = 0; idx < thread_count; idx++) {
1396 while (!comp_param[idx].done) {
1397 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1400 qemu_mutex_unlock(&comp_done_lock);
1402 for (idx = 0; idx < thread_count; idx++) {
1403 qemu_mutex_lock(&comp_param[idx].mutex);
1404 if (!comp_param[idx].quit) {
1405 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1406 ram_counters.transferred += len;
1408 qemu_mutex_unlock(&comp_param[idx].mutex);
1412 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1413 ram_addr_t offset)
1415 param->block = block;
1416 param->offset = offset;
1419 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1420 ram_addr_t offset)
1422 int idx, thread_count, bytes_xmit = -1, pages = -1;
1424 thread_count = migrate_compress_threads();
1425 qemu_mutex_lock(&comp_done_lock);
1426 while (true) {
1427 for (idx = 0; idx < thread_count; idx++) {
1428 if (comp_param[idx].done) {
1429 comp_param[idx].done = false;
1430 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1431 qemu_mutex_lock(&comp_param[idx].mutex);
1432 set_compress_params(&comp_param[idx], block, offset);
1433 qemu_cond_signal(&comp_param[idx].cond);
1434 qemu_mutex_unlock(&comp_param[idx].mutex);
1435 pages = 1;
1436 ram_counters.normal++;
1437 ram_counters.transferred += bytes_xmit;
1438 break;
1441 if (pages > 0) {
1442 break;
1443 } else {
1444 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1447 qemu_mutex_unlock(&comp_done_lock);
1449 return pages;
1453 * find_dirty_block: find the next dirty page and update any state
1454 * associated with the search process.
1456 * Returns if a page is found
1458 * @rs: current RAM state
1459 * @pss: data about the state of the current dirty page scan
1460 * @again: set to false if the search has scanned the whole of RAM
1462 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1464 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1465 if (pss->complete_round && pss->block == rs->last_seen_block &&
1466 pss->page >= rs->last_page) {
1468 * We've been once around the RAM and haven't found anything.
1469 * Give up.
1471 *again = false;
1472 return false;
1474 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1475 /* Didn't find anything in this RAM Block */
1476 pss->page = 0;
1477 pss->block = QLIST_NEXT_RCU(pss->block, next);
1478 if (!pss->block) {
1479 /* Hit the end of the list */
1480 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1481 /* Flag that we've looped */
1482 pss->complete_round = true;
1483 rs->ram_bulk_stage = false;
1484 if (migrate_use_xbzrle()) {
1485 /* If xbzrle is on, stop using the data compression at this
1486 * point. In theory, xbzrle can do better than compression.
1488 flush_compressed_data(rs);
1491 /* Didn't find anything this time, but try again on the new block */
1492 *again = true;
1493 return false;
1494 } else {
1495 /* Can go around again, but... */
1496 *again = true;
1497 /* We've found something so probably don't need to */
1498 return true;
1503 * unqueue_page: gets a page off the queue
1505 * Helper for 'get_queued_page' - gets a page off the queue
1507 * Returns the block of the page (or NULL if none available)
1509 * @rs: current RAM state
1510 * @offset: used to return the offset within the RAMBlock
1512 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1514 RAMBlock *block = NULL;
1516 qemu_mutex_lock(&rs->src_page_req_mutex);
1517 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1518 struct RAMSrcPageRequest *entry =
1519 QSIMPLEQ_FIRST(&rs->src_page_requests);
1520 block = entry->rb;
1521 *offset = entry->offset;
1523 if (entry->len > TARGET_PAGE_SIZE) {
1524 entry->len -= TARGET_PAGE_SIZE;
1525 entry->offset += TARGET_PAGE_SIZE;
1526 } else {
1527 memory_region_unref(block->mr);
1528 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1529 g_free(entry);
1532 qemu_mutex_unlock(&rs->src_page_req_mutex);
1534 return block;
1538 * get_queued_page: unqueue a page from the postcopy requests
1540 * Skips pages that are already sent (!dirty)
1542 * Returns if a queued page is found
1544 * @rs: current RAM state
1545 * @pss: data about the state of the current dirty page scan
1547 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1549 RAMBlock *block;
1550 ram_addr_t offset;
1551 bool dirty;
1553 do {
1554 block = unqueue_page(rs, &offset);
1556 * We're sending this page, and since it's postcopy nothing else
1557 * will dirty it, and we must make sure it doesn't get sent again
1558 * even if this queue request was received after the background
1559 * search already sent it.
1561 if (block) {
1562 unsigned long page;
1564 page = offset >> TARGET_PAGE_BITS;
1565 dirty = test_bit(page, block->bmap);
1566 if (!dirty) {
1567 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1568 page, test_bit(page, block->unsentmap));
1569 } else {
1570 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1574 } while (block && !dirty);
1576 if (block) {
1578 * As soon as we start servicing pages out of order, then we have
1579 * to kill the bulk stage, since the bulk stage assumes
1580 * in (migration_bitmap_find_and_reset_dirty) that every page is
1581 * dirty, that's no longer true.
1583 rs->ram_bulk_stage = false;
1586 * We want the background search to continue from the queued page
1587 * since the guest is likely to want other pages near to the page
1588 * it just requested.
1590 pss->block = block;
1591 pss->page = offset >> TARGET_PAGE_BITS;
1594 return !!block;
1598 * migration_page_queue_free: drop any remaining pages in the ram
1599 * request queue
1601 * It should be empty at the end anyway, but in error cases there may
1602 * be some left. In case any pages are left, we drop them.
1605 static void migration_page_queue_free(RAMState *rs)
1607 struct RAMSrcPageRequest *mspr, *next_mspr;
1608 /* This queue generally should be empty - but in the case of a failed
1609 * migration it might have some entries left over.
1611 rcu_read_lock();
1612 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1613 memory_region_unref(mspr->rb->mr);
1614 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1615 g_free(mspr);
1617 rcu_read_unlock();
1621 * ram_save_queue_pages: queue the page for transmission
1623 * A request from postcopy destination for example.
1625 * Returns zero on success or negative on error
1627 * @rbname: Name of the RAMBlock of the request. NULL means the
1628 * same as the last one.
1629 * @start: starting address from the start of the RAMBlock
1630 * @len: length (in bytes) to send
1632 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1634 RAMBlock *ramblock;
1635 RAMState *rs = ram_state;
1637 ram_counters.postcopy_requests++;
1638 rcu_read_lock();
1639 if (!rbname) {
1640 /* Reuse last RAMBlock */
1641 ramblock = rs->last_req_rb;
1643 if (!ramblock) {
1645 * Shouldn't happen, we can't reuse the last RAMBlock if
1646 * it's the 1st request.
1648 error_report("ram_save_queue_pages no previous block");
1649 goto err;
1651 } else {
1652 ramblock = qemu_ram_block_by_name(rbname);
1654 if (!ramblock) {
1655 /* We shouldn't be asked for a non-existent RAMBlock */
1656 error_report("ram_save_queue_pages no block '%s'", rbname);
1657 goto err;
1659 rs->last_req_rb = ramblock;
1661 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1662 if (start+len > ramblock->used_length) {
1663 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1664 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1665 __func__, start, len, ramblock->used_length);
1666 goto err;
1669 struct RAMSrcPageRequest *new_entry =
1670 g_malloc0(sizeof(struct RAMSrcPageRequest));
1671 new_entry->rb = ramblock;
1672 new_entry->offset = start;
1673 new_entry->len = len;
1675 memory_region_ref(ramblock->mr);
1676 qemu_mutex_lock(&rs->src_page_req_mutex);
1677 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1678 qemu_mutex_unlock(&rs->src_page_req_mutex);
1679 rcu_read_unlock();
1681 return 0;
1683 err:
1684 rcu_read_unlock();
1685 return -1;
1688 static bool save_page_use_compression(RAMState *rs)
1690 if (!migrate_use_compression()) {
1691 return false;
1695 * If xbzrle is on, stop using the data compression after first
1696 * round of migration even if compression is enabled. In theory,
1697 * xbzrle can do better than compression.
1699 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1700 return true;
1703 return false;
1707 * ram_save_target_page: save one target page
1709 * Returns the number of pages written
1711 * @rs: current RAM state
1712 * @pss: data about the page we want to send
1713 * @last_stage: if we are at the completion stage
1715 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1716 bool last_stage)
1718 RAMBlock *block = pss->block;
1719 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1720 int res;
1722 if (control_save_page(rs, block, offset, &res)) {
1723 return res;
1727 * When starting the process of a new block, the first page of
1728 * the block should be sent out before other pages in the same
1729 * block, and all the pages in the last block should have been sent
1730 * out. Keeping this order is important, because the 'cont' flag
1731 * is used to avoid resending the block name.
1733 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1734 flush_compressed_data(rs);
1737 res = save_zero_page(rs, block, offset);
1738 if (res > 0) {
1739 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1740 * page would be stale
1742 if (!save_page_use_compression(rs)) {
1743 XBZRLE_cache_lock();
1744 xbzrle_cache_zero_page(rs, block->offset + offset);
1745 XBZRLE_cache_unlock();
1747 ram_release_pages(block->idstr, offset, res);
1748 return res;
1752 * Make sure the first page is sent out before other pages.
1754 * We post it as a normal page, as compression would take too much
1755 * CPU time.
1757 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1758 return compress_page_with_multi_thread(rs, block, offset);
1761 return ram_save_page(rs, pss, last_stage);
1765 * ram_save_host_page: save a whole host page
1767 * Starting at *offset send pages up to the end of the current host
1768 * page. It's valid for the initial offset to point into the middle of
1769 * a host page in which case the remainder of the hostpage is sent.
1770 * Only dirty target pages are sent. Note that the host page size may
1771 * be a huge page for this block.
1772 * The saving stops at the boundary of the used_length of the block
1773 * if the RAMBlock isn't a multiple of the host page size.
1775 * Returns the number of pages written or negative on error
1777 * @rs: current RAM state
1778 * @ms: current migration state
1779 * @pss: data about the page we want to send
1780 * @last_stage: if we are at the completion stage
1782 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1783 bool last_stage)
1785 int tmppages, pages = 0;
1786 size_t pagesize_bits =
1787 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1789 do {
1790 /* Check if the page is dirty and, if it is, send it */
1791 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1792 pss->page++;
1793 continue;
1796 tmppages = ram_save_target_page(rs, pss, last_stage);
1797 if (tmppages < 0) {
1798 return tmppages;
1801 pages += tmppages;
1802 if (pss->block->unsentmap) {
1803 clear_bit(pss->page, pss->block->unsentmap);
1806 pss->page++;
1807 } while ((pss->page & (pagesize_bits - 1)) &&
1808 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1810 /* The offset we leave with is the last one we looked at */
1811 pss->page--;
1812 return pages;
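/*
 * Example (sketch): with 2 MiB host huge pages and 4 KiB target pages,
 * pagesize_bits is 512, so a single call walks up to 512 consecutive
 * target pages; with normal 4 KiB host pages exactly one target page is
 * handled per call.
 */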
1816 * ram_find_and_save_block: finds a dirty page and sends it to f
1818 * Called within an RCU critical section.
1820 * Returns the number of pages written where zero means no dirty pages
1822 * @rs: current RAM state
1823 * @last_stage: if we are at the completion stage
1825 * On systems where host-page-size > target-page-size it will send all the
1826 * pages in a host page that are dirty.
1829 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1831 PageSearchStatus pss;
1832 int pages = 0;
1833 bool again, found;
1835 /* No dirty page as there is zero RAM */
1836 if (!ram_bytes_total()) {
1837 return pages;
1840 pss.block = rs->last_seen_block;
1841 pss.page = rs->last_page;
1842 pss.complete_round = false;
1844 if (!pss.block) {
1845 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1848 do {
1849 again = true;
1850 found = get_queued_page(rs, &pss);
1852 if (!found) {
1853 /* priority queue empty, so just search for something dirty */
1854 found = find_dirty_block(rs, &pss, &again);
1857 if (found) {
1858 pages = ram_save_host_page(rs, &pss, last_stage);
1860 } while (!pages && again);
1862 rs->last_seen_block = pss.block;
1863 rs->last_page = pss.page;
1865 return pages;
1868 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1870 uint64_t pages = size / TARGET_PAGE_SIZE;
1872 if (zero) {
1873 ram_counters.duplicate += pages;
1874 } else {
1875 ram_counters.normal += pages;
1876 ram_counters.transferred += size;
1877 qemu_update_position(f, size);
1881 uint64_t ram_bytes_total(void)
1883 RAMBlock *block;
1884 uint64_t total = 0;
1886 rcu_read_lock();
1887 RAMBLOCK_FOREACH(block) {
1888 total += block->used_length;
1890 rcu_read_unlock();
1891 return total;
1894 static void xbzrle_load_setup(void)
1896 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1899 static void xbzrle_load_cleanup(void)
1901 g_free(XBZRLE.decoded_buf);
1902 XBZRLE.decoded_buf = NULL;
1905 static void ram_state_cleanup(RAMState **rsp)
1907 if (*rsp) {
1908 migration_page_queue_free(*rsp);
1909 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1910 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1911 g_free(*rsp);
1912 *rsp = NULL;
1916 static void xbzrle_cleanup(void)
1918 XBZRLE_cache_lock();
1919 if (XBZRLE.cache) {
1920 cache_fini(XBZRLE.cache);
1921 g_free(XBZRLE.encoded_buf);
1922 g_free(XBZRLE.current_buf);
1923 g_free(XBZRLE.zero_target_page);
1924 XBZRLE.cache = NULL;
1925 XBZRLE.encoded_buf = NULL;
1926 XBZRLE.current_buf = NULL;
1927 XBZRLE.zero_target_page = NULL;
1929 XBZRLE_cache_unlock();
1932 static void ram_save_cleanup(void *opaque)
1934 RAMState **rsp = opaque;
1935 RAMBlock *block;
1937 /* the caller must hold the iothread lock or be in a bottom half, so there
1938 * is no write race against this migration_bitmap
1940 memory_global_dirty_log_stop();
1942 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1943 g_free(block->bmap);
1944 block->bmap = NULL;
1945 g_free(block->unsentmap);
1946 block->unsentmap = NULL;
1949 xbzrle_cleanup();
1950 compress_threads_save_cleanup();
1951 ram_state_cleanup(rsp);
1954 static void ram_state_reset(RAMState *rs)
1956 rs->last_seen_block = NULL;
1957 rs->last_sent_block = NULL;
1958 rs->last_page = 0;
1959 rs->last_version = ram_list.version;
1960 rs->ram_bulk_stage = true;
1963 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1966 * 'expected' is the value you expect the bitmap mostly to be full
1967 * of; it won't bother printing lines that are all this value.
1968 * If 'todump' is null the migration bitmap is dumped.
1970 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1971 unsigned long pages)
1973 int64_t cur;
1974 int64_t linelen = 128;
1975 char linebuf[129];
1977 for (cur = 0; cur < pages; cur += linelen) {
1978 int64_t curb;
1979 bool found = false;
1981 * Last line; catch the case where the line length
1982 * is longer than remaining ram
1984 if (cur + linelen > pages) {
1985 linelen = pages - cur;
1987 for (curb = 0; curb < linelen; curb++) {
1988 bool thisbit = test_bit(cur + curb, todump);
1989 linebuf[curb] = thisbit ? '1' : '.';
1990 found = found || (thisbit != expected);
1992 if (found) {
1993 linebuf[curb] = '\0';
1994 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1999 /* **** functions for postcopy ***** */
2001 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2003 struct RAMBlock *block;
2005 RAMBLOCK_FOREACH(block) {
2006 unsigned long *bitmap = block->bmap;
2007 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2008 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2010 while (run_start < range) {
2011 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2012 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2013 (run_end - run_start) << TARGET_PAGE_BITS);
2014 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2020 * postcopy_send_discard_bm_ram: discard a RAMBlock
2022 * Returns zero on success
2024 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2025 * Note: At this point the 'unsentmap' is the processed bitmap combined
2026 * with the dirtymap; so a '1' means it's either dirty or unsent.
2028 * @ms: current migration state
2029 * @pds: state for postcopy
2030 * @block: RAMBlock to discard
2033 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2034 PostcopyDiscardState *pds,
2035 RAMBlock *block)
2037 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2038 unsigned long current;
2039 unsigned long *unsentmap = block->unsentmap;
2041 for (current = 0; current < end; ) {
2042 unsigned long one = find_next_bit(unsentmap, end, current);
2044 if (one <= end) {
2045 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2046 unsigned long discard_length;
2048 if (zero >= end) {
2049 discard_length = end - one;
2050 } else {
2051 discard_length = zero - one;
2053 if (discard_length) {
2054 postcopy_discard_send_range(ms, pds, one, discard_length);
2056 current = one + discard_length;
2057 } else {
2058 current = one;
2062 return 0;
2066 * postcopy_each_ram_send_discard: discard all RAMBlocks
2068 * Returns 0 for success or negative for error
2070 * Utility for the outgoing postcopy code.
2071 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2072 * passing it bitmap indexes and name.
2073 * (qemu_ram_foreach_block ends up passing unscaled lengths
2074 * which would mean postcopy code would have to deal with target page)
2076 * @ms: current migration state
2078 static int postcopy_each_ram_send_discard(MigrationState *ms)
2080 struct RAMBlock *block;
2081 int ret;
2083 RAMBLOCK_FOREACH(block) {
2084 PostcopyDiscardState *pds =
2085 postcopy_discard_send_init(ms, block->idstr);
2088 * Postcopy sends chunks of bitmap over the wire, but it
2089 * just needs indexes at this point, avoids it having
2090 * target page specific code.
2092 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2093 postcopy_discard_send_finish(ms, pds);
2094 if (ret) {
2095 return ret;
2099 return 0;
2103 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2105 * Helper for postcopy_chunk_hostpages; it's called twice to
2106 * canonicalize the two bitmaps, that are similar, but one is
2107 * inverted.
2109 * Postcopy requires that all target pages in a hostpage are dirty or
2110 * clean, not a mix. This function canonicalizes the bitmaps.
2112 * @ms: current migration state
2113 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2114 * otherwise we need to canonicalize partially dirty host pages
2115 * @block: block that contains the page we want to canonicalize
2116 * @pds: state for postcopy
2118 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2119 RAMBlock *block,
2120 PostcopyDiscardState *pds)
2122 RAMState *rs = ram_state;
2123 unsigned long *bitmap = block->bmap;
2124 unsigned long *unsentmap = block->unsentmap;
2125 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2126 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2127 unsigned long run_start;
2129 if (block->page_size == TARGET_PAGE_SIZE) {
2130 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2131 return;
2134 if (unsent_pass) {
2135 /* Find a sent page */
2136 run_start = find_next_zero_bit(unsentmap, pages, 0);
2137 } else {
2138 /* Find a dirty page */
2139 run_start = find_next_bit(bitmap, pages, 0);
2142 while (run_start < pages) {
2143 bool do_fixup = false;
2144 unsigned long fixup_start_addr;
2145 unsigned long host_offset;
2148 * If the start of this run of pages is in the middle of a host
2149 * page, then we need to fixup this host page.
2151 host_offset = run_start % host_ratio;
2152 if (host_offset) {
2153 do_fixup = true;
2154 run_start -= host_offset;
2155 fixup_start_addr = run_start;
2156 /* For the next pass */
2157 run_start = run_start + host_ratio;
2158 } else {
2159 /* Find the end of this run */
2160 unsigned long run_end;
2161 if (unsent_pass) {
2162 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2163 } else {
2164 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2167 * If the end isn't at the start of a host page, then the
2168 * run doesn't finish at the end of a host page
2169 * and we need to discard.
2171 host_offset = run_end % host_ratio;
2172 if (host_offset) {
2173 do_fixup = true;
2174 fixup_start_addr = run_end - host_offset;
2176 * This host page has gone, the next loop iteration starts
2177 * from after the fixup
2179 run_start = fixup_start_addr + host_ratio;
2180 } else {
2182 * No discards on this iteration, next loop starts from
2183 * next sent/dirty page
2185 run_start = run_end + 1;
2189 if (do_fixup) {
2190 unsigned long page;
2192 /* Tell the destination to discard this page */
2193 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2194 /* For the unsent_pass we:
2195 * discard partially sent pages
2196 * For the !unsent_pass (dirty) we:
2197 * discard partially dirty pages that were sent
2198 * (any partially sent pages were already discarded
2199 * by the previous unsent_pass)
2201 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2202 host_ratio);
2205 /* Clean up the bitmap */
2206 for (page = fixup_start_addr;
2207 page < fixup_start_addr + host_ratio; page++) {
2208 /* All pages in this host page are now not sent */
2209 set_bit(page, unsentmap);
2212 * Remark them as dirty, updating the count for any pages
2213 * that weren't previously dirty.
2215 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2219 if (unsent_pass) {
2220 /* Find the next sent page for the next iteration */
2221 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2222 } else {
2223 /* Find the next dirty page for the next iteration */
2224 run_start = find_next_bit(bitmap, pages, run_start);
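/*
 * Worked example (illustrative, assuming 2MB host pages and 4KB target
 * pages, i.e. host_ratio == 512): if a run of dirty target pages starts
 * at page 513, it begins in the middle of the host page covering pages
 * 512..1023; the run start is rounded down to 512 and the fixup marks
 * all 512 target pages of that host page as unsent and dirty again,
 * sending a discard for the host page if part of it had been sent.
 */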
2230 * postcopy_chunk_hostpages: discard any partially sent host page
2232 * Utility for the outgoing postcopy code.
2234 * Discard any partially sent host-page size chunks, mark any partially
2235 * dirty host-page size chunks as all dirty. In this case the host-page
2236 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2238 * Returns zero on success
2240 * @ms: current migration state
2241 * @block: block we want to work with
2243 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2245 PostcopyDiscardState *pds =
2246 postcopy_discard_send_init(ms, block->idstr);
2248 /* First pass: Discard all partially sent host pages */
2249 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2251 * Second pass: Ensure that all partially dirty host pages are made
2252 * fully dirty.
2254 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2256 postcopy_discard_send_finish(ms, pds);
2257 return 0;
2261 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2263 * Returns zero on success
2265 * Transmit the set of pages to be discarded after precopy to the target;
2266 * these are pages that:
2267 * a) have previously been transmitted but are now dirty again, or
2268 * b) have never been transmitted; this ensures that any pages on the
2269 * destination that have been mapped by background tasks get discarded
2270 * (transparent huge pages are the specific concern).
2271 * Hopefully this set is pretty sparse.
2273 * @ms: current migration state
2275 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2277 RAMState *rs = ram_state;
2278 RAMBlock *block;
2279 int ret;
2281 rcu_read_lock();
2283 /* This should be our last sync, the src is now paused */
2284 migration_bitmap_sync(rs);
2286 /* Easiest way to make sure we don't resume in the middle of a host-page */
2287 rs->last_seen_block = NULL;
2288 rs->last_sent_block = NULL;
2289 rs->last_page = 0;
2291 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2292 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2293 unsigned long *bitmap = block->bmap;
2294 unsigned long *unsentmap = block->unsentmap;
2296 if (!unsentmap) {
2297 /* We don't have a safe way to resize the unsentmap, so
2298 * if the RAM block was resized it will be NULL at this
2299 * point.
2301 error_report("migration ram resized during precopy phase");
2302 rcu_read_unlock();
2303 return -EINVAL;
2305 /* Deal with TPS != HPS and huge pages */
2306 ret = postcopy_chunk_hostpages(ms, block);
2307 if (ret) {
2308 rcu_read_unlock();
2309 return ret;
2313 * Update the unsentmap to be unsentmap = unsentmap | dirty
2315 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2316 #ifdef DEBUG_POSTCOPY
2317 ram_debug_dump_bitmap(unsentmap, true, pages);
2318 #endif
2320 trace_ram_postcopy_send_discard_bitmap();
2322 ret = postcopy_each_ram_send_discard(ms);
2323 rcu_read_unlock();
2325 return ret;
2329 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2331 * Returns zero on success
2333 * @rbname: name of the RAMBlock of the request. NULL means the
2334 * same as the last one.
2335 * @start: starting offset within the RAMBlock, in bytes
2336 * @length: length of the range to discard, in bytes
2338 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2340 int ret = -1;
2342 trace_ram_discard_range(rbname, start, length);
2344 rcu_read_lock();
2345 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2347 if (!rb) {
2348 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2349 goto err;
2352 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2353 length >> qemu_target_page_bits());
2354 ret = ram_block_discard_range(rb, start, length);
2356 err:
2357 rcu_read_unlock();
2359 return ret;
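/*
 * Usage sketch (hypothetical values): discarding the first 2MB of a
 * block named "pc.ram" would look like
 *
 *     ram_discard_range("pc.ram", 0, 2 * 1024 * 1024);
 *
 * which clears the corresponding bits in rb->receivedmap and asks the
 * kernel to drop the backing pages via ram_block_discard_range().
 */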
2363 * For every allocation, we will try not to crash the VM if the
2364 * allocation fails.
2366 static int xbzrle_init(void)
2368 Error *local_err = NULL;
2370 if (!migrate_use_xbzrle()) {
2371 return 0;
2374 XBZRLE_cache_lock();
2376 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2377 if (!XBZRLE.zero_target_page) {
2378 error_report("%s: Error allocating zero page", __func__);
2379 goto err_out;
2382 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2383 TARGET_PAGE_SIZE, &local_err);
2384 if (!XBZRLE.cache) {
2385 error_report_err(local_err);
2386 goto free_zero_page;
2389 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2390 if (!XBZRLE.encoded_buf) {
2391 error_report("%s: Error allocating encoded_buf", __func__);
2392 goto free_cache;
2395 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2396 if (!XBZRLE.current_buf) {
2397 error_report("%s: Error allocating current_buf", __func__);
2398 goto free_encoded_buf;
2401 /* We are all good */
2402 XBZRLE_cache_unlock();
2403 return 0;
2405 free_encoded_buf:
2406 g_free(XBZRLE.encoded_buf);
2407 XBZRLE.encoded_buf = NULL;
2408 free_cache:
2409 cache_fini(XBZRLE.cache);
2410 XBZRLE.cache = NULL;
2411 free_zero_page:
2412 g_free(XBZRLE.zero_target_page);
2413 XBZRLE.zero_target_page = NULL;
2414 err_out:
2415 XBZRLE_cache_unlock();
2416 return -ENOMEM;
2419 static int ram_state_init(RAMState **rsp)
2421 *rsp = g_try_new0(RAMState, 1);
2423 if (!*rsp) {
2424 error_report("%s: Init ramstate fail", __func__);
2425 return -1;
2428 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2429 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2430 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2433 * Count the total number of pages used by ram blocks not including any
2434 * gaps due to alignment or unplugs.
2436 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2438 ram_state_reset(*rsp);
2440 return 0;
2443 static void ram_list_init_bitmaps(void)
2445 RAMBlock *block;
2446 unsigned long pages;
2448 /* Skip setting bitmap if there is no RAM */
2449 if (ram_bytes_total()) {
2450 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2451 pages = block->max_length >> TARGET_PAGE_BITS;
2452 block->bmap = bitmap_new(pages);
2453 bitmap_set(block->bmap, 0, pages);
2454 if (migrate_postcopy_ram()) {
2455 block->unsentmap = bitmap_new(pages);
2456 bitmap_set(block->unsentmap, 0, pages);
2462 static void ram_init_bitmaps(RAMState *rs)
2464 /* For memory_global_dirty_log_start below. */
2465 qemu_mutex_lock_iothread();
2466 qemu_mutex_lock_ramlist();
2467 rcu_read_lock();
2469 ram_list_init_bitmaps();
2470 memory_global_dirty_log_start();
2471 migration_bitmap_sync(rs);
2473 rcu_read_unlock();
2474 qemu_mutex_unlock_ramlist();
2475 qemu_mutex_unlock_iothread();
2478 static int ram_init_all(RAMState **rsp)
2480 if (ram_state_init(rsp)) {
2481 return -1;
2484 if (xbzrle_init()) {
2485 ram_state_cleanup(rsp);
2486 return -1;
2489 ram_init_bitmaps(*rsp);
2491 return 0;
2494 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2496 RAMBlock *block;
2497 uint64_t pages = 0;
2500 * Postcopy is not using xbzrle/compression, so no need for that.
2501 * Also, since the source is already halted, we don't need to care
2502 * about dirty page logging either.
2505 RAMBLOCK_FOREACH(block) {
2506 pages += bitmap_count_one(block->bmap,
2507 block->used_length >> TARGET_PAGE_BITS);
2510 /* This may not be aligned with current bitmaps. Recalculate. */
2511 rs->migration_dirty_pages = pages;
2513 rs->last_seen_block = NULL;
2514 rs->last_sent_block = NULL;
2515 rs->last_page = 0;
2516 rs->last_version = ram_list.version;
2518 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2519 * matter what we have sent.
2521 rs->ram_bulk_stage = false;
2523 /* Update RAMState cache of output QEMUFile */
2524 rs->f = out;
2526 trace_ram_state_resume_prepare(pages);
2530 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2531 * a long-running RCU critical section. When RCU reclaims in the code
2532 * start to become numerous it will be necessary to reduce the
2533 * granularity of these critical sections.
2537 * ram_save_setup: Setup RAM for migration
2539 * Returns zero to indicate success and negative for error
2541 * @f: QEMUFile where to send the data
2542 * @opaque: RAMState pointer
2544 static int ram_save_setup(QEMUFile *f, void *opaque)
2546 RAMState **rsp = opaque;
2547 RAMBlock *block;
2549 if (compress_threads_save_setup()) {
2550 return -1;
2553 /* migration has already setup the bitmap, reuse it. */
2554 if (!migration_in_colo_state()) {
2555 if (ram_init_all(rsp) != 0) {
2556 compress_threads_save_cleanup();
2557 return -1;
2560 (*rsp)->f = f;
2562 rcu_read_lock();
2564 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2566 RAMBLOCK_FOREACH(block) {
2567 qemu_put_byte(f, strlen(block->idstr));
2568 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2569 qemu_put_be64(f, block->used_length);
2570 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2571 qemu_put_be64(f, block->page_size);
2575 rcu_read_unlock();
2577 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2578 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2580 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2582 return 0;
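/*
 * Roughly, the setup section emitted above looks like this on the wire
 * (all integers big-endian):
 *
 *     be64: total ram size | RAM_SAVE_FLAG_MEM_SIZE
 *     for each RAMBlock:
 *         byte:  strlen(idstr)
 *         bytes: idstr (not NUL terminated)
 *         be64:  used_length
 *         be64:  page_size (only if postcopy is enabled and it differs
 *                from the host page size)
 *     be64: RAM_SAVE_FLAG_EOS
 */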
2586 * ram_save_iterate: iterative stage for migration
2588 * Returns zero to indicate success and negative for error
2590 * @f: QEMUFile where to send the data
2591 * @opaque: RAMState pointer
2593 static int ram_save_iterate(QEMUFile *f, void *opaque)
2595 RAMState **temp = opaque;
2596 RAMState *rs = *temp;
2597 int ret;
2598 int i;
2599 int64_t t0;
2600 int done = 0;
2602 if (blk_mig_bulk_active()) {
2603 /* Avoid transferring ram during bulk phase of block migration as
2604 * the bulk phase will usually take a long time and transferring
2605 * ram updates during that time is pointless. */
2606 goto out;
2609 rcu_read_lock();
2610 if (ram_list.version != rs->last_version) {
2611 ram_state_reset(rs);
2614 /* Read version before ram_list.blocks */
2615 smp_rmb();
2617 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2619 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2620 i = 0;
2621 while ((ret = qemu_file_rate_limit(f)) == 0) {
2622 int pages;
2624 pages = ram_find_and_save_block(rs, false);
2625 /* no more pages to send */
2626 if (pages == 0) {
2627 done = 1;
2628 break;
2630 rs->iterations++;
2632 /* we want to check in the 1st loop, just in case it was the 1st time
2633 and we had to sync the dirty bitmap.
2634 qemu_clock_get_ns() is a bit expensive, so we only check once every
2635 few iterations
2637 if ((i & 63) == 0) {
2638 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2639 if (t1 > MAX_WAIT) {
2640 trace_ram_save_iterate_big_wait(t1, i);
2641 break;
2644 i++;
2646 flush_compressed_data(rs);
2647 rcu_read_unlock();
2650 * Must occur before EOS (or any QEMUFile operation)
2651 * because of RDMA protocol.
2653 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2655 out:
2656 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2657 ram_counters.transferred += 8;
2659 ret = qemu_file_get_error(f);
2660 if (ret < 0) {
2661 return ret;
2664 return done;
2668 * ram_save_complete: function called to send the remaining amount of ram
2670 * Returns zero to indicate success
2672 * Called with iothread lock
2674 * @f: QEMUFile where to send the data
2675 * @opaque: RAMState pointer
2677 static int ram_save_complete(QEMUFile *f, void *opaque)
2679 RAMState **temp = opaque;
2680 RAMState *rs = *temp;
2682 rcu_read_lock();
2684 if (!migration_in_postcopy()) {
2685 migration_bitmap_sync(rs);
2688 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2690 /* try transferring iterative blocks of memory */
2692 /* flush all remaining blocks regardless of rate limiting */
2693 while (true) {
2694 int pages;
2696 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2697 /* no more blocks to send */
2698 if (pages == 0) {
2699 break;
2703 flush_compressed_data(rs);
2704 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2706 rcu_read_unlock();
2708 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2710 return 0;
2713 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2714 uint64_t *res_precopy_only,
2715 uint64_t *res_compatible,
2716 uint64_t *res_postcopy_only)
2718 RAMState **temp = opaque;
2719 RAMState *rs = *temp;
2720 uint64_t remaining_size;
2722 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2724 if (!migration_in_postcopy() &&
2725 remaining_size < max_size) {
2726 qemu_mutex_lock_iothread();
2727 rcu_read_lock();
2728 migration_bitmap_sync(rs);
2729 rcu_read_unlock();
2730 qemu_mutex_unlock_iothread();
2731 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2734 if (migrate_postcopy_ram()) {
2735 /* We can do postcopy, and all the data is postcopiable */
2736 *res_compatible += remaining_size;
2737 } else {
2738 *res_precopy_only += remaining_size;
2742 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2744 unsigned int xh_len;
2745 int xh_flags;
2746 uint8_t *loaded_data;
2748 /* extract RLE header */
2749 xh_flags = qemu_get_byte(f);
2750 xh_len = qemu_get_be16(f);
2752 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2753 error_report("Failed to load XBZRLE page - wrong compression!");
2754 return -1;
2757 if (xh_len > TARGET_PAGE_SIZE) {
2758 error_report("Failed to load XBZRLE page - len overflow!");
2759 return -1;
2761 loaded_data = XBZRLE.decoded_buf;
2762 /* load data and decode */
2763 /* it can change loaded_data to point to an internal buffer */
2764 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2766 /* decode RLE */
2767 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2768 TARGET_PAGE_SIZE) == -1) {
2769 error_report("Failed to load XBZRLE page - decode error!");
2770 return -1;
2773 return 0;
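/*
 * For reference, the XBZRLE payload parsed above is laid out as:
 *
 *     byte:  ENCODING_FLAG_XBZRLE
 *     be16:  encoded length (<= TARGET_PAGE_SIZE)
 *     bytes: encoded data, decoded against the current page contents
 */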
2777 * ram_block_from_stream: read a RAMBlock id from the migration stream
2779 * Must be called from within an RCU critical section.
2781 * Returns a pointer from within the RCU-protected ram_list.
2783 * @f: QEMUFile where to read the data from
2784 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2786 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2788 static RAMBlock *block = NULL;
2789 char id[256];
2790 uint8_t len;
2792 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2793 if (!block) {
2794 error_report("Ack, bad migration stream!");
2795 return NULL;
2797 return block;
2800 len = qemu_get_byte(f);
2801 qemu_get_buffer(f, (uint8_t *)id, len);
2802 id[len] = 0;
2804 block = qemu_ram_block_by_name(id);
2805 if (!block) {
2806 error_report("Can't find block %s", id);
2807 return NULL;
2810 return block;
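/*
 * In other words: a page header without RAM_SAVE_FLAG_CONTINUE
 * re-identifies the block as a one-byte length followed by that many
 * idstr bytes; with RAM_SAVE_FLAG_CONTINUE set, the previously cached
 * block is reused.
 */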
2813 static inline void *host_from_ram_block_offset(RAMBlock *block,
2814 ram_addr_t offset)
2816 if (!offset_in_ramblock(block, offset)) {
2817 return NULL;
2820 return block->host + offset;
2824 * ram_handle_compressed: handle the zero page case
2826 * If a page (or a whole RDMA chunk) has been
2827 * determined to be zero, then zap it.
2829 * @host: host address for the zero page
2830 * @ch: the byte the page is filled with; we only support zero
2831 * @size: size of the zero page
2833 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2835 if (ch != 0 || !is_zero_range(host, size)) {
2836 memset(host, ch, size);
2840 /* return the size after decompression, or a negative value on error */
2841 static int
2842 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2843 const uint8_t *source, size_t source_len)
2845 int err;
2847 err = inflateReset(stream);
2848 if (err != Z_OK) {
2849 return -1;
2852 stream->avail_in = source_len;
2853 stream->next_in = (uint8_t *)source;
2854 stream->avail_out = dest_len;
2855 stream->next_out = dest;
2857 err = inflate(stream, Z_NO_FLUSH);
2858 if (err != Z_STREAM_END) {
2859 return -1;
2862 return stream->total_out;
2865 static void *do_data_decompress(void *opaque)
2867 DecompressParam *param = opaque;
2868 unsigned long pagesize;
2869 uint8_t *des;
2870 int len, ret;
2872 qemu_mutex_lock(&param->mutex);
2873 while (!param->quit) {
2874 if (param->des) {
2875 des = param->des;
2876 len = param->len;
2877 param->des = 0;
2878 qemu_mutex_unlock(&param->mutex);
2880 pagesize = TARGET_PAGE_SIZE;
2882 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2883 param->compbuf, len);
2884 if (ret < 0) {
2885 error_report("decompress data failed");
2886 qemu_file_set_error(decomp_file, ret);
2889 qemu_mutex_lock(&decomp_done_lock);
2890 param->done = true;
2891 qemu_cond_signal(&decomp_done_cond);
2892 qemu_mutex_unlock(&decomp_done_lock);
2894 qemu_mutex_lock(&param->mutex);
2895 } else {
2896 qemu_cond_wait(&param->cond, &param->mutex);
2899 qemu_mutex_unlock(&param->mutex);
2901 return NULL;
2904 static int wait_for_decompress_done(void)
2906 int idx, thread_count;
2908 if (!migrate_use_compression()) {
2909 return 0;
2912 thread_count = migrate_decompress_threads();
2913 qemu_mutex_lock(&decomp_done_lock);
2914 for (idx = 0; idx < thread_count; idx++) {
2915 while (!decomp_param[idx].done) {
2916 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2919 qemu_mutex_unlock(&decomp_done_lock);
2920 return qemu_file_get_error(decomp_file);
2923 static void compress_threads_load_cleanup(void)
2925 int i, thread_count;
2927 if (!migrate_use_compression()) {
2928 return;
2930 thread_count = migrate_decompress_threads();
2931 for (i = 0; i < thread_count; i++) {
2933 * we use it as an indicator of whether the thread was
2934 * properly initialized or not
2936 if (!decomp_param[i].compbuf) {
2937 break;
2940 qemu_mutex_lock(&decomp_param[i].mutex);
2941 decomp_param[i].quit = true;
2942 qemu_cond_signal(&decomp_param[i].cond);
2943 qemu_mutex_unlock(&decomp_param[i].mutex);
2945 for (i = 0; i < thread_count; i++) {
2946 if (!decomp_param[i].compbuf) {
2947 break;
2950 qemu_thread_join(decompress_threads + i);
2951 qemu_mutex_destroy(&decomp_param[i].mutex);
2952 qemu_cond_destroy(&decomp_param[i].cond);
2953 inflateEnd(&decomp_param[i].stream);
2954 g_free(decomp_param[i].compbuf);
2955 decomp_param[i].compbuf = NULL;
2957 g_free(decompress_threads);
2958 g_free(decomp_param);
2959 decompress_threads = NULL;
2960 decomp_param = NULL;
2961 decomp_file = NULL;
2964 static int compress_threads_load_setup(QEMUFile *f)
2966 int i, thread_count;
2968 if (!migrate_use_compression()) {
2969 return 0;
2972 thread_count = migrate_decompress_threads();
2973 decompress_threads = g_new0(QemuThread, thread_count);
2974 decomp_param = g_new0(DecompressParam, thread_count);
2975 qemu_mutex_init(&decomp_done_lock);
2976 qemu_cond_init(&decomp_done_cond);
2977 decomp_file = f;
2978 for (i = 0; i < thread_count; i++) {
2979 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2980 goto exit;
2983 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2984 qemu_mutex_init(&decomp_param[i].mutex);
2985 qemu_cond_init(&decomp_param[i].cond);
2986 decomp_param[i].done = true;
2987 decomp_param[i].quit = false;
2988 qemu_thread_create(decompress_threads + i, "decompress",
2989 do_data_decompress, decomp_param + i,
2990 QEMU_THREAD_JOINABLE);
2992 return 0;
2993 exit:
2994 compress_threads_load_cleanup();
2995 return -1;
2998 static void decompress_data_with_multi_threads(QEMUFile *f,
2999 void *host, int len)
3001 int idx, thread_count;
3003 thread_count = migrate_decompress_threads();
3004 qemu_mutex_lock(&decomp_done_lock);
3005 while (true) {
3006 for (idx = 0; idx < thread_count; idx++) {
3007 if (decomp_param[idx].done) {
3008 decomp_param[idx].done = false;
3009 qemu_mutex_lock(&decomp_param[idx].mutex);
3010 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3011 decomp_param[idx].des = host;
3012 decomp_param[idx].len = len;
3013 qemu_cond_signal(&decomp_param[idx].cond);
3014 qemu_mutex_unlock(&decomp_param[idx].mutex);
3015 break;
3018 if (idx < thread_count) {
3019 break;
3020 } else {
3021 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3024 qemu_mutex_unlock(&decomp_done_lock);
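/*
 * The handshake with do_data_decompress() above is a simple
 * producer/consumer scheme: this function picks an idle worker
 * (done == true), clears done, fills compbuf/des/len under the param
 * mutex and signals param->cond; the worker decompresses the page and
 * then sets done and signals decomp_done_cond under decomp_done_lock,
 * which is what this function and wait_for_decompress_done() wait on.
 */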
3028 * ram_load_setup: Setup RAM for migration incoming side
3030 * Returns zero to indicate success and negative for error
3032 * @f: QEMUFile where to receive the data
3033 * @opaque: RAMState pointer
3035 static int ram_load_setup(QEMUFile *f, void *opaque)
3037 if (compress_threads_load_setup(f)) {
3038 return -1;
3041 xbzrle_load_setup();
3042 ramblock_recv_map_init();
3043 return 0;
3046 static int ram_load_cleanup(void *opaque)
3048 RAMBlock *rb;
3049 xbzrle_load_cleanup();
3050 compress_threads_load_cleanup();
3052 RAMBLOCK_FOREACH(rb) {
3053 g_free(rb->receivedmap);
3054 rb->receivedmap = NULL;
3056 return 0;
3060 * ram_postcopy_incoming_init: allocate postcopy data structures
3062 * Returns 0 for success and negative if there was an error
3064 * @mis: current migration incoming state
3066 * Allocate data structures etc needed by incoming migration with
3067 * postcopy-ram. postcopy-ram's similarly named
3068 * postcopy_ram_incoming_init does the work.
3070 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3072 unsigned long ram_pages = last_ram_page();
3074 return postcopy_ram_incoming_init(mis, ram_pages);
3078 * ram_load_postcopy: load a page in postcopy case
3080 * Returns 0 for success or -errno in case of error
3082 * Called in postcopy mode by ram_load().
3083 * rcu_read_lock is taken prior to this being called.
3085 * @f: QEMUFile where to receive the data
3087 static int ram_load_postcopy(QEMUFile *f)
3089 int flags = 0, ret = 0;
3090 bool place_needed = false;
3091 bool matching_page_sizes = false;
3092 MigrationIncomingState *mis = migration_incoming_get_current();
3093 /* Temporary page that is later 'placed' */
3094 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3095 void *last_host = NULL;
3096 bool all_zero = false;
3098 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3099 ram_addr_t addr;
3100 void *host = NULL;
3101 void *page_buffer = NULL;
3102 void *place_source = NULL;
3103 RAMBlock *block = NULL;
3104 uint8_t ch;
3106 addr = qemu_get_be64(f);
3109 * If the QEMUFile hit an error, we should stop here; "addr"
3110 * may be invalid in that case
3112 ret = qemu_file_get_error(f);
3113 if (ret) {
3114 break;
3117 flags = addr & ~TARGET_PAGE_MASK;
3118 addr &= TARGET_PAGE_MASK;
3120 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3121 place_needed = false;
3122 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3123 block = ram_block_from_stream(f, flags);
3125 host = host_from_ram_block_offset(block, addr);
3126 if (!host) {
3127 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3128 ret = -EINVAL;
3129 break;
3131 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3133 * Postcopy requires that we place whole host pages atomically;
3134 * these may be huge pages for RAMBlocks that are backed by
3135 * hugetlbfs.
3136 * To make it atomic, the data is read into a temporary page
3137 * that's moved into place later.
3138 * The migration protocol uses, possibly smaller, target-pages
3139 * however the source ensures it always sends all the components
3140 * of a host page in order.
3142 page_buffer = postcopy_host_page +
3143 ((uintptr_t)host & (block->page_size - 1));
3144 /* If all TP are zero then we can optimise the place */
3145 if (!((uintptr_t)host & (block->page_size - 1))) {
3146 all_zero = true;
3147 } else {
3148 /* not the 1st TP within the HP */
3149 if (host != (last_host + TARGET_PAGE_SIZE)) {
3150 error_report("Non-sequential target page %p/%p",
3151 host, last_host);
3152 ret = -EINVAL;
3153 break;
3159 * If it's the last part of a host page then we place the host
3160 * page
3162 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3163 (block->page_size - 1)) == 0;
3164 place_source = postcopy_host_page;
3166 last_host = host;
3168 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3169 case RAM_SAVE_FLAG_ZERO:
3170 ch = qemu_get_byte(f);
3171 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3172 if (ch) {
3173 all_zero = false;
3175 break;
3177 case RAM_SAVE_FLAG_PAGE:
3178 all_zero = false;
3179 if (!place_needed || !matching_page_sizes) {
3180 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3181 } else {
3182 /* Avoid the extra copy into the temporary page: postcopy is
3183 * going to copy the data into place later anyway; this only
3184 * works when the host page is read in one go (matching page sizes)
3186 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3187 TARGET_PAGE_SIZE);
3189 break;
3190 case RAM_SAVE_FLAG_EOS:
3191 /* normal exit */
3192 break;
3193 default:
3194 error_report("Unknown combination of migration flags: %#x"
3195 " (postcopy mode)", flags);
3196 ret = -EINVAL;
3197 break;
3200 /* Detect for any possible file errors */
3201 if (!ret && qemu_file_get_error(f)) {
3202 ret = qemu_file_get_error(f);
3205 if (!ret && place_needed) {
3206 /* This gets called at the last target page in the host page */
3207 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3209 if (all_zero) {
3210 ret = postcopy_place_page_zero(mis, place_dest,
3211 block);
3212 } else {
3213 ret = postcopy_place_page(mis, place_dest,
3214 place_source, block);
3219 return ret;
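/*
 * Illustrative example (assuming a 2MB hugetlbfs-backed RAMBlock and
 * 4KB target pages): the source sends the 512 target pages of one host
 * page back to back; each is copied into postcopy_host_page at its
 * offset, and only when the last one arrives is place_needed set and
 * the whole 2MB page atomically placed with postcopy_place_page() (or
 * postcopy_place_page_zero() if every target page of it was zero).
 */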
3222 static bool postcopy_is_advised(void)
3224 PostcopyState ps = postcopy_state_get();
3225 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3228 static bool postcopy_is_running(void)
3230 PostcopyState ps = postcopy_state_get();
3231 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3234 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3236 int flags = 0, ret = 0, invalid_flags = 0;
3237 static uint64_t seq_iter;
3238 int len = 0;
3240 * If system is running in postcopy mode, page inserts to host memory must
3241 * be atomic
3243 bool postcopy_running = postcopy_is_running();
3244 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3245 bool postcopy_advised = postcopy_is_advised();
3247 seq_iter++;
3249 if (version_id != 4) {
3250 ret = -EINVAL;
3253 if (!migrate_use_compression()) {
3254 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3256 /* This RCU critical section can be very long running.
3257 * When RCU reclaims in the code start to become numerous,
3258 * it will be necessary to reduce the granularity of this
3259 * critical section.
3261 rcu_read_lock();
3263 if (postcopy_running) {
3264 ret = ram_load_postcopy(f);
3267 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3268 ram_addr_t addr, total_ram_bytes;
3269 void *host = NULL;
3270 uint8_t ch;
3272 addr = qemu_get_be64(f);
3273 flags = addr & ~TARGET_PAGE_MASK;
3274 addr &= TARGET_PAGE_MASK;
3276 if (flags & invalid_flags) {
3277 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3278 error_report("Received an unexpected compressed page");
3281 ret = -EINVAL;
3282 break;
3285 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3286 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3287 RAMBlock *block = ram_block_from_stream(f, flags);
3289 host = host_from_ram_block_offset(block, addr);
3290 if (!host) {
3291 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3292 ret = -EINVAL;
3293 break;
3295 ramblock_recv_bitmap_set(block, host);
3296 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3299 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3300 case RAM_SAVE_FLAG_MEM_SIZE:
3301 /* Synchronize RAM block list */
3302 total_ram_bytes = addr;
3303 while (!ret && total_ram_bytes) {
3304 RAMBlock *block;
3305 char id[256];
3306 ram_addr_t length;
3308 len = qemu_get_byte(f);
3309 qemu_get_buffer(f, (uint8_t *)id, len);
3310 id[len] = 0;
3311 length = qemu_get_be64(f);
3313 block = qemu_ram_block_by_name(id);
3314 if (block) {
3315 if (length != block->used_length) {
3316 Error *local_err = NULL;
3318 ret = qemu_ram_resize(block, length,
3319 &local_err);
3320 if (local_err) {
3321 error_report_err(local_err);
3324 /* For postcopy we need to check hugepage sizes match */
3325 if (postcopy_advised &&
3326 block->page_size != qemu_host_page_size) {
3327 uint64_t remote_page_size = qemu_get_be64(f);
3328 if (remote_page_size != block->page_size) {
3329 error_report("Mismatched RAM page size %s "
3330 "(local) %zd != %" PRId64,
3331 id, block->page_size,
3332 remote_page_size);
3333 ret = -EINVAL;
3336 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3337 block->idstr);
3338 } else {
3339 error_report("Unknown ramblock \"%s\", cannot "
3340 "accept migration", id);
3341 ret = -EINVAL;
3344 total_ram_bytes -= length;
3346 break;
3348 case RAM_SAVE_FLAG_ZERO:
3349 ch = qemu_get_byte(f);
3350 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3351 break;
3353 case RAM_SAVE_FLAG_PAGE:
3354 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3355 break;
3357 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3358 len = qemu_get_be32(f);
3359 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3360 error_report("Invalid compressed data length: %d", len);
3361 ret = -EINVAL;
3362 break;
3364 decompress_data_with_multi_threads(f, host, len);
3365 break;
3367 case RAM_SAVE_FLAG_XBZRLE:
3368 if (load_xbzrle(f, addr, host) < 0) {
3369 error_report("Failed to decompress XBZRLE page at "
3370 RAM_ADDR_FMT, addr);
3371 ret = -EINVAL;
3372 break;
3374 break;
3375 case RAM_SAVE_FLAG_EOS:
3376 /* normal exit */
3377 break;
3378 default:
3379 if (flags & RAM_SAVE_FLAG_HOOK) {
3380 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3381 } else {
3382 error_report("Unknown combination of migration flags: %#x",
3383 flags);
3384 ret = -EINVAL;
3387 if (!ret) {
3388 ret = qemu_file_get_error(f);
3392 ret |= wait_for_decompress_done();
3393 rcu_read_unlock();
3394 trace_ram_load_complete(ret, seq_iter);
3395 return ret;
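/*
 * Sketch of the precopy stream parsed above: each record starts with a
 * be64 whose sub-page bits are RAM_SAVE_FLAG_* and whose upper bits are
 * the page address (or, for MEM_SIZE, the total RAM size), followed by
 * a flag-dependent payload:
 *
 *     MEM_SIZE:       per-block idstr/length list (see ram_save_setup)
 *     ZERO:           one fill byte (in practice zero)
 *     PAGE:           TARGET_PAGE_SIZE raw bytes
 *     COMPRESS_PAGE:  be32 length + zlib data
 *     XBZRLE:         see load_xbzrle()
 *     EOS:            end of section
 */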
3398 static bool ram_has_postcopy(void *opaque)
3400 return migrate_postcopy_ram();
3403 /* Sync all the dirty bitmaps with the destination VM. */
3404 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3406 RAMBlock *block;
3407 QEMUFile *file = s->to_dst_file;
3408 int ramblock_count = 0;
3410 trace_ram_dirty_bitmap_sync_start();
3412 RAMBLOCK_FOREACH(block) {
3413 qemu_savevm_send_recv_bitmap(file, block->idstr);
3414 trace_ram_dirty_bitmap_request(block->idstr);
3415 ramblock_count++;
3418 trace_ram_dirty_bitmap_sync_wait();
3420 /* Wait until all the ramblocks' dirty bitmaps are synced */
3421 while (ramblock_count--) {
3422 qemu_sem_wait(&s->rp_state.rp_sem);
3425 trace_ram_dirty_bitmap_sync_complete();
3427 return 0;
3430 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3432 qemu_sem_post(&s->rp_state.rp_sem);
3436 * Read the received bitmap, revert it as the initial dirty bitmap.
3437 * This is only used when the postcopy migration is paused but wants
3438 * to resume from a middle point.
3440 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3442 int ret = -EINVAL;
3443 QEMUFile *file = s->rp_state.from_dst_file;
3444 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3445 uint64_t local_size = nbits / 8;
3446 uint64_t size, end_mark;
3448 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3450 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3451 error_report("%s: incorrect state %s", __func__,
3452 MigrationStatus_str(s->state));
3453 return -EINVAL;
3457 * Note: see comments in ramblock_recv_bitmap_send() on why we
3458 * need the endianness conversion, and the padding.
3460 local_size = ROUND_UP(local_size, 8);
3462 /* Add paddings */
3463 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3465 size = qemu_get_be64(file);
3467 /* The size of the bitmap should match that of our ramblock */
3468 if (size != local_size) {
3469 error_report("%s: ramblock '%s' bitmap size mismatch "
3470 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3471 block->idstr, size, local_size);
3472 ret = -EINVAL;
3473 goto out;
3476 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3477 end_mark = qemu_get_be64(file);
3479 ret = qemu_file_get_error(file);
3480 if (ret || size != local_size) {
3481 error_report("%s: read bitmap failed for ramblock '%s': %d"
3482 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3483 __func__, block->idstr, ret, local_size, size);
3484 ret = -EIO;
3485 goto out;
3488 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3489 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3490 __func__, block->idstr, end_mark);
3491 ret = -EINVAL;
3492 goto out;
3496 * Endianness conversion. We are in postcopy (though paused).
3497 * The dirty bitmap won't change, so we can modify it directly.
3499 bitmap_from_le(block->bmap, le_bitmap, nbits);
3502 * What we received is "received bitmap". Revert it as the initial
3503 * dirty bitmap for this ramblock.
3505 bitmap_complement(block->bmap, block->bmap, nbits);
3507 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3510 * We successfully synced the bitmap for the current ramblock. If
3511 * this is the last one to sync, we need to notify the main send thread.
3513 ram_dirty_bitmap_reload_notify(s);
3515 ret = 0;
3516 out:
3517 g_free(le_bitmap);
3518 return ret;
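/*
 * For reference, the bitmap message consumed above is expected to be:
 *
 *     be64:  bitmap size in bytes (the block's bitmap size rounded up
 *            to a multiple of 8 bytes)
 *     bytes: the received bitmap, little-endian, padded
 *     be64:  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * as produced by ramblock_recv_bitmap_send() on the destination side.
 */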
3521 static int ram_resume_prepare(MigrationState *s, void *opaque)
3523 RAMState *rs = *(RAMState **)opaque;
3524 int ret;
3526 ret = ram_dirty_bitmap_sync_all(s, rs);
3527 if (ret) {
3528 return ret;
3531 ram_state_resume_prepare(rs, s->to_dst_file);
3533 return 0;
3536 static SaveVMHandlers savevm_ram_handlers = {
3537 .save_setup = ram_save_setup,
3538 .save_live_iterate = ram_save_iterate,
3539 .save_live_complete_postcopy = ram_save_complete,
3540 .save_live_complete_precopy = ram_save_complete,
3541 .has_postcopy = ram_has_postcopy,
3542 .save_live_pending = ram_save_pending,
3543 .load_state = ram_load,
3544 .save_cleanup = ram_save_cleanup,
3545 .load_setup = ram_load_setup,
3546 .load_cleanup = ram_load_cleanup,
3547 .resume_prepare = ram_resume_prepare,
3550 void ram_mig_init(void)
3552 qemu_mutex_init(&XBZRLE.lock);
3553 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);