1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "migration/block.h"
55 /***********************************************************/
56 /* ram save/restore */
58 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
59 * worked for pages that were filled with the same char. We switched
60 * it to only search for the zero value, and renamed it to avoid
61 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
64 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
65 #define RAM_SAVE_FLAG_ZERO 0x02
66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
67 #define RAM_SAVE_FLAG_PAGE 0x08
68 #define RAM_SAVE_FLAG_EOS 0x10
69 #define RAM_SAVE_FLAG_CONTINUE 0x20
70 #define RAM_SAVE_FLAG_XBZRLE 0x40
71 /* 0x80 is reserved in migration.h start with 0x100 next */
72 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
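/*
 * All of these flags are ORed into the low bits of the page offset before
 * the combined value is written to the stream (see save_page_header()
 * below); page offsets are TARGET_PAGE_SIZE aligned, so the low bits are
 * free to carry them.  A minimal sketch of splitting such a header word
 * back into its parts (example_split_header() is a hypothetical helper,
 * shown for illustration only and not used anywhere in this file):
 */
#if 0
static void example_split_header(uint64_t header, ram_addr_t *offset,
                                 unsigned int *flags)
{
    *flags = header & ~TARGET_PAGE_MASK;   /* low bits carry the flags   */
    *offset = header & TARGET_PAGE_MASK;   /* remaining bits: the offset */
}
#endif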
74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
76 return buffer_is_zero(p, size);
79 XBZRLECacheStats xbzrle_counters;
81 /* This struct contains the XBZRLE cache and a static page
82 used by the compression */
83 static struct {
84 /* buffer used for XBZRLE encoding */
85 uint8_t *encoded_buf;
86 /* buffer for storing page content */
87 uint8_t *current_buf;
88 /* Cache for XBZRLE, Protected by lock. */
89 PageCache *cache;
90 QemuMutex lock;
91 /* it will store a page full of zeros */
92 uint8_t *zero_target_page;
93 /* buffer used for XBZRLE decoding */
94 uint8_t *decoded_buf;
95 } XBZRLE;
97 static void XBZRLE_cache_lock(void)
99 if (migrate_use_xbzrle())
100 qemu_mutex_lock(&XBZRLE.lock);
103 static void XBZRLE_cache_unlock(void)
105 if (migrate_use_xbzrle())
106 qemu_mutex_unlock(&XBZRLE.lock);
110 * xbzrle_cache_resize: resize the xbzrle cache
112 * This function is called from qmp_migrate_set_cache_size in main
113 * thread, possibly while a migration is in progress. A running
114 * migration may be using the cache and might finish during this call,
115 * hence changes to the cache are protected by XBZRLE.lock.
117 * Returns 0 for success or -1 for error
119 * @new_size: new cache size
120 * @errp: set *errp if the check failed, with reason
122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
124 PageCache *new_cache;
125 int64_t ret = 0;
127 /* Check for truncation */
128 if (new_size != (size_t)new_size) {
129 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
130 "exceeding address space");
131 return -1;
134 if (new_size == migrate_xbzrle_cache_size()) {
135 /* nothing to do */
136 return 0;
139 XBZRLE_cache_lock();
141 if (XBZRLE.cache != NULL) {
142 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
143 if (!new_cache) {
144 ret = -1;
145 goto out;
148 cache_fini(XBZRLE.cache);
149 XBZRLE.cache = new_cache;
151 out:
152 XBZRLE_cache_unlock();
153 return ret;
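/*
 * A minimal usage sketch for the resize helper above (illustration only:
 * the example_set_cache_size() wrapper and the 256 MiB figure are made up;
 * the real caller is qmp_migrate_set_cache_size(), as noted above):
 */
#if 0
static void example_set_cache_size(void)
{
    Error *local_err = NULL;

    if (xbzrle_cache_resize(256 * 1024 * 1024, &local_err) < 0) {
        error_report_err(local_err);    /* reports and frees local_err */
    }
}
#endif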
156 static void ramblock_recv_map_init(void)
158 RAMBlock *rb;
160 RAMBLOCK_FOREACH(rb) {
161 assert(!rb->receivedmap);
162 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
168 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
169 rb->receivedmap);
172 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
174 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
177 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
179 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
182 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
183 size_t nr)
185 bitmap_set_atomic(rb->receivedmap,
186 ramblock_recv_bitmap_offset(host_addr, rb),
187 nr);
191 * An outstanding page request, on the source, having been received
192 * and queued
194 struct RAMSrcPageRequest {
195 RAMBlock *rb;
196 hwaddr offset;
197 hwaddr len;
199 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
202 /* State of RAM for migration */
203 struct RAMState {
204 /* QEMUFile used for this migration */
205 QEMUFile *f;
206 /* Last block that we have visited searching for dirty pages */
207 RAMBlock *last_seen_block;
208 /* Last block from where we have sent data */
209 RAMBlock *last_sent_block;
210 /* Last dirty target page we have sent */
211 ram_addr_t last_page;
212 /* last ram version we have seen */
213 uint32_t last_version;
214 /* We are in the first round */
215 bool ram_bulk_stage;
216 /* How many times we have dirty too many pages */
217 int dirty_rate_high_cnt;
218 /* these variables are used for bitmap sync */
219 /* last time we did a full bitmap_sync */
220 int64_t time_last_bitmap_sync;
221 /* bytes transferred at start_time */
222 uint64_t bytes_xfer_prev;
223 /* number of dirty pages since start_time */
224 uint64_t num_dirty_pages_period;
225 /* xbzrle misses since the beginning of the period */
226 uint64_t xbzrle_cache_miss_prev;
227 /* number of iterations at the beginning of period */
228 uint64_t iterations_prev;
229 /* Iterations since start */
230 uint64_t iterations;
231 /* number of dirty bits in the bitmap */
232 uint64_t migration_dirty_pages;
233 /* protects modification of the bitmap */
234 QemuMutex bitmap_mutex;
235 /* The RAMBlock used in the last src_page_requests */
236 RAMBlock *last_req_rb;
237 /* Queue of outstanding page requests from the destination */
238 QemuMutex src_page_req_mutex;
239 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
241 typedef struct RAMState RAMState;
243 static RAMState *ram_state;
245 uint64_t ram_bytes_remaining(void)
247 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
251 MigrationStats ram_counters;
253 /* used by the search for pages to send */
254 struct PageSearchStatus {
255 /* Current block being searched */
256 RAMBlock *block;
257 /* Current page to search from */
258 unsigned long page;
259 /* Set once we wrap around */
260 bool complete_round;
262 typedef struct PageSearchStatus PageSearchStatus;
264 struct CompressParam {
265 bool done;
266 bool quit;
267 QEMUFile *file;
268 QemuMutex mutex;
269 QemuCond cond;
270 RAMBlock *block;
271 ram_addr_t offset;
273 typedef struct CompressParam CompressParam;
275 struct DecompressParam {
276 bool done;
277 bool quit;
278 QemuMutex mutex;
279 QemuCond cond;
280 void *des;
281 uint8_t *compbuf;
282 int len;
284 typedef struct DecompressParam DecompressParam;
286 static CompressParam *comp_param;
287 static QemuThread *compress_threads;
288 /* comp_done_cond is used to wake up the migration thread when
289 * one of the compression threads has finished the compression.
290 * comp_done_lock is used together with comp_done_cond.
292 static QemuMutex comp_done_lock;
293 static QemuCond comp_done_cond;
294 /* The empty QEMUFileOps will be used by file in CompressParam */
295 static const QEMUFileOps empty_ops = { };
297 static DecompressParam *decomp_param;
298 static QemuThread *decompress_threads;
299 static QemuMutex decomp_done_lock;
300 static QemuCond decomp_done_cond;
302 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
303 ram_addr_t offset);
305 static void *do_data_compress(void *opaque)
307 CompressParam *param = opaque;
308 RAMBlock *block;
309 ram_addr_t offset;
311 qemu_mutex_lock(&param->mutex);
312 while (!param->quit) {
313 if (param->block) {
314 block = param->block;
315 offset = param->offset;
316 param->block = NULL;
317 qemu_mutex_unlock(&param->mutex);
319 do_compress_ram_page(param->file, block, offset);
321 qemu_mutex_lock(&comp_done_lock);
322 param->done = true;
323 qemu_cond_signal(&comp_done_cond);
324 qemu_mutex_unlock(&comp_done_lock);
326 qemu_mutex_lock(&param->mutex);
327 } else {
328 qemu_cond_wait(&param->cond, &param->mutex);
331 qemu_mutex_unlock(&param->mutex);
333 return NULL;
336 static inline void terminate_compression_threads(void)
338 int idx, thread_count;
340 thread_count = migrate_compress_threads();
342 for (idx = 0; idx < thread_count; idx++) {
343 qemu_mutex_lock(&comp_param[idx].mutex);
344 comp_param[idx].quit = true;
345 qemu_cond_signal(&comp_param[idx].cond);
346 qemu_mutex_unlock(&comp_param[idx].mutex);
350 static void compress_threads_save_cleanup(void)
352 int i, thread_count;
354 if (!migrate_use_compression()) {
355 return;
357 terminate_compression_threads();
358 thread_count = migrate_compress_threads();
359 for (i = 0; i < thread_count; i++) {
360 qemu_thread_join(compress_threads + i);
361 qemu_fclose(comp_param[i].file);
362 qemu_mutex_destroy(&comp_param[i].mutex);
363 qemu_cond_destroy(&comp_param[i].cond);
365 qemu_mutex_destroy(&comp_done_lock);
366 qemu_cond_destroy(&comp_done_cond);
367 g_free(compress_threads);
368 g_free(comp_param);
369 compress_threads = NULL;
370 comp_param = NULL;
373 static void compress_threads_save_setup(void)
375 int i, thread_count;
377 if (!migrate_use_compression()) {
378 return;
380 thread_count = migrate_compress_threads();
381 compress_threads = g_new0(QemuThread, thread_count);
382 comp_param = g_new0(CompressParam, thread_count);
383 qemu_cond_init(&comp_done_cond);
384 qemu_mutex_init(&comp_done_lock);
385 for (i = 0; i < thread_count; i++) {
386 /* comp_param[i].file is just used as a dummy buffer to save data,
387 * set its ops to empty.
389 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
390 comp_param[i].done = true;
391 comp_param[i].quit = false;
392 qemu_mutex_init(&comp_param[i].mutex);
393 qemu_cond_init(&comp_param[i].cond);
394 qemu_thread_create(compress_threads + i, "compress",
395 do_data_compress, comp_param + i,
396 QEMU_THREAD_JOINABLE);
400 /* Multiple fd's */
402 struct MultiFDSendParams {
403 uint8_t id;
404 char *name;
405 QemuThread thread;
406 QemuSemaphore sem;
407 QemuMutex mutex;
408 bool quit;
410 typedef struct MultiFDSendParams MultiFDSendParams;
412 struct {
413 MultiFDSendParams *params;
414 /* number of created threads */
415 int count;
416 } *multifd_send_state;
418 static void terminate_multifd_send_threads(Error *errp)
420 int i;
422 for (i = 0; i < multifd_send_state->count; i++) {
423 MultiFDSendParams *p = &multifd_send_state->params[i];
425 qemu_mutex_lock(&p->mutex);
426 p->quit = true;
427 qemu_sem_post(&p->sem);
428 qemu_mutex_unlock(&p->mutex);
432 int multifd_save_cleanup(Error **errp)
434 int i;
435 int ret = 0;
437 if (!migrate_use_multifd()) {
438 return 0;
440 terminate_multifd_send_threads(NULL);
441 for (i = 0; i < multifd_send_state->count; i++) {
442 MultiFDSendParams *p = &multifd_send_state->params[i];
444 qemu_thread_join(&p->thread);
445 qemu_mutex_destroy(&p->mutex);
446 qemu_sem_destroy(&p->sem);
447 g_free(p->name);
448 p->name = NULL;
450 g_free(multifd_send_state->params);
451 multifd_send_state->params = NULL;
452 g_free(multifd_send_state);
453 multifd_send_state = NULL;
454 return ret;
457 static void *multifd_send_thread(void *opaque)
459 MultiFDSendParams *p = opaque;
461 while (true) {
462 qemu_mutex_lock(&p->mutex);
463 if (p->quit) {
464 qemu_mutex_unlock(&p->mutex);
465 break;
467 qemu_mutex_unlock(&p->mutex);
468 qemu_sem_wait(&p->sem);
471 return NULL;
474 int multifd_save_setup(void)
476 int thread_count;
477 uint8_t i;
479 if (!migrate_use_multifd()) {
480 return 0;
482 thread_count = migrate_multifd_channels();
483 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
484 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
485 multifd_send_state->count = 0;
486 for (i = 0; i < thread_count; i++) {
487 MultiFDSendParams *p = &multifd_send_state->params[i];
489 qemu_mutex_init(&p->mutex);
490 qemu_sem_init(&p->sem, 0);
491 p->quit = false;
492 p->id = i;
493 p->name = g_strdup_printf("multifdsend_%d", i);
494 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
495 QEMU_THREAD_JOINABLE);
497 multifd_send_state->count++;
499 return 0;
502 struct MultiFDRecvParams {
503 uint8_t id;
504 char *name;
505 QemuThread thread;
506 QemuSemaphore sem;
507 QemuMutex mutex;
508 bool quit;
510 typedef struct MultiFDRecvParams MultiFDRecvParams;
512 struct {
513 MultiFDRecvParams *params;
514 /* number of created threads */
515 int count;
516 } *multifd_recv_state;
518 static void terminate_multifd_recv_threads(Error *errp)
520 int i;
522 for (i = 0; i < multifd_recv_state->count; i++) {
523 MultiFDRecvParams *p = &multifd_recv_state->params[i];
525 qemu_mutex_lock(&p->mutex);
526 p->quit = true;
527 qemu_sem_post(&p->sem);
528 qemu_mutex_unlock(&p->mutex);
532 int multifd_load_cleanup(Error **errp)
534 int i;
535 int ret = 0;
537 if (!migrate_use_multifd()) {
538 return 0;
540 terminate_multifd_recv_threads(NULL);
541 for (i = 0; i < multifd_recv_state->count; i++) {
542 MultiFDRecvParams *p = &multifd_recv_state->params[i];
544 qemu_thread_join(&p->thread);
545 qemu_mutex_destroy(&p->mutex);
546 qemu_sem_destroy(&p->sem);
547 g_free(p->name);
548 p->name = NULL;
550 g_free(multifd_recv_state->params);
551 multifd_recv_state->params = NULL;
552 g_free(multifd_recv_state);
553 multifd_recv_state = NULL;
555 return ret;
558 static void *multifd_recv_thread(void *opaque)
560 MultiFDRecvParams *p = opaque;
562 while (true) {
563 qemu_mutex_lock(&p->mutex);
564 if (p->quit) {
565 qemu_mutex_unlock(&p->mutex);
566 break;
568 qemu_mutex_unlock(&p->mutex);
569 qemu_sem_wait(&p->sem);
572 return NULL;
575 int multifd_load_setup(void)
577 int thread_count;
578 uint8_t i;
580 if (!migrate_use_multifd()) {
581 return 0;
583 thread_count = migrate_multifd_channels();
584 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
585 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
586 multifd_recv_state->count = 0;
587 for (i = 0; i < thread_count; i++) {
588 MultiFDRecvParams *p = &multifd_recv_state->params[i];
590 qemu_mutex_init(&p->mutex);
591 qemu_sem_init(&p->sem, 0);
592 p->quit = false;
593 p->id = i;
594 p->name = g_strdup_printf("multifdrecv_%d", i);
595 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
596 QEMU_THREAD_JOINABLE);
597 multifd_recv_state->count++;
599 return 0;
603 * save_page_header: write page header to wire
605 * If this is not a continuation of the last block sent, it also writes the block identification
607 * Returns the number of bytes written
609 * @f: QEMUFile where to send the data
610 * @block: block that contains the page we want to send
611 * @offset: offset inside the block for the page;
612 * the lower bits carry the RAM_SAVE_FLAG_* flags
614 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
615 ram_addr_t offset)
617 size_t size, len;
619 if (block == rs->last_sent_block) {
620 offset |= RAM_SAVE_FLAG_CONTINUE;
622 qemu_put_be64(f, offset);
623 size = 8;
625 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
626 len = strlen(block->idstr);
627 qemu_put_byte(f, len);
628 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
629 size += 1 + len;
630 rs->last_sent_block = block;
632 return size;
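/*
 * On the wire this becomes: a big-endian 64-bit word holding the page
 * offset ORed with RAM_SAVE_FLAG_* bits, optionally followed by a one-byte
 * length plus the block idstr when RAM_SAVE_FLAG_CONTINUE is not set.
 * A sketch of the matching read side (illustration only; the real parsing
 * happens in ram_load(), later in this file):
 */
#if 0
static void example_read_page_header(QEMUFile *f)
{
    char idstr[256];
    uint64_t addr = qemu_get_be64(f);
    uint64_t flags = addr & ~TARGET_PAGE_MASK;

    if (!(flags & RAM_SAVE_FLAG_CONTINUE)) {
        size_t len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';
    }
}
#endif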
636 * mig_throttle_guest_down: throttle down the guest
638 * Reduce amount of guest cpu execution to hopefully slow down memory
639 * writes. If guest dirty memory rate is reduced below the rate at
640 * which we can transfer pages to the destination then we should be
641 * able to complete migration. Some workloads dirty memory way too
642 * fast and will not effectively converge, even with auto-converge.
644 static void mig_throttle_guest_down(void)
646 MigrationState *s = migrate_get_current();
647 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
648 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
650 /* We have not started throttling yet. Let's start it. */
651 if (!cpu_throttle_active()) {
652 cpu_throttle_set(pct_initial);
653 } else {
654 /* Throttling already on, just increase the rate */
655 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
660 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
662 * @rs: current RAM state
663 * @current_addr: address for the zero page
665 * Update the xbzrle cache to reflect a page that's been sent as all 0.
666 * The important thing is that a stale (not-yet-0'd) page be replaced
667 * by the new data.
668 * As a bonus, if the page wasn't in the cache it gets added so that
669 * when a small write is made into the 0'd page it gets XBZRLE sent.
671 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
673 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
674 return;
677 /* We don't care if this fails to allocate a new cache page
678 * as long as it updated an old one */
679 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
680 ram_counters.dirty_sync_count);
683 #define ENCODING_FLAG_XBZRLE 0x1
686 * save_xbzrle_page: compress and send current page
688 * Returns: 1 means that we wrote the page
689 * 0 means that page is identical to the one already sent
690 * -1 means that xbzrle would be longer than normal
692 * @rs: current RAM state
693 * @current_data: pointer to the address of the page contents
694 * @current_addr: addr of the page
695 * @block: block that contains the page we want to send
696 * @offset: offset inside the block for the page
697 * @last_stage: if we are at the completion stage
699 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
700 ram_addr_t current_addr, RAMBlock *block,
701 ram_addr_t offset, bool last_stage)
703 int encoded_len = 0, bytes_xbzrle;
704 uint8_t *prev_cached_page;
706 if (!cache_is_cached(XBZRLE.cache, current_addr,
707 ram_counters.dirty_sync_count)) {
708 xbzrle_counters.cache_miss++;
709 if (!last_stage) {
710 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
711 ram_counters.dirty_sync_count) == -1) {
712 return -1;
713 } else {
714 /* update *current_data when the page has been
715 inserted into cache */
716 *current_data = get_cached_data(XBZRLE.cache, current_addr);
719 return -1;
722 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
724 /* save current buffer into memory */
725 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
727 /* XBZRLE encoding (if there is no overflow) */
728 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
729 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
730 TARGET_PAGE_SIZE);
731 if (encoded_len == 0) {
732 trace_save_xbzrle_page_skipping();
733 return 0;
734 } else if (encoded_len == -1) {
735 trace_save_xbzrle_page_overflow();
736 xbzrle_counters.overflow++;
737 /* update data in the cache */
738 if (!last_stage) {
739 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
740 *current_data = prev_cached_page;
742 return -1;
745 /* we need to update the data in the cache, in order to get the same data */
746 if (!last_stage) {
747 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
750 /* Send XBZRLE based compressed page */
751 bytes_xbzrle = save_page_header(rs, rs->f, block,
752 offset | RAM_SAVE_FLAG_XBZRLE);
753 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
754 qemu_put_be16(rs->f, encoded_len);
755 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
756 bytes_xbzrle += encoded_len + 1 + 2;
757 xbzrle_counters.pages++;
758 xbzrle_counters.bytes += bytes_xbzrle;
759 ram_counters.transferred += bytes_xbzrle;
761 return 1;
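/*
 * Wire cost of one XBZRLE page, as accounted above: the page header from
 * save_page_header(), one flag byte (ENCODING_FLAG_XBZRLE), a big-endian
 * 16-bit encoded length and then encoded_len bytes of payload - hence
 * the "encoded_len + 1 + 2" added to bytes_xbzrle.
 */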
765 * migration_bitmap_find_dirty: find the next dirty page from start
767 * Called with rcu_read_lock() to protect migration_bitmap
769 * Returns the page offset within the memory region of the start of a dirty page
771 * @rs: current RAM state
772 * @rb: RAMBlock where to search for dirty pages
773 * @start: page where we start the search
775 static inline
776 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
777 unsigned long start)
779 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
780 unsigned long *bitmap = rb->bmap;
781 unsigned long next;
783 if (rs->ram_bulk_stage && start > 0) {
784 next = start + 1;
785 } else {
786 next = find_next_bit(bitmap, size, start);
789 return next;
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
796 bool ret;
798 ret = test_and_clear_bit(page, rb->bmap);
800 if (ret) {
801 rs->migration_dirty_pages--;
803 return ret;
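/*
 * Together the two helpers above form the "find, then claim" pattern used
 * by the dirty page scan: look up the next set bit, then test-and-clear it
 * so each page is sent at most once per bitmap sync.  A condensed sketch
 * of that pattern (illustration only; the real scan is driven by
 * find_dirty_block() and ram_save_host_page() below):
 */
#if 0
static void example_scan_block(RAMState *rs, RAMBlock *rb)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long page = migration_bitmap_find_dirty(rs, rb, 0);

    while (page < size) {
        if (migration_bitmap_clear_dirty(rs, rb, page)) {
            /* the page is now claimed; it would be sent here */
        }
        page = migration_bitmap_find_dirty(rs, rb, page + 1);
    }
}
#endif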
806 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
807 ram_addr_t start, ram_addr_t length)
809 rs->migration_dirty_pages +=
810 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
811 &rs->num_dirty_pages_period);
815 * ram_pagesize_summary: calculate all the pagesizes of a VM
817 * Returns a summary bitmap of the page sizes of all RAMBlocks
819 * For VMs with just normal pages this is equivalent to the host page
820 * size. If it's got some huge pages then it's the OR of all the
821 * different page sizes.
823 uint64_t ram_pagesize_summary(void)
825 RAMBlock *block;
826 uint64_t summary = 0;
828 RAMBLOCK_FOREACH(block) {
829 summary |= block->page_size;
832 return summary;
835 static void migration_bitmap_sync(RAMState *rs)
837 RAMBlock *block;
838 int64_t end_time;
839 uint64_t bytes_xfer_now;
841 ram_counters.dirty_sync_count++;
843 if (!rs->time_last_bitmap_sync) {
844 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
847 trace_migration_bitmap_sync_start();
848 memory_global_dirty_log_sync();
850 qemu_mutex_lock(&rs->bitmap_mutex);
851 rcu_read_lock();
852 RAMBLOCK_FOREACH(block) {
853 migration_bitmap_sync_range(rs, block, 0, block->used_length);
855 rcu_read_unlock();
856 qemu_mutex_unlock(&rs->bitmap_mutex);
858 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
860 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
862 /* more than 1 second = 1000 milliseconds */
863 if (end_time > rs->time_last_bitmap_sync + 1000) {
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867 bytes_xfer_now = ram_counters.transferred;
869 /* During block migration the auto-converge logic incorrectly detects
870 * that ram migration makes no progress. Avoid this by disabling the
871 * throttling logic during the bulk phase of block migration. */
872 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
873 /* The following detection logic can be refined later. For now:
874 Check to see if the dirtied bytes exceed 50% of the approx.
875 amount of bytes that just got transferred since the last time we
876 were in this routine. If that happens twice, start or increase
877 throttling */
879 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
880 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
881 (++rs->dirty_rate_high_cnt >= 2)) {
882 trace_migration_throttle();
883 rs->dirty_rate_high_cnt = 0;
884 mig_throttle_guest_down();
888 if (migrate_use_xbzrle()) {
889 if (rs->iterations_prev != rs->iterations) {
890 xbzrle_counters.cache_miss_rate =
891 (double)(xbzrle_counters.cache_miss -
892 rs->xbzrle_cache_miss_prev) /
893 (rs->iterations - rs->iterations_prev);
895 rs->iterations_prev = rs->iterations;
896 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
899 /* reset period counters */
900 rs->time_last_bitmap_sync = end_time;
901 rs->num_dirty_pages_period = 0;
902 rs->bytes_xfer_prev = bytes_xfer_now;
904 if (migrate_use_events()) {
905 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
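/*
 * Worked example for the auto-converge check above: if roughly 300 MB were
 * transferred during the last one-second period while the guest dirtied
 * 160 MB, then 160 MB > 300 MB / 2 and dirty_rate_high_cnt is bumped; a
 * second consecutive period like that triggers mig_throttle_guest_down().
 */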
910 * save_zero_page: send the zero page to the stream
912 * Returns the number of pages written.
914 * @rs: current RAM state
915 * @block: block that contains the page we want to send
916 * @offset: offset inside the block for the page
918 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
920 uint8_t *p = block->host + offset;
921 int pages = -1;
923 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
924 ram_counters.duplicate++;
925 ram_counters.transferred +=
926 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
927 qemu_put_byte(rs->f, 0);
928 ram_counters.transferred += 1;
929 pages = 1;
932 return pages;
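/*
 * Wire cost of a zero page, as accounted above: just the page header plus
 * a single byte (the fill byte, always 0 here), and the page is counted as
 * a "duplicate" rather than a "normal" page.
 */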
935 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
937 if (!migrate_release_ram() || !migration_in_postcopy()) {
938 return;
941 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
945 * ram_save_page: send the given page to the stream
947 * Returns the number of pages written.
948 * < 0 - error
949 * >=0 - Number of pages written - this might legally be 0
950 * if xbzrle noticed the page was the same.
952 * @rs: current RAM state
953 * @block: block that contains the page we want to send
954 * @offset: offset inside the block for the page
955 * @last_stage: if we are at the completion stage
957 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
959 int pages = -1;
960 uint64_t bytes_xmit;
961 ram_addr_t current_addr;
962 uint8_t *p;
963 int ret;
964 bool send_async = true;
965 RAMBlock *block = pss->block;
966 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
968 p = block->host + offset;
969 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
971 /* When in doubt, send the page as normal */
972 bytes_xmit = 0;
973 ret = ram_control_save_page(rs->f, block->offset,
974 offset, TARGET_PAGE_SIZE, &bytes_xmit);
975 if (bytes_xmit) {
976 ram_counters.transferred += bytes_xmit;
977 pages = 1;
980 XBZRLE_cache_lock();
982 current_addr = block->offset + offset;
984 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
985 if (ret != RAM_SAVE_CONTROL_DELAYED) {
986 if (bytes_xmit > 0) {
987 ram_counters.normal++;
988 } else if (bytes_xmit == 0) {
989 ram_counters.duplicate++;
992 } else {
993 pages = save_zero_page(rs, block, offset);
994 if (pages > 0) {
995 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
996 * page would be stale
998 xbzrle_cache_zero_page(rs, current_addr);
999 ram_release_pages(block->idstr, offset, pages);
1000 } else if (!rs->ram_bulk_stage &&
1001 !migration_in_postcopy() && migrate_use_xbzrle()) {
1002 pages = save_xbzrle_page(rs, &p, current_addr, block,
1003 offset, last_stage);
1004 if (!last_stage) {
1005 /* Can't send this cached data async, since the cache page
1006 * might get updated before it gets to the wire
1008 send_async = false;
1013 /* XBZRLE overflow or normal page */
1014 if (pages == -1) {
1015 ram_counters.transferred +=
1016 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1017 if (send_async) {
1018 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1019 migrate_release_ram() &
1020 migration_in_postcopy());
1021 } else {
1022 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1024 ram_counters.transferred += TARGET_PAGE_SIZE;
1025 pages = 1;
1026 ram_counters.normal++;
1029 XBZRLE_cache_unlock();
1031 return pages;
1034 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1035 ram_addr_t offset)
1037 RAMState *rs = ram_state;
1038 int bytes_sent, blen;
1039 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1041 bytes_sent = save_page_header(rs, f, block, offset |
1042 RAM_SAVE_FLAG_COMPRESS_PAGE);
1043 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1044 migrate_compress_level());
1045 if (blen < 0) {
1046 bytes_sent = 0;
1047 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1048 error_report("compressed data failed!");
1049 } else {
1050 bytes_sent += blen;
1051 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1054 return bytes_sent;
1057 static void flush_compressed_data(RAMState *rs)
1059 int idx, len, thread_count;
1061 if (!migrate_use_compression()) {
1062 return;
1064 thread_count = migrate_compress_threads();
1066 qemu_mutex_lock(&comp_done_lock);
1067 for (idx = 0; idx < thread_count; idx++) {
1068 while (!comp_param[idx].done) {
1069 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1072 qemu_mutex_unlock(&comp_done_lock);
1074 for (idx = 0; idx < thread_count; idx++) {
1075 qemu_mutex_lock(&comp_param[idx].mutex);
1076 if (!comp_param[idx].quit) {
1077 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1078 ram_counters.transferred += len;
1080 qemu_mutex_unlock(&comp_param[idx].mutex);
1084 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1085 ram_addr_t offset)
1087 param->block = block;
1088 param->offset = offset;
1091 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1092 ram_addr_t offset)
1094 int idx, thread_count, bytes_xmit = -1, pages = -1;
1096 thread_count = migrate_compress_threads();
1097 qemu_mutex_lock(&comp_done_lock);
1098 while (true) {
1099 for (idx = 0; idx < thread_count; idx++) {
1100 if (comp_param[idx].done) {
1101 comp_param[idx].done = false;
1102 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1103 qemu_mutex_lock(&comp_param[idx].mutex);
1104 set_compress_params(&comp_param[idx], block, offset);
1105 qemu_cond_signal(&comp_param[idx].cond);
1106 qemu_mutex_unlock(&comp_param[idx].mutex);
1107 pages = 1;
1108 ram_counters.normal++;
1109 ram_counters.transferred += bytes_xmit;
1110 break;
1113 if (pages > 0) {
1114 break;
1115 } else {
1116 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1119 qemu_mutex_unlock(&comp_done_lock);
1121 return pages;
1125 * ram_save_compressed_page: compress the given page and send it to the stream
1127 * Returns the number of pages written.
1129 * @rs: current RAM state
1130 * @block: block that contains the page we want to send
1131 * @offset: offset inside the block for the page
1132 * @last_stage: if we are at the completion stage
1134 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1135 bool last_stage)
1137 int pages = -1;
1138 uint64_t bytes_xmit = 0;
1139 uint8_t *p;
1140 int ret, blen;
1141 RAMBlock *block = pss->block;
1142 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1144 p = block->host + offset;
1146 ret = ram_control_save_page(rs->f, block->offset,
1147 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1148 if (bytes_xmit) {
1149 ram_counters.transferred += bytes_xmit;
1150 pages = 1;
1152 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1153 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1154 if (bytes_xmit > 0) {
1155 ram_counters.normal++;
1156 } else if (bytes_xmit == 0) {
1157 ram_counters.duplicate++;
1160 } else {
1161 /* When starting the process of a new block, the first page of
1162 * the block should be sent out before other pages in the same
1163 * block, and all the pages in the last block should have been sent
1164 * out.  Keeping this order is important, because the 'cont' flag
1165 * is used to avoid resending the block name.
1167 if (block != rs->last_sent_block) {
1168 flush_compressed_data(rs);
1169 pages = save_zero_page(rs, block, offset);
1170 if (pages == -1) {
1171 /* Make sure the first page is sent out before other pages */
1172 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1173 RAM_SAVE_FLAG_COMPRESS_PAGE);
1174 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1175 migrate_compress_level());
1176 if (blen > 0) {
1177 ram_counters.transferred += bytes_xmit + blen;
1178 ram_counters.normal++;
1179 pages = 1;
1180 } else {
1181 qemu_file_set_error(rs->f, blen);
1182 error_report("compressed data failed!");
1185 if (pages > 0) {
1186 ram_release_pages(block->idstr, offset, pages);
1188 } else {
1189 pages = save_zero_page(rs, block, offset);
1190 if (pages == -1) {
1191 pages = compress_page_with_multi_thread(rs, block, offset);
1192 } else {
1193 ram_release_pages(block->idstr, offset, pages);
1198 return pages;
1202 * find_dirty_block: find the next dirty page and update any state
1203 * associated with the search process.
1205 * Returns true if a page was found
1207 * @rs: current RAM state
1208 * @pss: data about the state of the current dirty page scan
1209 * @again: set to false if the search has scanned the whole of RAM
1211 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1213 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1214 if (pss->complete_round && pss->block == rs->last_seen_block &&
1215 pss->page >= rs->last_page) {
1217 * We've been once around the RAM and haven't found anything.
1218 * Give up.
1220 *again = false;
1221 return false;
1223 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1224 /* Didn't find anything in this RAM Block */
1225 pss->page = 0;
1226 pss->block = QLIST_NEXT_RCU(pss->block, next);
1227 if (!pss->block) {
1228 /* Hit the end of the list */
1229 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1230 /* Flag that we've looped */
1231 pss->complete_round = true;
1232 rs->ram_bulk_stage = false;
1233 if (migrate_use_xbzrle()) {
1234 /* If xbzrle is on, stop using the data compression at this
1235 * point. In theory, xbzrle can do better than compression.
1237 flush_compressed_data(rs);
1240 /* Didn't find anything this time, but try again on the new block */
1241 *again = true;
1242 return false;
1243 } else {
1244 /* Can go around again, but... */
1245 *again = true;
1246 /* We've found something so probably don't need to */
1247 return true;
1252 * unqueue_page: gets a page off the queue
1254 * Helper for 'get_queued_page' - gets a page off the queue
1256 * Returns the block of the page (or NULL if none available)
1258 * @rs: current RAM state
1259 * @offset: used to return the offset within the RAMBlock
1261 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1263 RAMBlock *block = NULL;
1265 qemu_mutex_lock(&rs->src_page_req_mutex);
1266 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1267 struct RAMSrcPageRequest *entry =
1268 QSIMPLEQ_FIRST(&rs->src_page_requests);
1269 block = entry->rb;
1270 *offset = entry->offset;
1272 if (entry->len > TARGET_PAGE_SIZE) {
1273 entry->len -= TARGET_PAGE_SIZE;
1274 entry->offset += TARGET_PAGE_SIZE;
1275 } else {
1276 memory_region_unref(block->mr);
1277 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1278 g_free(entry);
1281 qemu_mutex_unlock(&rs->src_page_req_mutex);
1283 return block;
1287 * get_queued_page: unqueue a page from the postcopy requests
1289 * Skips pages that are already sent (!dirty)
1291 * Returns true if a queued page was found
1293 * @rs: current RAM state
1294 * @pss: data about the state of the current dirty page scan
1296 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1298 RAMBlock *block;
1299 ram_addr_t offset;
1300 bool dirty;
1302 do {
1303 block = unqueue_page(rs, &offset);
1305 * We're sending this page, and since it's postcopy nothing else
1306 * will dirty it, and we must make sure it doesn't get sent again
1307 * even if this queue request was received after the background
1308 * search already sent it.
1310 if (block) {
1311 unsigned long page;
1313 page = offset >> TARGET_PAGE_BITS;
1314 dirty = test_bit(page, block->bmap);
1315 if (!dirty) {
1316 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1317 page, test_bit(page, block->unsentmap));
1318 } else {
1319 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1323 } while (block && !dirty);
1325 if (block) {
1327 * As soon as we start servicing pages out of order, we have
1328 * to kill the bulk stage, since the bulk stage assumes
1329 * (in migration_bitmap_find_dirty()) that every page is
1330 * dirty, which is no longer true.
1332 rs->ram_bulk_stage = false;
1335 * We want the background search to continue from the queued page
1336 * since the guest is likely to want other pages near to the page
1337 * it just requested.
1339 pss->block = block;
1340 pss->page = offset >> TARGET_PAGE_BITS;
1343 return !!block;
1347 * migration_page_queue_free: drop any remaining pages in the ram
1348 * request queue
1350 * It should be empty at the end anyway, but in error cases there may
1351 * be some left.  In case any pages are left, we drop them.
1354 static void migration_page_queue_free(RAMState *rs)
1356 struct RAMSrcPageRequest *mspr, *next_mspr;
1357 /* This queue generally should be empty - but in the case of a failed
1358 * migration might have some droppings in.
1360 rcu_read_lock();
1361 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1362 memory_region_unref(mspr->rb->mr);
1363 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1364 g_free(mspr);
1366 rcu_read_unlock();
1370 * ram_save_queue_pages: queue the page for transmission
1372 * A request from postcopy destination for example.
1374 * Returns zero on success or negative on error
1376 * @rbname: Name of the RAMBlock of the request. NULL means the
1377 * same as the last one.
1378 * @start: starting address from the start of the RAMBlock
1379 * @len: length (in bytes) to send
1381 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1383 RAMBlock *ramblock;
1384 RAMState *rs = ram_state;
1386 ram_counters.postcopy_requests++;
1387 rcu_read_lock();
1388 if (!rbname) {
1389 /* Reuse last RAMBlock */
1390 ramblock = rs->last_req_rb;
1392 if (!ramblock) {
1394 * Shouldn't happen, we can't reuse the last RAMBlock if
1395 * it's the 1st request.
1397 error_report("ram_save_queue_pages no previous block");
1398 goto err;
1400 } else {
1401 ramblock = qemu_ram_block_by_name(rbname);
1403 if (!ramblock) {
1404 /* We shouldn't be asked for a non-existent RAMBlock */
1405 error_report("ram_save_queue_pages no block '%s'", rbname);
1406 goto err;
1408 rs->last_req_rb = ramblock;
1410 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1411 if (start+len > ramblock->used_length) {
1412 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1413 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1414 __func__, start, len, ramblock->used_length);
1415 goto err;
1418 struct RAMSrcPageRequest *new_entry =
1419 g_malloc0(sizeof(struct RAMSrcPageRequest));
1420 new_entry->rb = ramblock;
1421 new_entry->offset = start;
1422 new_entry->len = len;
1424 memory_region_ref(ramblock->mr);
1425 qemu_mutex_lock(&rs->src_page_req_mutex);
1426 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1427 qemu_mutex_unlock(&rs->src_page_req_mutex);
1428 rcu_read_unlock();
1430 return 0;
1432 err:
1433 rcu_read_unlock();
1434 return -1;
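/*
 * A minimal caller sketch for the queueing API above (illustration only;
 * example_queue_one_page() is hypothetical - in the real code the
 * rbname/start/len triple arrives from the destination over the return
 * path during postcopy):
 */
#if 0
static void example_queue_one_page(const char *rbname, ram_addr_t addr)
{
    /* round down to a whole target page and queue exactly one page */
    ram_addr_t start = addr & TARGET_PAGE_MASK;

    if (ram_save_queue_pages(rbname, start, TARGET_PAGE_SIZE)) {
        error_report("failed to queue page at " RAM_ADDR_FMT, start);
    }
}
#endif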
1438 * ram_save_target_page: save one target page
1440 * Returns the number of pages written
1442 * @rs: current RAM state
1444 * @pss: data about the page we want to send
1445 * @last_stage: if we are at the completion stage
1447 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1448 bool last_stage)
1450 int res = 0;
1452 /* Check if the page is dirty and, if it is, send it */
1453 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1455 * If xbzrle is on, stop using the data compression after first
1456 * round of migration even if compression is enabled. In theory,
1457 * xbzrle can do better than compression.
1459 if (migrate_use_compression() &&
1460 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1461 res = ram_save_compressed_page(rs, pss, last_stage);
1462 } else {
1463 res = ram_save_page(rs, pss, last_stage);
1466 if (res < 0) {
1467 return res;
1469 if (pss->block->unsentmap) {
1470 clear_bit(pss->page, pss->block->unsentmap);
1474 return res;
1478 * ram_save_host_page: save a whole host page
1480 * Starting at *offset send pages up to the end of the current host
1481 * page. It's valid for the initial offset to point into the middle of
1482 * a host page in which case the remainder of the hostpage is sent.
1483 * Only dirty target pages are sent. Note that the host page size may
1484 * be a huge page for this block.
1485 * The saving stops at the boundary of the used_length of the block
1486 * if the RAMBlock isn't a multiple of the host page size.
1488 * Returns the number of pages written or negative on error
1490 * @rs: current RAM state
1492 * @pss: data about the page we want to send
1493 * @last_stage: if we are at the completion stage
1495 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1496 bool last_stage)
1498 int tmppages, pages = 0;
1499 size_t pagesize_bits =
1500 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1502 do {
1503 tmppages = ram_save_target_page(rs, pss, last_stage);
1504 if (tmppages < 0) {
1505 return tmppages;
1508 pages += tmppages;
1509 pss->page++;
1510 } while ((pss->page & (pagesize_bits - 1)) &&
1511 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1513 /* The offset we leave with is the last one we looked at */
1514 pss->page--;
1515 return pages;
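/*
 * Worked example for the loop above: with a 2 MiB hugepage backed block
 * and 4 KiB target pages, pagesize_bits is 512; a call that starts in the
 * middle of a host page keeps sending dirty target pages until pss->page
 * reaches the next multiple of 512 (or runs past used_length), and then
 * steps back one page so the caller resumes from the last page examined.
 */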
1519 * ram_find_and_save_block: finds a dirty page and sends it to f
1521 * Called within an RCU critical section.
1523 * Returns the number of pages written where zero means no dirty pages
1525 * @rs: current RAM state
1526 * @last_stage: if we are at the completion stage
1528 * On systems where host-page-size > target-page-size it will send all the
1529 * pages in a host page that are dirty.
1532 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1534 PageSearchStatus pss;
1535 int pages = 0;
1536 bool again, found;
1538 /* No dirty page as there is zero RAM */
1539 if (!ram_bytes_total()) {
1540 return pages;
1543 pss.block = rs->last_seen_block;
1544 pss.page = rs->last_page;
1545 pss.complete_round = false;
1547 if (!pss.block) {
1548 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1551 do {
1552 again = true;
1553 found = get_queued_page(rs, &pss);
1555 if (!found) {
1556 /* priority queue empty, so just search for something dirty */
1557 found = find_dirty_block(rs, &pss, &again);
1560 if (found) {
1561 pages = ram_save_host_page(rs, &pss, last_stage);
1563 } while (!pages && again);
1565 rs->last_seen_block = pss.block;
1566 rs->last_page = pss.page;
1568 return pages;
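/*
 * A condensed sketch of how the function above is driven (illustration
 * only; the real loop lives in ram_save_iterate(), further down in this
 * file, which also applies rate limiting and an iteration time budget):
 */
#if 0
static void example_drive_save(RAMState *rs)
{
    int pages;

    rcu_read_lock();
    do {
        pages = ram_find_and_save_block(rs, false);
        /* 0 means no dirty page was found in this pass */
    } while (pages > 0);
    rcu_read_unlock();
}
#endif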
1571 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1573 uint64_t pages = size / TARGET_PAGE_SIZE;
1575 if (zero) {
1576 ram_counters.duplicate += pages;
1577 } else {
1578 ram_counters.normal += pages;
1579 ram_counters.transferred += size;
1580 qemu_update_position(f, size);
1584 uint64_t ram_bytes_total(void)
1586 RAMBlock *block;
1587 uint64_t total = 0;
1589 rcu_read_lock();
1590 RAMBLOCK_FOREACH(block) {
1591 total += block->used_length;
1593 rcu_read_unlock();
1594 return total;
1597 static void xbzrle_load_setup(void)
1599 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1602 static void xbzrle_load_cleanup(void)
1604 g_free(XBZRLE.decoded_buf);
1605 XBZRLE.decoded_buf = NULL;
1608 static void ram_state_cleanup(RAMState **rsp)
1610 if (*rsp) {
1611 migration_page_queue_free(*rsp);
1612 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1613 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1614 g_free(*rsp);
1615 *rsp = NULL;
1619 static void xbzrle_cleanup(void)
1621 XBZRLE_cache_lock();
1622 if (XBZRLE.cache) {
1623 cache_fini(XBZRLE.cache);
1624 g_free(XBZRLE.encoded_buf);
1625 g_free(XBZRLE.current_buf);
1626 g_free(XBZRLE.zero_target_page);
1627 XBZRLE.cache = NULL;
1628 XBZRLE.encoded_buf = NULL;
1629 XBZRLE.current_buf = NULL;
1630 XBZRLE.zero_target_page = NULL;
1632 XBZRLE_cache_unlock();
1635 static void ram_save_cleanup(void *opaque)
1637 RAMState **rsp = opaque;
1638 RAMBlock *block;
1640 /* the caller must hold the iothread lock or be in a bottom half, so
1641 * there is no write race against this migration bitmap
1643 memory_global_dirty_log_stop();
1645 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1646 g_free(block->bmap);
1647 block->bmap = NULL;
1648 g_free(block->unsentmap);
1649 block->unsentmap = NULL;
1652 xbzrle_cleanup();
1653 compress_threads_save_cleanup();
1654 ram_state_cleanup(rsp);
1657 static void ram_state_reset(RAMState *rs)
1659 rs->last_seen_block = NULL;
1660 rs->last_sent_block = NULL;
1661 rs->last_page = 0;
1662 rs->last_version = ram_list.version;
1663 rs->ram_bulk_stage = true;
1666 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1669 * 'expected' is the value you expect the bitmap mostly to be full
1670 * of; it won't bother printing lines that are all this value.
1671 * If 'todump' is null the migration bitmap is dumped.
1673 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1674 unsigned long pages)
1676 int64_t cur;
1677 int64_t linelen = 128;
1678 char linebuf[129];
1680 for (cur = 0; cur < pages; cur += linelen) {
1681 int64_t curb;
1682 bool found = false;
1684 * Last line; catch the case where the line length
1685 * is longer than remaining ram
1687 if (cur + linelen > pages) {
1688 linelen = pages - cur;
1690 for (curb = 0; curb < linelen; curb++) {
1691 bool thisbit = test_bit(cur + curb, todump);
1692 linebuf[curb] = thisbit ? '1' : '.';
1693 found = found || (thisbit != expected);
1695 if (found) {
1696 linebuf[curb] = '\0';
1697 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1702 /* **** functions for postcopy ***** */
1704 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1706 struct RAMBlock *block;
1708 RAMBLOCK_FOREACH(block) {
1709 unsigned long *bitmap = block->bmap;
1710 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1711 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1713 while (run_start < range) {
1714 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1715 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1716 (run_end - run_start) << TARGET_PAGE_BITS);
1717 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1723 * postcopy_send_discard_bm_ram: discard a RAMBlock
1725 * Returns zero on success
1727 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1728 * Note: At this point the 'unsentmap' is the processed bitmap combined
1729 * with the dirtymap; so a '1' means it's either dirty or unsent.
1731 * @ms: current migration state
1732 * @pds: state for postcopy
1733 * @block: RAMBlock to discard
1736 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1737 PostcopyDiscardState *pds,
1738 RAMBlock *block)
1740 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1741 unsigned long current;
1742 unsigned long *unsentmap = block->unsentmap;
1744 for (current = 0; current < end; ) {
1745 unsigned long one = find_next_bit(unsentmap, end, current);
1747 if (one <= end) {
1748 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1749 unsigned long discard_length;
1751 if (zero >= end) {
1752 discard_length = end - one;
1753 } else {
1754 discard_length = zero - one;
1756 if (discard_length) {
1757 postcopy_discard_send_range(ms, pds, one, discard_length);
1759 current = one + discard_length;
1760 } else {
1761 current = one;
1765 return 0;
1769 * postcopy_each_ram_send_discard: discard all RAMBlocks
1771 * Returns 0 for success or negative for error
1773 * Utility for the outgoing postcopy code.
1774 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1775 * passing it bitmap indexes and name.
1776 * (qemu_ram_foreach_block ends up passing unscaled lengths
1777 * which would mean postcopy code would have to deal with target page)
1779 * @ms: current migration state
1781 static int postcopy_each_ram_send_discard(MigrationState *ms)
1783 struct RAMBlock *block;
1784 int ret;
1786 RAMBLOCK_FOREACH(block) {
1787 PostcopyDiscardState *pds =
1788 postcopy_discard_send_init(ms, block->idstr);
1791 * Postcopy sends chunks of bitmap over the wire, but it
1792 * just needs indexes at this point, avoids it having
1793 * target page specific code.
1795 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1796 postcopy_discard_send_finish(ms, pds);
1797 if (ret) {
1798 return ret;
1802 return 0;
1806 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1808 * Helper for postcopy_chunk_hostpages; it's called twice to
1809 * canonicalize the two bitmaps, which are similar but one is
1810 * inverted.
1812 * Postcopy requires that all target pages in a hostpage are dirty or
1813 * clean, not a mix. This function canonicalizes the bitmaps.
1815 * @ms: current migration state
1816 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1817 * otherwise we need to canonicalize partially dirty host pages
1818 * @block: block that contains the page we want to canonicalize
1819 * @pds: state for postcopy
1821 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1822 RAMBlock *block,
1823 PostcopyDiscardState *pds)
1825 RAMState *rs = ram_state;
1826 unsigned long *bitmap = block->bmap;
1827 unsigned long *unsentmap = block->unsentmap;
1828 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1829 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1830 unsigned long run_start;
1832 if (block->page_size == TARGET_PAGE_SIZE) {
1833 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1834 return;
1837 if (unsent_pass) {
1838 /* Find a sent page */
1839 run_start = find_next_zero_bit(unsentmap, pages, 0);
1840 } else {
1841 /* Find a dirty page */
1842 run_start = find_next_bit(bitmap, pages, 0);
1845 while (run_start < pages) {
1846 bool do_fixup = false;
1847 unsigned long fixup_start_addr;
1848 unsigned long host_offset;
1851 * If the start of this run of pages is in the middle of a host
1852 * page, then we need to fixup this host page.
1854 host_offset = run_start % host_ratio;
1855 if (host_offset) {
1856 do_fixup = true;
1857 run_start -= host_offset;
1858 fixup_start_addr = run_start;
1859 /* For the next pass */
1860 run_start = run_start + host_ratio;
1861 } else {
1862 /* Find the end of this run */
1863 unsigned long run_end;
1864 if (unsent_pass) {
1865 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1866 } else {
1867 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1870 * If the end isn't at the start of a host page, then the
1871 * run doesn't finish at the end of a host page
1872 * and we need to discard.
1874 host_offset = run_end % host_ratio;
1875 if (host_offset) {
1876 do_fixup = true;
1877 fixup_start_addr = run_end - host_offset;
1879 * This host page has gone, the next loop iteration starts
1880 * from after the fixup
1882 run_start = fixup_start_addr + host_ratio;
1883 } else {
1885 * No discards on this iteration, next loop starts from
1886 * next sent/dirty page
1888 run_start = run_end + 1;
1892 if (do_fixup) {
1893 unsigned long page;
1895 /* Tell the destination to discard this page */
1896 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1897 /* For the unsent_pass we:
1898 * discard partially sent pages
1899 * For the !unsent_pass (dirty) we:
1900 * discard partially dirty pages that were sent
1901 * (any partially sent pages were already discarded
1902 * by the previous unsent_pass)
1904 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1905 host_ratio);
1908 /* Clean up the bitmap */
1909 for (page = fixup_start_addr;
1910 page < fixup_start_addr + host_ratio; page++) {
1911 /* All pages in this host page are now not sent */
1912 set_bit(page, unsentmap);
1915 * Remark them as dirty, updating the count for any pages
1916 * that weren't previously dirty.
1918 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1922 if (unsent_pass) {
1923 /* Find the next sent page for the next iteration */
1924 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1925 } else {
1926 /* Find the next dirty page for the next iteration */
1927 run_start = find_next_bit(bitmap, pages, run_start);
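/*
 * Worked example for the pass above: with 2 MiB host pages and 4 KiB
 * target pages, host_ratio is 512.  A dirty run starting at target page
 * 1000 gives host_offset = 1000 % 512 = 488, so the whole host page
 * covering target pages 512..1023 is discarded, every page in it is
 * marked unsent and re-dirtied, and the next iteration resumes at
 * page 1024.
 */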
1933 * postcopy_chunk_hostpages: discard any partially sent host page
1935 * Utility for the outgoing postcopy code.
1937 * Discard any partially sent host-page size chunks, mark any partially
1938 * dirty host-page size chunks as all dirty. In this case the host-page
1939 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1941 * Returns zero on success
1943 * @ms: current migration state
1944 * @block: block we want to work with
1946 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1948 PostcopyDiscardState *pds =
1949 postcopy_discard_send_init(ms, block->idstr);
1951 /* First pass: Discard all partially sent host pages */
1952 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1954 * Second pass: Ensure that all partially dirty host pages are made
1955 * fully dirty.
1957 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1959 postcopy_discard_send_finish(ms, pds);
1960 return 0;
1964 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1966 * Returns zero on success
1968 * Transmit the set of pages to be discarded after precopy to the target;
1969 * these are pages that:
1970 * a) Have been previously transmitted but are now dirty again
1971 * b) Pages that have never been transmitted, this ensures that
1972 * any pages on the destination that have been mapped by background
1973 * tasks get discarded (transparent huge pages is the specific concern)
1974 * Hopefully this is pretty sparse
1976 * @ms: current migration state
1978 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1980 RAMState *rs = ram_state;
1981 RAMBlock *block;
1982 int ret;
1984 rcu_read_lock();
1986 /* This should be our last sync, the src is now paused */
1987 migration_bitmap_sync(rs);
1989 /* Easiest way to make sure we don't resume in the middle of a host-page */
1990 rs->last_seen_block = NULL;
1991 rs->last_sent_block = NULL;
1992 rs->last_page = 0;
1994 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1995 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1996 unsigned long *bitmap = block->bmap;
1997 unsigned long *unsentmap = block->unsentmap;
1999 if (!unsentmap) {
2000 /* We don't have a safe way to resize the sentmap, so
2001 * if the bitmap was resized it will be NULL at this
2002 * point.
2004 error_report("migration ram resized during precopy phase");
2005 rcu_read_unlock();
2006 return -EINVAL;
2008 /* Deal with TPS != HPS and huge pages */
2009 ret = postcopy_chunk_hostpages(ms, block);
2010 if (ret) {
2011 rcu_read_unlock();
2012 return ret;
2016 * Update the unsentmap to be unsentmap = unsentmap | dirty
2018 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2019 #ifdef DEBUG_POSTCOPY
2020 ram_debug_dump_bitmap(unsentmap, true, pages);
2021 #endif
2023 trace_ram_postcopy_send_discard_bitmap();
2025 ret = postcopy_each_ram_send_discard(ms);
2026 rcu_read_unlock();
2028 return ret;
2032 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2034 * Returns zero on success
2036 * @rbname: name of the RAMBlock of the request
2038 * @start: byte offset within the RAMBlock at which to start discarding
2039 * @length: number of bytes to discard
2041 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2043 int ret = -1;
2045 trace_ram_discard_range(rbname, start, length);
2047 rcu_read_lock();
2048 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2050 if (!rb) {
2051 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2052 goto err;
2055 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2056 length >> qemu_target_page_bits());
2057 ret = ram_block_discard_range(rb, start, length);
2059 err:
2060 rcu_read_unlock();
2062 return ret;
2066 * For every allocation, we will try not to crash the VM if the
2067 * allocation failed.
2069 static int xbzrle_init(void)
2071 Error *local_err = NULL;
2073 if (!migrate_use_xbzrle()) {
2074 return 0;
2077 XBZRLE_cache_lock();
2079 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2080 if (!XBZRLE.zero_target_page) {
2081 error_report("%s: Error allocating zero page", __func__);
2082 goto err_out;
2085 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2086 TARGET_PAGE_SIZE, &local_err);
2087 if (!XBZRLE.cache) {
2088 error_report_err(local_err);
2089 goto free_zero_page;
2092 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2093 if (!XBZRLE.encoded_buf) {
2094 error_report("%s: Error allocating encoded_buf", __func__);
2095 goto free_cache;
2098 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2099 if (!XBZRLE.current_buf) {
2100 error_report("%s: Error allocating current_buf", __func__);
2101 goto free_encoded_buf;
2104 /* We are all good */
2105 XBZRLE_cache_unlock();
2106 return 0;
2108 free_encoded_buf:
2109 g_free(XBZRLE.encoded_buf);
2110 XBZRLE.encoded_buf = NULL;
2111 free_cache:
2112 cache_fini(XBZRLE.cache);
2113 XBZRLE.cache = NULL;
2114 free_zero_page:
2115 g_free(XBZRLE.zero_target_page);
2116 XBZRLE.zero_target_page = NULL;
2117 err_out:
2118 XBZRLE_cache_unlock();
2119 return -ENOMEM;
2122 static int ram_state_init(RAMState **rsp)
2124 *rsp = g_try_new0(RAMState, 1);
2126 if (!*rsp) {
2127 error_report("%s: Init ramstate fail", __func__);
2128 return -1;
2131 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2132 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2133 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2136 * Count the total number of pages used by ram blocks not including any
2137 * gaps due to alignment or unplugs.
2139 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
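/* Every page starts out dirty, so the counter begins at the full page
 * count; it is decremented as pages are sent and topped up again on each
 * bitmap sync. */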
2141 ram_state_reset(*rsp);
2143 return 0;
2146 static void ram_list_init_bitmaps(void)
2148 RAMBlock *block;
2149 unsigned long pages;
2151 /* Skip setting bitmap if there is no RAM */
2152 if (ram_bytes_total()) {
2153 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2154 pages = block->max_length >> TARGET_PAGE_BITS;
2155 block->bmap = bitmap_new(pages);
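/* Mark every page dirty so the first migration pass sends all of RAM */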
2156 bitmap_set(block->bmap, 0, pages);
2157 if (migrate_postcopy_ram()) {
2158 block->unsentmap = bitmap_new(pages);
2159 bitmap_set(block->unsentmap, 0, pages);
2165 static void ram_init_bitmaps(RAMState *rs)
2167 /* For memory_global_dirty_log_start below. */
2168 qemu_mutex_lock_iothread();
2169 qemu_mutex_lock_ramlist();
2170 rcu_read_lock();
2172 ram_list_init_bitmaps();
2173 memory_global_dirty_log_start();
2174 migration_bitmap_sync(rs);
2176 rcu_read_unlock();
2177 qemu_mutex_unlock_ramlist();
2178 qemu_mutex_unlock_iothread();
2181 static int ram_init_all(RAMState **rsp)
2183 if (ram_state_init(rsp)) {
2184 return -1;
2187 if (xbzrle_init()) {
2188 ram_state_cleanup(rsp);
2189 return -1;
2192 ram_init_bitmaps(*rsp);
2194 return 0;
2198 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2199 * a long-running RCU critical section. When RCU reclaims in the code
2200 * start to become numerous it will be necessary to reduce the
2201 * granularity of these critical sections.
2205 * ram_save_setup: Setup RAM for migration
2207 * Returns zero to indicate success and negative for error
2209 * @f: QEMUFile where to send the data
2210 * @opaque: RAMState pointer
2212 static int ram_save_setup(QEMUFile *f, void *opaque)
2214 RAMState **rsp = opaque;
2215 RAMBlock *block;
2217 /* migration has already set up the bitmap, reuse it. */
2218 if (!migration_in_colo_state()) {
2219 if (ram_init_all(rsp) != 0) {
2220 return -1;
2223 (*rsp)->f = f;
2225 rcu_read_lock();
2227 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2229 RAMBLOCK_FOREACH(block) {
2230 qemu_put_byte(f, strlen(block->idstr));
2231 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2232 qemu_put_be64(f, block->used_length);
2233 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
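/* The page size is only sent for postcopy-capable migration of blocks
 * with a non-host page size; ram_load() reads it back under the
 * matching condition. */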
2234 qemu_put_be64(f, block->page_size);
2238 rcu_read_unlock();
2239 compress_threads_save_setup();
2241 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2242 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2244 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2246 return 0;
2250 * ram_save_iterate: iterative stage for migration
2252 * Returns zero to indicate success and negative for error
2254 * @f: QEMUFile where to send the data
2255 * @opaque: RAMState pointer
2257 static int ram_save_iterate(QEMUFile *f, void *opaque)
2259 RAMState **temp = opaque;
2260 RAMState *rs = *temp;
2261 int ret;
2262 int i;
2263 int64_t t0;
2264 int done = 0;
2266 if (blk_mig_bulk_active()) {
2267 /* Avoid transferring RAM during the bulk phase of block migration, as
2268 * the bulk phase will usually take a long time and transferring
2269 * RAM updates during that time is pointless. */
2270 goto out;
2273 rcu_read_lock();
2274 if (ram_list.version != rs->last_version) {
2275 ram_state_reset(rs);
2278 /* Read version before ram_list.blocks */
2279 smp_rmb();
2281 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2283 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2284 i = 0;
2285 while ((ret = qemu_file_rate_limit(f)) == 0) {
2286 int pages;
2288 pages = ram_find_and_save_block(rs, false);
2289 /* no more pages to send */
2290 if (pages == 0) {
2291 done = 1;
2292 break;
2294 rs->iterations++;
2296 /* We want to check in the first loop iteration, just in case it was
2297 the first time and we had to sync the dirty bitmap.
2298 qemu_clock_get_ns() is a bit expensive, so we only check every few
2299 iterations
2301 if ((i & 63) == 0) {
2302 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2303 if (t1 > MAX_WAIT) {
2304 trace_ram_save_iterate_big_wait(t1, i);
2305 break;
2308 i++;
2310 flush_compressed_data(rs);
2311 rcu_read_unlock();
2314 * Must occur before EOS (or any QEMUFile operation)
2315 * because of RDMA protocol.
2317 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2319 out:
2320 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2321 ram_counters.transferred += 8;
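/* account for the 8-byte RAM_SAVE_FLAG_EOS marker written above */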
2323 ret = qemu_file_get_error(f);
2324 if (ret < 0) {
2325 return ret;
2328 return done;
2332 * ram_save_complete: function called to send the remaining amount of ram
2334 * Returns zero to indicate success
2336 * Called with iothread lock
2338 * @f: QEMUFile where to send the data
2339 * @opaque: RAMState pointer
2341 static int ram_save_complete(QEMUFile *f, void *opaque)
2343 RAMState **temp = opaque;
2344 RAMState *rs = *temp;
2346 rcu_read_lock();
2348 if (!migration_in_postcopy()) {
2349 migration_bitmap_sync(rs);
2352 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2354 /* try transferring iterative blocks of memory */
2356 /* flush all remaining blocks regardless of rate limiting */
2357 while (true) {
2358 int pages;
2360 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2361 /* no more blocks to send */
2362 if (pages == 0) {
2363 break;
2367 flush_compressed_data(rs);
2368 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2370 rcu_read_unlock();
2372 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2374 return 0;
2377 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2378 uint64_t *res_precopy_only,
2379 uint64_t *res_compatible,
2380 uint64_t *res_postcopy_only)
2382 RAMState **temp = opaque;
2383 RAMState *rs = *temp;
2384 uint64_t remaining_size;
2386 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2388 if (!migration_in_postcopy() &&
2389 remaining_size < max_size) {
2390 qemu_mutex_lock_iothread();
2391 rcu_read_lock();
2392 migration_bitmap_sync(rs);
2393 rcu_read_unlock();
2394 qemu_mutex_unlock_iothread();
2395 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
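/* RAM that can still be sent after the postcopy switchover counts as
 * "compatible"; without postcopy it must all be sent during precopy. */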
2398 if (migrate_postcopy_ram()) {
2399 /* We can do postcopy, and all the data is postcopiable */
2400 *res_compatible += remaining_size;
2401 } else {
2402 *res_precopy_only += remaining_size;
2406 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2408 unsigned int xh_len;
2409 int xh_flags;
2410 uint8_t *loaded_data;
2412 /* extract RLE header */
2413 xh_flags = qemu_get_byte(f);
2414 xh_len = qemu_get_be16(f);
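/* wire format: 1 byte of flags followed by a 2-byte big-endian length */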
2416 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2417 error_report("Failed to load XBZRLE page - wrong compression!");
2418 return -1;
2421 if (xh_len > TARGET_PAGE_SIZE) {
2422 error_report("Failed to load XBZRLE page - len overflow!");
2423 return -1;
2425 loaded_data = XBZRLE.decoded_buf;
2426 /* load data and decode */
2427 /* it can change loaded_data to point to an internal buffer */
2428 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2430 /* decode RLE */
2431 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2432 TARGET_PAGE_SIZE) == -1) {
2433 error_report("Failed to load XBZRLE page - decode error!");
2434 return -1;
2437 return 0;
2441 * ram_block_from_stream: read a RAMBlock id from the migration stream
2443 * Must be called from within an RCU critical section.
2445 * Returns a pointer from within the RCU-protected ram_list.
2447 * @f: QEMUFile where to read the data from
2448 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2450 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2452 static RAMBlock *block = NULL;
2453 char id[256];
2454 uint8_t len;
2456 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2457 if (!block) {
2458 error_report("Ack, bad migration stream!");
2459 return NULL;
2461 return block;
2464 len = qemu_get_byte(f);
2465 qemu_get_buffer(f, (uint8_t *)id, len);
2466 id[len] = 0;
2468 block = qemu_ram_block_by_name(id);
2469 if (!block) {
2470 error_report("Can't find block %s", id);
2471 return NULL;
2474 return block;
2477 static inline void *host_from_ram_block_offset(RAMBlock *block,
2478 ram_addr_t offset)
2480 if (!offset_in_ramblock(block, offset)) {
2481 return NULL;
2484 return block->host + offset;
2488 * ram_handle_compressed: handle the zero page case
2490 * If a page (or a whole RDMA chunk) has been
2491 * determined to be zero, then zap it.
2493 * @host: host address for the zero page
2494 * @ch: the byte the page is filled with; only zero is supported
2495 * @size: size of the zero page
2497 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2499 if (ch != 0 || !is_zero_range(host, size)) {
2500 memset(host, ch, size);
2504 static void *do_data_decompress(void *opaque)
2506 DecompressParam *param = opaque;
2507 unsigned long pagesize;
2508 uint8_t *des;
2509 int len;
2511 qemu_mutex_lock(&param->mutex);
2512 while (!param->quit) {
2513 if (param->des) {
2514 des = param->des;
2515 len = param->len;
2516 param->des = 0;
2517 qemu_mutex_unlock(&param->mutex);
2519 pagesize = TARGET_PAGE_SIZE;
2520 /* uncompress() may fail in some cases, especially when the
2521 * page is dirtied while it is being compressed. That's not a
2522 * problem because the dirty page will be retransmitted and
2523 * uncompress() won't corrupt the data in other pages.
2525 uncompress((Bytef *)des, &pagesize,
2526 (const Bytef *)param->compbuf, len);
2528 qemu_mutex_lock(&decomp_done_lock);
2529 param->done = true;
2530 qemu_cond_signal(&decomp_done_cond);
2531 qemu_mutex_unlock(&decomp_done_lock);
2533 qemu_mutex_lock(&param->mutex);
2534 } else {
2535 qemu_cond_wait(&param->cond, &param->mutex);
2538 qemu_mutex_unlock(&param->mutex);
2540 return NULL;
2543 static void wait_for_decompress_done(void)
2545 int idx, thread_count;
2547 if (!migrate_use_compression()) {
2548 return;
2551 thread_count = migrate_decompress_threads();
2552 qemu_mutex_lock(&decomp_done_lock);
2553 for (idx = 0; idx < thread_count; idx++) {
2554 while (!decomp_param[idx].done) {
2555 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2558 qemu_mutex_unlock(&decomp_done_lock);
2561 static void compress_threads_load_setup(void)
2563 int i, thread_count;
2565 if (!migrate_use_compression()) {
2566 return;
2568 thread_count = migrate_decompress_threads();
2569 decompress_threads = g_new0(QemuThread, thread_count);
2570 decomp_param = g_new0(DecompressParam, thread_count);
2571 qemu_mutex_init(&decomp_done_lock);
2572 qemu_cond_init(&decomp_done_cond);
2573 for (i = 0; i < thread_count; i++) {
2574 qemu_mutex_init(&decomp_param[i].mutex);
2575 qemu_cond_init(&decomp_param[i].cond);
2576 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
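/* threads start idle: done == true means this slot can accept work */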
2577 decomp_param[i].done = true;
2578 decomp_param[i].quit = false;
2579 qemu_thread_create(decompress_threads + i, "decompress",
2580 do_data_decompress, decomp_param + i,
2581 QEMU_THREAD_JOINABLE);
2585 static void compress_threads_load_cleanup(void)
2587 int i, thread_count;
2589 if (!migrate_use_compression()) {
2590 return;
2592 thread_count = migrate_decompress_threads();
2593 for (i = 0; i < thread_count; i++) {
2594 qemu_mutex_lock(&decomp_param[i].mutex);
2595 decomp_param[i].quit = true;
2596 qemu_cond_signal(&decomp_param[i].cond);
2597 qemu_mutex_unlock(&decomp_param[i].mutex);
2599 for (i = 0; i < thread_count; i++) {
2600 qemu_thread_join(decompress_threads + i);
2601 qemu_mutex_destroy(&decomp_param[i].mutex);
2602 qemu_cond_destroy(&decomp_param[i].cond);
2603 g_free(decomp_param[i].compbuf);
2605 g_free(decompress_threads);
2606 g_free(decomp_param);
2607 decompress_threads = NULL;
2608 decomp_param = NULL;
2611 static void decompress_data_with_multi_threads(QEMUFile *f,
2612 void *host, int len)
2614 int idx, thread_count;
2616 thread_count = migrate_decompress_threads();
2617 qemu_mutex_lock(&decomp_done_lock);
2618 while (true) {
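/* Scan for an idle decompression thread; if none is free, wait for one
 * to signal decomp_done_cond and then rescan. */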
2619 for (idx = 0; idx < thread_count; idx++) {
2620 if (decomp_param[idx].done) {
2621 decomp_param[idx].done = false;
2622 qemu_mutex_lock(&decomp_param[idx].mutex);
2623 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2624 decomp_param[idx].des = host;
2625 decomp_param[idx].len = len;
2626 qemu_cond_signal(&decomp_param[idx].cond);
2627 qemu_mutex_unlock(&decomp_param[idx].mutex);
2628 break;
2631 if (idx < thread_count) {
2632 break;
2633 } else {
2634 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2637 qemu_mutex_unlock(&decomp_done_lock);
2641 * ram_load_setup: Setup RAM for migration incoming side
2643 * Returns zero to indicate success and negative for error
2645 * @f: QEMUFile where to receive the data
2646 * @opaque: RAMState pointer
2648 static int ram_load_setup(QEMUFile *f, void *opaque)
2650 xbzrle_load_setup();
2651 compress_threads_load_setup();
2652 ramblock_recv_map_init();
2653 return 0;
2656 static int ram_load_cleanup(void *opaque)
2658 RAMBlock *rb;
2659 xbzrle_load_cleanup();
2660 compress_threads_load_cleanup();
2662 RAMBLOCK_FOREACH(rb) {
2663 g_free(rb->receivedmap);
2664 rb->receivedmap = NULL;
2666 return 0;
2670 * ram_postcopy_incoming_init: allocate postcopy data structures
2672 * Returns 0 for success and negative on error
2674 * @mis: current migration incoming state
2676 * Allocate data structures etc needed by incoming migration with
2677 * postcopy-ram. postcopy-ram's similarly named
2678 * postcopy_ram_incoming_init does the work.
2680 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2682 unsigned long ram_pages = last_ram_page();
2684 return postcopy_ram_incoming_init(mis, ram_pages);
2688 * ram_load_postcopy: load a page in postcopy case
2690 * Returns 0 for success or -errno in case of error
2692 * Called in postcopy mode by ram_load().
2693 * rcu_read_lock is taken prior to this being called.
2695 * @f: QEMUFile where to receive the data
2697 static int ram_load_postcopy(QEMUFile *f)
2699 int flags = 0, ret = 0;
2700 bool place_needed = false;
2701 bool matching_page_sizes = false;
2702 MigrationIncomingState *mis = migration_incoming_get_current();
2703 /* Temporary page that is later 'placed' */
2704 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2705 void *last_host = NULL;
2706 bool all_zero = false;
2708 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2709 ram_addr_t addr;
2710 void *host = NULL;
2711 void *page_buffer = NULL;
2712 void *place_source = NULL;
2713 RAMBlock *block = NULL;
2714 uint8_t ch;
2716 addr = qemu_get_be64(f);
2719 * If there is a QEMUFile error, we should stop here; in that
2720 * case "addr" may be invalid
2722 ret = qemu_file_get_error(f);
2723 if (ret) {
2724 break;
2727 flags = addr & ~TARGET_PAGE_MASK;
2728 addr &= TARGET_PAGE_MASK;
2730 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2731 place_needed = false;
2732 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2733 block = ram_block_from_stream(f, flags);
2735 host = host_from_ram_block_offset(block, addr);
2736 if (!host) {
2737 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2738 ret = -EINVAL;
2739 break;
2741 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2743 * Postcopy requires that we place whole host pages atomically;
2744 * these may be huge pages for RAMBlocks that are backed by
2745 * hugetlbfs.
2746 * To make it atomic, the data is read into a temporary page
2747 * that's moved into place later.
2748 * The migration protocol uses, possibly smaller, target pages;
2749 * however, the source ensures it always sends all the components
2750 * of a host page in order.
2752 page_buffer = postcopy_host_page +
2753 ((uintptr_t)host & (block->page_size - 1));
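/* page_buffer now points at this target page's offset within the
 * temporary host page */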
2754 /* If all target pages are zero then we can optimise the placement */
2755 if (!((uintptr_t)host & (block->page_size - 1))) {
2756 all_zero = true;
2757 } else {
2758 /* not the first target page within the host page */
2759 if (host != (last_host + TARGET_PAGE_SIZE)) {
2760 error_report("Non-sequential target page %p/%p",
2761 host, last_host);
2762 ret = -EINVAL;
2763 break;
2769 * If it's the last part of a host page then we place the host
2770 * page
2772 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2773 (block->page_size - 1)) == 0;
2774 place_source = postcopy_host_page;
2776 last_host = host;
2778 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2779 case RAM_SAVE_FLAG_ZERO:
2780 ch = qemu_get_byte(f);
2781 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2782 if (ch) {
2783 all_zero = false;
2785 break;
2787 case RAM_SAVE_FLAG_PAGE:
2788 all_zero = false;
2789 if (!place_needed || !matching_page_sizes) {
2790 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2791 } else {
2792 /* Avoid the extra qemu_file copy during postcopy, since the data
2793 * will be copied again when the page is placed; we can only do
2794 * this when the read is done in one go (matching page sizes)
2796 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2797 TARGET_PAGE_SIZE);
2799 break;
2800 case RAM_SAVE_FLAG_EOS:
2801 /* normal exit */
2802 break;
2803 default:
2804 error_report("Unknown combination of migration flags: %#x"
2805 " (postcopy mode)", flags);
2806 ret = -EINVAL;
2807 break;
2810 /* Detect any possible file errors */
2811 if (!ret && qemu_file_get_error(f)) {
2812 ret = qemu_file_get_error(f);
2815 if (!ret && place_needed) {
2816 /* This gets called at the last target page in the host page */
2817 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
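/* host was the last target page of this host page, so this steps back
 * to the host-page start */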
2819 if (all_zero) {
2820 ret = postcopy_place_page_zero(mis, place_dest,
2821 block);
2822 } else {
2823 ret = postcopy_place_page(mis, place_dest,
2824 place_source, block);
2829 return ret;
2832 static bool postcopy_is_advised(void)
2834 PostcopyState ps = postcopy_state_get();
2835 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2838 static bool postcopy_is_running(void)
2840 PostcopyState ps = postcopy_state_get();
2841 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2844 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2846 int flags = 0, ret = 0, invalid_flags = 0;
2847 static uint64_t seq_iter;
2848 int len = 0;
2850 * If the system is running in postcopy mode, page inserts into host memory
2851 * must be atomic
2853 bool postcopy_running = postcopy_is_running();
2854 /* ADVISE comes earlier; it shows that the source has the postcopy capability enabled */
2855 bool postcopy_advised = postcopy_is_advised();
2857 seq_iter++;
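/* Only stream version 4 is supported (the version registered in ram_mig_init) */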
2859 if (version_id != 4) {
2860 ret = -EINVAL;
2863 if (!migrate_use_compression()) {
2864 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2866 /* This RCU critical section can be very long running.
2867 * When RCU reclaims in the code start to become numerous,
2868 * it will be necessary to reduce the granularity of this
2869 * critical section.
2871 rcu_read_lock();
2873 if (postcopy_running) {
2874 ret = ram_load_postcopy(f);
2877 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2878 ram_addr_t addr, total_ram_bytes;
2879 void *host = NULL;
2880 uint8_t ch;
2882 addr = qemu_get_be64(f);
2883 flags = addr & ~TARGET_PAGE_MASK;
2884 addr &= TARGET_PAGE_MASK;
2886 if (flags & invalid_flags) {
2887 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2888 error_report("Received an unexpected compressed page");
2891 ret = -EINVAL;
2892 break;
2895 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2896 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2897 RAMBlock *block = ram_block_from_stream(f, flags);
2899 host = host_from_ram_block_offset(block, addr);
2900 if (!host) {
2901 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2902 ret = -EINVAL;
2903 break;
2905 ramblock_recv_bitmap_set(block, host);
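/* mark the page as received; ram_discard_range() clears these bits again
 * for ranges that get discarded */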
2906 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2909 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2910 case RAM_SAVE_FLAG_MEM_SIZE:
2911 /* Synchronize RAM block list */
2912 total_ram_bytes = addr;
2913 while (!ret && total_ram_bytes) {
2914 RAMBlock *block;
2915 char id[256];
2916 ram_addr_t length;
2918 len = qemu_get_byte(f);
2919 qemu_get_buffer(f, (uint8_t *)id, len);
2920 id[len] = 0;
2921 length = qemu_get_be64(f);
2923 block = qemu_ram_block_by_name(id);
2924 if (block) {
2925 if (length != block->used_length) {
2926 Error *local_err = NULL;
2928 ret = qemu_ram_resize(block, length,
2929 &local_err);
2930 if (local_err) {
2931 error_report_err(local_err);
2934 /* For postcopy we need to check hugepage sizes match */
2935 if (postcopy_advised &&
2936 block->page_size != qemu_host_page_size) {
2937 uint64_t remote_page_size = qemu_get_be64(f);
2938 if (remote_page_size != block->page_size) {
2939 error_report("Mismatched RAM page size %s "
2940 "(local) %zd != %" PRId64,
2941 id, block->page_size,
2942 remote_page_size);
2943 ret = -EINVAL;
2946 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2947 block->idstr);
2948 } else {
2949 error_report("Unknown ramblock \"%s\", cannot "
2950 "accept migration", id);
2951 ret = -EINVAL;
2954 total_ram_bytes -= length;
2956 break;
2958 case RAM_SAVE_FLAG_ZERO:
2959 ch = qemu_get_byte(f);
2960 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2961 break;
2963 case RAM_SAVE_FLAG_PAGE:
2964 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2965 break;
2967 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2968 len = qemu_get_be32(f);
2969 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2970 error_report("Invalid compressed data length: %d", len);
2971 ret = -EINVAL;
2972 break;
2974 decompress_data_with_multi_threads(f, host, len);
2975 break;
2977 case RAM_SAVE_FLAG_XBZRLE:
2978 if (load_xbzrle(f, addr, host) < 0) {
2979 error_report("Failed to decompress XBZRLE page at "
2980 RAM_ADDR_FMT, addr);
2981 ret = -EINVAL;
2982 break;
2984 break;
2985 case RAM_SAVE_FLAG_EOS:
2986 /* normal exit */
2987 break;
2988 default:
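/* The only remaining valid flag is RAM_SAVE_FLAG_HOOK, used by the RDMA
 * transport; anything else is a stream error. */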
2989 if (flags & RAM_SAVE_FLAG_HOOK) {
2990 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2991 } else {
2992 error_report("Unknown combination of migration flags: %#x",
2993 flags);
2994 ret = -EINVAL;
2997 if (!ret) {
2998 ret = qemu_file_get_error(f);
3002 wait_for_decompress_done();
3003 rcu_read_unlock();
3004 trace_ram_load_complete(ret, seq_iter);
3005 return ret;
3008 static bool ram_has_postcopy(void *opaque)
3010 return migrate_postcopy_ram();
3013 static SaveVMHandlers savevm_ram_handlers = {
3014 .save_setup = ram_save_setup,
3015 .save_live_iterate = ram_save_iterate,
3016 .save_live_complete_postcopy = ram_save_complete,
3017 .save_live_complete_precopy = ram_save_complete,
3018 .has_postcopy = ram_has_postcopy,
3019 .save_live_pending = ram_save_pending,
3020 .load_state = ram_load,
3021 .save_cleanup = ram_save_cleanup,
3022 .load_setup = ram_load_setup,
3023 .load_cleanup = ram_load_cleanup,
3026 void ram_mig_init(void)
3028 qemu_mutex_init(&XBZRLE.lock);
3029 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);