migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "migration/page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/error.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "migration/block.h"
  54
  55 /***********************************************************/
  56 /* ram save/restore */
  57
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  63
  64 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  65 #define RAM_SAVE_FLAG_ZERO     0x02
  66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  67 #define RAM_SAVE_FLAG_PAGE     0x08
  68 #define RAM_SAVE_FLAG_EOS      0x10
  69 #define RAM_SAVE_FLAG_CONTINUE 0x20
  70 #define RAM_SAVE_FLAG_XBZRLE   0x40
  71 /* 0x80 is reserved in migration.h start with 0x100 next */
  72 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
/* XBZRLE statistics; this file bumps cache_miss, overflow, pages and bytes
 * and derives cache_miss_rate from them. */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer) */
    uint8_t *encoded_buf;
    /* buffer for storing page content: a private copy of the page being
     * encoded, taken before it is compared against the cached version */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
  96
  97 static void XBZRLE_cache_lock(void)
  98 {
  99     if (migrate_use_xbzrle())
 100         qemu_mutex_lock(&XBZRLE.lock);
 101 }
 102
 103 static void XBZRLE_cache_unlock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_unlock(&XBZRLE.lock);
 107 }
 108
 109 /**
 110  * xbzrle_cache_resize: resize the xbzrle cache
 111  *
 112  * This function is called from qmp_migrate_set_cache_size in main
 113  * thread, possibly while a migration is in progress.  A running
 114  * migration may be using the cache and might finish during this call,
 115  * hence changes to the cache are protected by XBZRLE.lock().
 116  *
 117  * Returns 0 for success or -1 for error
 118  *
 119  * @new_size: new cache size
 120  * @errp: set *errp if the check failed, with reason
 121  */
 122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 123 {
 124     PageCache *new_cache;
 125     int64_t ret = 0;
 126
 127     /* Check for truncation */
 128     if (new_size != (size_t)new_size) {
 129         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 130                    "exceeding address space");
 131         return -1;
 132     }
 133
 134     if (new_size == migrate_xbzrle_cache_size()) {
 135         /* nothing to do */
 136         return 0;
 137     }
 138
 139     XBZRLE_cache_lock();
 140
 141     if (XBZRLE.cache != NULL) {
 142         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 143         if (!new_cache) {
 144             ret = -1;
 145             goto out;
 146         }
 147
 148         cache_fini(XBZRLE.cache);
 149         XBZRLE.cache = new_cache;
 150     }
 151 out:
 152     XBZRLE_cache_unlock();
 153     return ret;
 154 }
 155
 156 static void ramblock_recv_map_init(void)
 157 {
 158     RAMBlock *rb;
 159
 160     RAMBLOCK_FOREACH(rb) {
 161         assert(!rb->receivedmap);
 162         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 163     }
 164 }
 165
 166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 167 {
 168     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 169                     rb->receivedmap);
 170 }
 171
 172 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 173 {
 174     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 175 }
 176
 177 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 178                                     size_t nr)
 179 {
 180     bitmap_set_atomic(rb->receivedmap,
 181                       ramblock_recv_bitmap_offset(host_addr, rb),
 182                       nr);
 183 }
 184
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* start of the requested range (presumably relative to @rb - confirm) */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* linkage for the RAMState.src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 196
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses it
     * to decide whether the block name must be written again */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times the dirty rate was found too high; throttling is
     * started or increased when this reaches 2, then it is reset */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync (QEMU_CLOCK_REALTIME, in ms) */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at the start of the current period */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since the start of the current period */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state; may be NULL (ram_bytes_remaining() then
 * reports 0 bytes). */
static RAMState *ram_state;
 239
 240 uint64_t ram_bytes_remaining(void)
 241 {
 242     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 243                        0;
 244 }
 245
/* RAM migration statistics (transferred bytes, normal/duplicate page
 * counts, dirty sync count, dirty page rate) updated throughout this file */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (a page index, not a byte offset) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 258
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set to true by the worker, under comp_done_lock, once the requested
     * page has been compressed; initialised to true at setup */
    bool done;
    /* set to true to ask the worker to leave its loop */
    bool quit;
    /* dummy QEMUFile (empty ops) used as a buffer for the compressed data */
    QEMUFile *file;
    /* protects quit, block and offset; paired with cond */
    QemuMutex mutex;
    /* the worker waits here while it has no work */
    QemuCond cond;
    /* block of the page to compress; NULL when no request is pending */
    RAMBlock *block;
    /* offset of the page to compress inside block */
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 269
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using these fields is outside this part of the
 * file; the field notes below are inferred from the names and from the
 * parallel CompressParam - confirm against the decompression code.
 */
struct DecompressParam {
    /* presumably: the worker has finished its current request */
    bool done;
    /* presumably: ask the worker to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination for the decompressed data */
    void *des;
    /* presumably: buffer holding the compressed input */
    uint8_t *compbuf;
    /* presumably: number of valid bytes in compbuf */
    int len;
};
typedef struct DecompressParam DecompressParam;
 280
/* Array of per-thread compression state, one entry per compression thread;
 * allocated in compress_threads_save_setup() */
static CompressParam *comp_param;
/* Array of compression thread handles, parallel to comp_param */
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the variables above.
 * NOTE(review): their users are outside this part of the file. */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: called by do_data_compress() below, defined later */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 299
/*
 * Thread body of one compression worker.
 *
 * @opaque: the CompressParam of this worker
 *
 * Loops until param->quit is set.  Whenever a request is pending
 * (param->block is non-NULL) it takes the request, compresses the page
 * with do_compress_ram_page() into param->file, then sets param->done
 * and signals comp_done_cond.  With no request pending it sleeps on
 * param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* take a private copy of the request and mark it consumed
             * while still holding param->mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* do the compression itself without holding param->mutex */
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            /* completion is published under comp_done_lock, not
             * param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* re-take param->mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 330
 331 static inline void terminate_compression_threads(void)
 332 {
 333     int idx, thread_count;
 334
 335     thread_count = migrate_compress_threads();
 336
 337     for (idx = 0; idx < thread_count; idx++) {
 338         qemu_mutex_lock(&comp_param[idx].mutex);
 339         comp_param[idx].quit = true;
 340         qemu_cond_signal(&comp_param[idx].cond);
 341         qemu_mutex_unlock(&comp_param[idx].mutex);
 342     }
 343 }
 344
 345 static void compress_threads_save_cleanup(void)
 346 {
 347     int i, thread_count;
 348
 349     if (!migrate_use_compression()) {
 350         return;
 351     }
 352     terminate_compression_threads();
 353     thread_count = migrate_compress_threads();
 354     for (i = 0; i < thread_count; i++) {
 355         qemu_thread_join(compress_threads + i);
 356         qemu_fclose(comp_param[i].file);
 357         qemu_mutex_destroy(&comp_param[i].mutex);
 358         qemu_cond_destroy(&comp_param[i].cond);
 359     }
 360     qemu_mutex_destroy(&comp_done_lock);
 361     qemu_cond_destroy(&comp_done_cond);
 362     g_free(compress_threads);
 363     g_free(comp_param);
 364     compress_threads = NULL;
 365     comp_param = NULL;
 366 }
 367
 368 static void compress_threads_save_setup(void)
 369 {
 370     int i, thread_count;
 371
 372     if (!migrate_use_compression()) {
 373         return;
 374     }
 375     thread_count = migrate_compress_threads();
 376     compress_threads = g_new0(QemuThread, thread_count);
 377     comp_param = g_new0(CompressParam, thread_count);
 378     qemu_cond_init(&comp_done_cond);
 379     qemu_mutex_init(&comp_done_lock);
 380     for (i = 0; i < thread_count; i++) {
 381         /* comp_param[i].file is just used as a dummy buffer to save data,
 382          * set its ops to empty.
 383          */
 384         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 385         comp_param[i].done = true;
 386         comp_param[i].quit = false;
 387         qemu_mutex_init(&comp_param[i].mutex);
 388         qemu_cond_init(&comp_param[i].cond);
 389         qemu_thread_create(compress_threads + i, "compress",
 390                            do_data_compress, comp_param + i,
 391                            QEMU_THREAD_JOINABLE);
 392     }
 393 }
 394
/* Multiple fd's */

/* Per-channel state of one multifd send thread */
struct MultiFDSendParams {
    /* channel index, assigned at setup */
    uint8_t id;
    /* thread name ("multifdsend_<id>"); allocated at setup, freed at cleanup */
    char *name;
    QemuThread thread;
    /* the thread sleeps on this semaphore between checks of quit */
    QemuSemaphore sem;
    /* protects quit */
    QemuMutex mutex;
    /* set to true to ask the thread to exit */
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;

/* Global multifd send state; allocated by multifd_save_setup() and
 * released (and reset to NULL) by multifd_save_cleanup() */
struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;
 412
 413 static void terminate_multifd_send_threads(Error *errp)
 414 {
 415     int i;
 416
 417     for (i = 0; i < multifd_send_state->count; i++) {
 418         MultiFDSendParams *p = &multifd_send_state->params[i];
 419
 420         qemu_mutex_lock(&p->mutex);
 421         p->quit = true;
 422         qemu_sem_post(&p->sem);
 423         qemu_mutex_unlock(&p->mutex);
 424     }
 425 }
 426
 427 int multifd_save_cleanup(Error **errp)
 428 {
 429     int i;
 430     int ret = 0;
 431
 432     if (!migrate_use_multifd()) {
 433         return 0;
 434     }
 435     terminate_multifd_send_threads(NULL);
 436     for (i = 0; i < multifd_send_state->count; i++) {
 437         MultiFDSendParams *p = &multifd_send_state->params[i];
 438
 439         qemu_thread_join(&p->thread);
 440         qemu_mutex_destroy(&p->mutex);
 441         qemu_sem_destroy(&p->sem);
 442         g_free(p->name);
 443         p->name = NULL;
 444     }
 445     g_free(multifd_send_state->params);
 446     multifd_send_state->params = NULL;
 447     g_free(multifd_send_state);
 448     multifd_send_state = NULL;
 449     return ret;
 450 }
 451
 452 static void *multifd_send_thread(void *opaque)
 453 {
 454     MultiFDSendParams *p = opaque;
 455
 456     while (true) {
 457         qemu_mutex_lock(&p->mutex);
 458         if (p->quit) {
 459             qemu_mutex_unlock(&p->mutex);
 460             break;
 461         }
 462         qemu_mutex_unlock(&p->mutex);
 463         qemu_sem_wait(&p->sem);
 464     }
 465
 466     return NULL;
 467 }
 468
 469 int multifd_save_setup(void)
 470 {
 471     int thread_count;
 472     uint8_t i;
 473
 474     if (!migrate_use_multifd()) {
 475         return 0;
 476     }
 477     thread_count = migrate_multifd_channels();
 478     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 479     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 480     multifd_send_state->count = 0;
 481     for (i = 0; i < thread_count; i++) {
 482         MultiFDSendParams *p = &multifd_send_state->params[i];
 483
 484         qemu_mutex_init(&p->mutex);
 485         qemu_sem_init(&p->sem, 0);
 486         p->quit = false;
 487         p->id = i;
 488         p->name = g_strdup_printf("multifdsend_%d", i);
 489         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 490                            QEMU_THREAD_JOINABLE);
 491
 492         multifd_send_state->count++;
 493     }
 494     return 0;
 495 }
 496
/* Per-channel state of one multifd receive thread */
struct MultiFDRecvParams {
    /* channel index, assigned at setup */
    uint8_t id;
    /* thread name ("multifdrecv_<id>"); allocated at setup, freed at cleanup */
    char *name;
    QemuThread thread;
    /* the thread sleeps on this semaphore between checks of quit */
    QemuSemaphore sem;
    /* protects quit */
    QemuMutex mutex;
    /* set to true to ask the thread to exit */
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;

/* Global multifd receive state; allocated by multifd_load_setup() and
 * released (and reset to NULL) by multifd_load_cleanup() */
struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;
 512
 513 static void terminate_multifd_recv_threads(Error *errp)
 514 {
 515     int i;
 516
 517     for (i = 0; i < multifd_recv_state->count; i++) {
 518         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 519
 520         qemu_mutex_lock(&p->mutex);
 521         p->quit = true;
 522         qemu_sem_post(&p->sem);
 523         qemu_mutex_unlock(&p->mutex);
 524     }
 525 }
 526
 527 int multifd_load_cleanup(Error **errp)
 528 {
 529     int i;
 530     int ret = 0;
 531
 532     if (!migrate_use_multifd()) {
 533         return 0;
 534     }
 535     terminate_multifd_recv_threads(NULL);
 536     for (i = 0; i < multifd_recv_state->count; i++) {
 537         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 538
 539         qemu_thread_join(&p->thread);
 540         qemu_mutex_destroy(&p->mutex);
 541         qemu_sem_destroy(&p->sem);
 542         g_free(p->name);
 543         p->name = NULL;
 544     }
 545     g_free(multifd_recv_state->params);
 546     multifd_recv_state->params = NULL;
 547     g_free(multifd_recv_state);
 548     multifd_recv_state = NULL;
 549
 550     return ret;
 551 }
 552
 553 static void *multifd_recv_thread(void *opaque)
 554 {
 555     MultiFDRecvParams *p = opaque;
 556
 557     while (true) {
 558         qemu_mutex_lock(&p->mutex);
 559         if (p->quit) {
 560             qemu_mutex_unlock(&p->mutex);
 561             break;
 562         }
 563         qemu_mutex_unlock(&p->mutex);
 564         qemu_sem_wait(&p->sem);
 565     }
 566
 567     return NULL;
 568 }
 569
 570 int multifd_load_setup(void)
 571 {
 572     int thread_count;
 573     uint8_t i;
 574
 575     if (!migrate_use_multifd()) {
 576         return 0;
 577     }
 578     thread_count = migrate_multifd_channels();
 579     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 580     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 581     multifd_recv_state->count = 0;
 582     for (i = 0; i < thread_count; i++) {
 583         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 584
 585         qemu_mutex_init(&p->mutex);
 586         qemu_sem_init(&p->sem, 0);
 587         p->quit = false;
 588         p->id = i;
 589         p->name = g_strdup_printf("multifdrecv_%d", i);
 590         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 591                            QEMU_THREAD_JOINABLE);
 592         multifd_recv_state->count++;
 593     }
 594     return 0;
 595 }
 596
 597 /**
 598  * save_page_header: write page header to wire
 599  *
 600  * If this is the 1st block, it also writes the block identification
 601  *
 602  * Returns the number of bytes written
 603  *
 604  * @f: QEMUFile where to send the data
 605  * @block: block that contains the page we want to send
 606  * @offset: offset inside the block for the page
 607  *          in the lower bits, it contains flags
 608  */
 609 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 610                                ram_addr_t offset)
 611 {
 612     size_t size, len;
 613
 614     if (block == rs->last_sent_block) {
 615         offset |= RAM_SAVE_FLAG_CONTINUE;
 616     }
 617     qemu_put_be64(f, offset);
 618     size = 8;
 619
 620     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 621         len = strlen(block->idstr);
 622         qemu_put_byte(f, len);
 623         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 624         size += 1 + len;
 625         rs->last_sent_block = block;
 626     }
 627     return size;
 628 }
 629
 630 /**
 631  * mig_throttle_guest_down: throotle down the guest
 632  *
 633  * Reduce amount of guest cpu execution to hopefully slow down memory
 634  * writes. If guest dirty memory rate is reduced below the rate at
 635  * which we can transfer pages to the destination then we should be
 636  * able to complete migration. Some workloads dirty memory way too
 637  * fast and will not effectively converge, even with auto-converge.
 638  */
 639 static void mig_throttle_guest_down(void)
 640 {
 641     MigrationState *s = migrate_get_current();
 642     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 643     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 644
 645     /* We have not started throttling yet. Let's start it. */
 646     if (!cpu_throttle_active()) {
 647         cpu_throttle_set(pct_initial);
 648     } else {
 649         /* Throttling already on, just increase the rate */
 650         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 651     }
 652 }
 653
 654 /**
 655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 656  *
 657  * @rs: current RAM state
 658  * @current_addr: address for the zero page
 659  *
 660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 661  * The important thing is that a stale (not-yet-0'd) page be replaced
 662  * by the new data.
 663  * As a bonus, if the page wasn't in the cache it gets added so that
 664  * when a small write is made into the 0'd page it gets XBZRLE sent.
 665  */
 666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 667 {
 668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 669         return;
 670     }
 671
 672     /* We don't care if this fails to allocate a new cache page
 673      * as long as it updated an old one */
 674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 675                  ram_counters.dirty_sync_count);
 676 }
 677
 678 #define ENCODING_FLAG_XBZRLE 0x1
 679
 680 /**
 681  * save_xbzrle_page: compress and send current page
 682  *
 683  * Returns: 1 means that we wrote the page
 684  *          0 means that page is identical to the one already sent
 685  *          -1 means that xbzrle would be longer than normal
 686  *
 687  * @rs: current RAM state
 688  * @current_data: pointer to the address of the page contents
 689  * @current_addr: addr of the page
 690  * @block: block that contains the page we want to send
 691  * @offset: offset inside the block for the page
 692  * @last_stage: if we are at the completion stage
 693  */
 694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 695                             ram_addr_t current_addr, RAMBlock *block,
 696                             ram_addr_t offset, bool last_stage)
 697 {
 698     int encoded_len = 0, bytes_xbzrle;
 699     uint8_t *prev_cached_page;
 700
 701     if (!cache_is_cached(XBZRLE.cache, current_addr,
 702                          ram_counters.dirty_sync_count)) {
 703         xbzrle_counters.cache_miss++;
 704         if (!last_stage) {
 705             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 706                              ram_counters.dirty_sync_count) == -1) {
 707                 return -1;
 708             } else {
 709                 /* update *current_data when the page has been
 710                    inserted into cache */
 711                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 712             }
 713         }
 714         return -1;
 715     }
 716
 717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 718
 719     /* save current buffer into memory */
 720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 721
 722     /* XBZRLE encoding (if there is no overflow) */
 723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 725                                        TARGET_PAGE_SIZE);
 726     if (encoded_len == 0) {
 727         trace_save_xbzrle_page_skipping();
 728         return 0;
 729     } else if (encoded_len == -1) {
 730         trace_save_xbzrle_page_overflow();
 731         xbzrle_counters.overflow++;
 732         /* update data in the cache */
 733         if (!last_stage) {
 734             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 735             *current_data = prev_cached_page;
 736         }
 737         return -1;
 738     }
 739
 740     /* we need to update the data in the cache, in order to get the same data */
 741     if (!last_stage) {
 742         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 743     }
 744
 745     /* Send XBZRLE based compressed page */
 746     bytes_xbzrle = save_page_header(rs, rs->f, block,
 747                                     offset | RAM_SAVE_FLAG_XBZRLE);
 748     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 749     qemu_put_be16(rs->f, encoded_len);
 750     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 751     bytes_xbzrle += encoded_len + 1 + 2;
 752     xbzrle_counters.pages++;
 753     xbzrle_counters.bytes += bytes_xbzrle;
 754     ram_counters.transferred += bytes_xbzrle;
 755
 756     return 1;
 757 }
 758
 759 /**
 760  * migration_bitmap_find_dirty: find the next dirty page from start
 761  *
 762  * Called with rcu_read_lock() to protect migration_bitmap
 763  *
 764  * Returns the byte offset within memory region of the start of a dirty page
 765  *
 766  * @rs: current RAM state
 767  * @rb: RAMBlock where to search for dirty pages
 768  * @start: page where we start the search
 769  */
 770 static inline
 771 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 772                                           unsigned long start)
 773 {
 774     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 775     unsigned long *bitmap = rb->bmap;
 776     unsigned long next;
 777
 778     if (rs->ram_bulk_stage && start > 0) {
 779         next = start + 1;
 780     } else {
 781         next = find_next_bit(bitmap, size, start);
 782     }
 783
 784     return next;
 785 }
 786
 787 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 788                                                 RAMBlock *rb,
 789                                                 unsigned long page)
 790 {
 791     bool ret;
 792
 793     ret = test_and_clear_bit(page, rb->bmap);
 794
 795     if (ret) {
 796         rs->migration_dirty_pages--;
 797     }
 798     return ret;
 799 }
 800
 801 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 802                                         ram_addr_t start, ram_addr_t length)
 803 {
 804     rs->migration_dirty_pages +=
 805         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 806                                               &rs->num_dirty_pages_period);
 807 }
 808
 809 /**
 810  * ram_pagesize_summary: calculate all the pagesizes of a VM
 811  *
 812  * Returns a summary bitmap of the page sizes of all RAMBlocks
 813  *
 814  * For VMs with just normal pages this is equivalent to the host page
 815  * size. If it's got some huge pages then it's the OR of all the
 816  * different page sizes.
 817  */
 818 uint64_t ram_pagesize_summary(void)
 819 {
 820     RAMBlock *block;
 821     uint64_t summary = 0;
 822
 823     RAMBLOCK_FOREACH(block) {
 824         summary |= block->page_size;
 825     }
 826
 827     return summary;
 828 }
 829
/*
 * Sync the dirty bitmaps of all RAMBlocks and refresh the period
 * statistics.
 *
 * @rs: current RAM state
 *
 * Each call bumps ram_counters.dirty_sync_count.  Once more than one
 * second has passed since the period started, the dirty page rate is
 * recomputed, auto-converge throttling is considered, the XBZRLE cache
 * miss rate is updated and the period counters are reset.  A migration
 * pass event is sent when events are enabled.
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* first call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* per-block sync runs under both the bitmap mutex and the RCU read lock */
    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        /* pages dirtied per second; the divisor is > 1000 ms here */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            /* note: the counter is only incremented when the first
             * condition holds (short-circuit evaluation) */
            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            /* skip the rate update when no iteration happened in this
             * period (would divide by zero) */
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
 903
 904 /**
 905  * save_zero_page: send the zero page to the stream
 906  *
 907  * Returns the number of pages written.
 908  *
 909  * @rs: current RAM state
 910  * @block: block that contains the page we want to send
 911  * @offset: offset inside the block for the page
 912  */
 913 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 914 {
 915     uint8_t *p = block->host + offset;
 916     int pages = -1;
 917
 918     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 919         ram_counters.duplicate++;
 920         ram_counters.transferred +=
 921             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 922         qemu_put_byte(rs->f, 0);
 923         ram_counters.transferred += 1;
 924         pages = 1;
 925     }
 926
 927     return pages;
 928 }
 929
 930 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 931 {
 932     if (!migrate_release_ram() || !migration_in_postcopy()) {
 933         return;
 934     }
 935
 936     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 937 }
 938
 939 /**
 940  * ram_save_page: send the given page to the stream
 941  *
 942  * Returns the number of pages written.
 943  *          < 0 - error
 944  *          >=0 - Number of pages written - this might legally be 0
 945  *                if xbzrle noticed the page was the same.
 946  *
 947  * @rs: current RAM state
 948  * @block: block that contains the page we want to send
 949  * @offset: offset inside the block for the page
 950  * @last_stage: if we are at the completion stage
 951  */
 952 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 953 {
 954     int pages = -1;
 955     uint64_t bytes_xmit;
 956     ram_addr_t current_addr;
 957     uint8_t *p;
 958     int ret;
 959     bool send_async = true;
 960     RAMBlock *block = pss->block;
 961     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 962
 963     p = block->host + offset;
 964     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 965
 966     /* In doubt sent page as normal */
 967     bytes_xmit = 0;
 968     ret = ram_control_save_page(rs->f, block->offset,
 969                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 970     if (bytes_xmit) {
 971         ram_counters.transferred += bytes_xmit;
 972         pages = 1;
 973     }
 974
 975     XBZRLE_cache_lock();
 976
 977     current_addr = block->offset + offset;
 978
 979     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 980         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 981             if (bytes_xmit > 0) {
 982                 ram_counters.normal++;
 983             } else if (bytes_xmit == 0) {
 984                 ram_counters.duplicate++;
 985             }
 986         }
 987     } else {
 988         pages = save_zero_page(rs, block, offset);
 989         if (pages > 0) {
 990             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 991              * page would be stale
 992              */
 993             xbzrle_cache_zero_page(rs, current_addr);
 994             ram_release_pages(block->idstr, offset, pages);
 995         } else if (!rs->ram_bulk_stage &&
 996                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 997             pages = save_xbzrle_page(rs, &p, current_addr, block,
 998                                      offset, last_stage);
 999             if (!last_stage) {
1000                 /* Can't send this cached data async, since the cache page
1001                  * might get updated before it gets to the wire
1002                  */
1003                 send_async = false;
1004             }
1005         }
1006     }
1007
1008     /* XBZRLE overflow or normal page */
1009     if (pages == -1) {
1010         ram_counters.transferred +=
1011             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1012         if (send_async) {
1013             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1014                                   migrate_release_ram() &
1015                                   migration_in_postcopy());
1016         } else {
1017             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1018         }
1019         ram_counters.transferred += TARGET_PAGE_SIZE;
1020         pages = 1;
1021         ram_counters.normal++;
1022     }
1023
1024     XBZRLE_cache_unlock();
1025
1026     return pages;
1027 }
1028
1029 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1030                                 ram_addr_t offset)
1031 {
1032     RAMState *rs = ram_state;
1033     int bytes_sent, blen;
1034     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1035
1036     bytes_sent = save_page_header(rs, f, block, offset |
1037                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1038     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1039                                      migrate_compress_level());
1040     if (blen < 0) {
1041         bytes_sent = 0;
1042         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1043         error_report("compressed data failed!");
1044     } else {
1045         bytes_sent += blen;
1046         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1047     }
1048
1049     return bytes_sent;
1050 }
1051
1052 static void flush_compressed_data(RAMState *rs)
1053 {
1054     int idx, len, thread_count;
1055
1056     if (!migrate_use_compression()) {
1057         return;
1058     }
1059     thread_count = migrate_compress_threads();
1060
1061     qemu_mutex_lock(&comp_done_lock);
1062     for (idx = 0; idx < thread_count; idx++) {
1063         while (!comp_param[idx].done) {
1064             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1065         }
1066     }
1067     qemu_mutex_unlock(&comp_done_lock);
1068
1069     for (idx = 0; idx < thread_count; idx++) {
1070         qemu_mutex_lock(&comp_param[idx].mutex);
1071         if (!comp_param[idx].quit) {
1072             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1073             ram_counters.transferred += len;
1074         }
1075         qemu_mutex_unlock(&comp_param[idx].mutex);
1076     }
1077 }
1078
1079 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1080                                        ram_addr_t offset)
1081 {
1082     param->block = block;
1083     param->offset = offset;
1084 }
1085
1086 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1087                                            ram_addr_t offset)
1088 {
1089     int idx, thread_count, bytes_xmit = -1, pages = -1;
1090
1091     thread_count = migrate_compress_threads();
1092     qemu_mutex_lock(&comp_done_lock);
1093     while (true) {
1094         for (idx = 0; idx < thread_count; idx++) {
1095             if (comp_param[idx].done) {
1096                 comp_param[idx].done = false;
1097                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1098                 qemu_mutex_lock(&comp_param[idx].mutex);
1099                 set_compress_params(&comp_param[idx], block, offset);
1100                 qemu_cond_signal(&comp_param[idx].cond);
1101                 qemu_mutex_unlock(&comp_param[idx].mutex);
1102                 pages = 1;
1103                 ram_counters.normal++;
1104                 ram_counters.transferred += bytes_xmit;
1105                 break;
1106             }
1107         }
1108         if (pages > 0) {
1109             break;
1110         } else {
1111             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1112         }
1113     }
1114     qemu_mutex_unlock(&comp_done_lock);
1115
1116     return pages;
1117 }
1118
1119 /**
1120  * ram_save_compressed_page: compress the given page and send it to the stream
1121  *
1122  * Returns the number of pages written.
1123  *
1124  * @rs: current RAM state
1125  * @block: block that contains the page we want to send
1126  * @offset: offset inside the block for the page
1127  * @last_stage: if we are at the completion stage
1128  */
1129 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1130                                     bool last_stage)
1131 {
1132     int pages = -1;
1133     uint64_t bytes_xmit = 0;
1134     uint8_t *p;
1135     int ret, blen;
1136     RAMBlock *block = pss->block;
1137     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1138
1139     p = block->host + offset;
1140
1141     ret = ram_control_save_page(rs->f, block->offset,
1142                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1143     if (bytes_xmit) {
1144         ram_counters.transferred += bytes_xmit;
1145         pages = 1;
1146     }
1147     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1148         if (ret != RAM_SAVE_CONTROL_DELAYED) {
1149             if (bytes_xmit > 0) {
1150                 ram_counters.normal++;
1151             } else if (bytes_xmit == 0) {
1152                 ram_counters.duplicate++;
1153             }
1154         }
1155     } else {
1156         /* When starting the process of a new block, the first page of
1157          * the block should be sent out before other pages in the same
1158          * block, and all the pages in last block should have been sent
1159          * out, keeping this order is important, because the 'cont' flag
1160          * is used to avoid resending the block name.
1161          */
1162         if (block != rs->last_sent_block) {
1163             flush_compressed_data(rs);
1164             pages = save_zero_page(rs, block, offset);
1165             if (pages == -1) {
1166                 /* Make sure the first page is sent out before other pages */
1167                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1168                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1169                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1170                                                  migrate_compress_level());
1171                 if (blen > 0) {
1172                     ram_counters.transferred += bytes_xmit + blen;
1173                     ram_counters.normal++;
1174                     pages = 1;
1175                 } else {
1176                     qemu_file_set_error(rs->f, blen);
1177                     error_report("compressed data failed!");
1178                 }
1179             }
1180             if (pages > 0) {
1181                 ram_release_pages(block->idstr, offset, pages);
1182             }
1183         } else {
1184             pages = save_zero_page(rs, block, offset);
1185             if (pages == -1) {
1186                 pages = compress_page_with_multi_thread(rs, block, offset);
1187             } else {
1188                 ram_release_pages(block->idstr, offset, pages);
1189             }
1190         }
1191     }
1192
1193     return pages;
1194 }
1195
1196 /**
1197  * find_dirty_block: find the next dirty page and update any state
1198  * associated with the search process.
1199  *
1200  * Returns if a page is found
1201  *
1202  * @rs: current RAM state
1203  * @pss: data about the state of the current dirty page scan
1204  * @again: set to false if the search has scanned the whole of RAM
1205  */
1206 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1207 {
1208     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1209     if (pss->complete_round && pss->block == rs->last_seen_block &&
1210         pss->page >= rs->last_page) {
1211         /*
1212          * We've been once around the RAM and haven't found anything.
1213          * Give up.
1214          */
1215         *again = false;
1216         return false;
1217     }
1218     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1219         /* Didn't find anything in this RAM Block */
1220         pss->page = 0;
1221         pss->block = QLIST_NEXT_RCU(pss->block, next);
1222         if (!pss->block) {
1223             /* Hit the end of the list */
1224             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1225             /* Flag that we've looped */
1226             pss->complete_round = true;
1227             rs->ram_bulk_stage = false;
1228             if (migrate_use_xbzrle()) {
1229                 /* If xbzrle is on, stop using the data compression at this
1230                  * point. In theory, xbzrle can do better than compression.
1231                  */
1232                 flush_compressed_data(rs);
1233             }
1234         }
1235         /* Didn't find anything this time, but try again on the new block */
1236         *again = true;
1237         return false;
1238     } else {
1239         /* Can go around again, but... */
1240         *again = true;
1241         /* We've found something so probably don't need to */
1242         return true;
1243     }
1244 }
1245
1246 /**
1247  * unqueue_page: gets a page of the queue
1248  *
1249  * Helper for 'get_queued_page' - gets a page off the queue
1250  *
1251  * Returns the block of the page (or NULL if none available)
1252  *
1253  * @rs: current RAM state
1254  * @offset: used to return the offset within the RAMBlock
1255  */
1256 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1257 {
1258     RAMBlock *block = NULL;
1259
1260     qemu_mutex_lock(&rs->src_page_req_mutex);
1261     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1262         struct RAMSrcPageRequest *entry =
1263                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1264         block = entry->rb;
1265         *offset = entry->offset;
1266
1267         if (entry->len > TARGET_PAGE_SIZE) {
1268             entry->len -= TARGET_PAGE_SIZE;
1269             entry->offset += TARGET_PAGE_SIZE;
1270         } else {
1271             memory_region_unref(block->mr);
1272             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1273             g_free(entry);
1274         }
1275     }
1276     qemu_mutex_unlock(&rs->src_page_req_mutex);
1277
1278     return block;
1279 }
1280
1281 /**
1282  * get_queued_page: unqueue a page from the postocpy requests
1283  *
1284  * Skips pages that are already sent (!dirty)
1285  *
1286  * Returns if a queued page is found
1287  *
1288  * @rs: current RAM state
1289  * @pss: data about the state of the current dirty page scan
1290  */
1291 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1292 {
1293     RAMBlock  *block;
1294     ram_addr_t offset;
1295     bool dirty;
1296
1297     do {
1298         block = unqueue_page(rs, &offset);
1299         /*
1300          * We're sending this page, and since it's postcopy nothing else
1301          * will dirty it, and we must make sure it doesn't get sent again
1302          * even if this queue request was received after the background
1303          * search already sent it.
1304          */
1305         if (block) {
1306             unsigned long page;
1307
1308             page = offset >> TARGET_PAGE_BITS;
1309             dirty = test_bit(page, block->bmap);
1310             if (!dirty) {
1311                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1312                        page, test_bit(page, block->unsentmap));
1313             } else {
1314                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1315             }
1316         }
1317
1318     } while (block && !dirty);
1319
1320     if (block) {
1321         /*
1322          * As soon as we start servicing pages out of order, then we have
1323          * to kill the bulk stage, since the bulk stage assumes
1324          * in (migration_bitmap_find_and_reset_dirty) that every page is
1325          * dirty, that's no longer true.
1326          */
1327         rs->ram_bulk_stage = false;
1328
1329         /*
1330          * We want the background search to continue from the queued page
1331          * since the guest is likely to want other pages near to the page
1332          * it just requested.
1333          */
1334         pss->block = block;
1335         pss->page = offset >> TARGET_PAGE_BITS;
1336     }
1337
1338     return !!block;
1339 }
1340
1341 /**
1342  * migration_page_queue_free: drop any remaining pages in the ram
1343  * request queue
1344  *
1345  * It should be empty at the end anyway, but in error cases there may
1346  * be some left.  in case that there is any page left, we drop it.
1347  *
1348  */
1349 static void migration_page_queue_free(RAMState *rs)
1350 {
1351     struct RAMSrcPageRequest *mspr, *next_mspr;
1352     /* This queue generally should be empty - but in the case of a failed
1353      * migration might have some droppings in.
1354      */
1355     rcu_read_lock();
1356     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1357         memory_region_unref(mspr->rb->mr);
1358         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1359         g_free(mspr);
1360     }
1361     rcu_read_unlock();
1362 }
1363
1364 /**
1365  * ram_save_queue_pages: queue the page for transmission
1366  *
1367  * A request from postcopy destination for example.
1368  *
1369  * Returns zero on success or negative on error
1370  *
1371  * @rbname: Name of the RAMBLock of the request. NULL means the
1372  *          same that last one.
1373  * @start: starting address from the start of the RAMBlock
1374  * @len: length (in bytes) to send
1375  */
1376 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1377 {
1378     RAMBlock *ramblock;
1379     RAMState *rs = ram_state;
1380
1381     ram_counters.postcopy_requests++;
1382     rcu_read_lock();
1383     if (!rbname) {
1384         /* Reuse last RAMBlock */
1385         ramblock = rs->last_req_rb;
1386
1387         if (!ramblock) {
1388             /*
1389              * Shouldn't happen, we can't reuse the last RAMBlock if
1390              * it's the 1st request.
1391              */
1392             error_report("ram_save_queue_pages no previous block");
1393             goto err;
1394         }
1395     } else {
1396         ramblock = qemu_ram_block_by_name(rbname);
1397
1398         if (!ramblock) {
1399             /* We shouldn't be asked for a non-existent RAMBlock */
1400             error_report("ram_save_queue_pages no block '%s'", rbname);
1401             goto err;
1402         }
1403         rs->last_req_rb = ramblock;
1404     }
1405     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1406     if (start+len > ramblock->used_length) {
1407         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1408                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1409                      __func__, start, len, ramblock->used_length);
1410         goto err;
1411     }
1412
1413     struct RAMSrcPageRequest *new_entry =
1414         g_malloc0(sizeof(struct RAMSrcPageRequest));
1415     new_entry->rb = ramblock;
1416     new_entry->offset = start;
1417     new_entry->len = len;
1418
1419     memory_region_ref(ramblock->mr);
1420     qemu_mutex_lock(&rs->src_page_req_mutex);
1421     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1422     qemu_mutex_unlock(&rs->src_page_req_mutex);
1423     rcu_read_unlock();
1424
1425     return 0;
1426
1427 err:
1428     rcu_read_unlock();
1429     return -1;
1430 }
1431
1432 /**
1433  * ram_save_target_page: save one target page
1434  *
1435  * Returns the number of pages written
1436  *
1437  * @rs: current RAM state
1438  * @ms: current migration state
1439  * @pss: data about the page we want to send
1440  * @last_stage: if we are at the completion stage
1441  */
1442 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1443                                 bool last_stage)
1444 {
1445     int res = 0;
1446
1447     /* Check the pages is dirty and if it is send it */
1448     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1449         /*
1450          * If xbzrle is on, stop using the data compression after first
1451          * round of migration even if compression is enabled. In theory,
1452          * xbzrle can do better than compression.
1453          */
1454         if (migrate_use_compression() &&
1455             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1456             res = ram_save_compressed_page(rs, pss, last_stage);
1457         } else {
1458             res = ram_save_page(rs, pss, last_stage);
1459         }
1460
1461         if (res < 0) {
1462             return res;
1463         }
1464         if (pss->block->unsentmap) {
1465             clear_bit(pss->page, pss->block->unsentmap);
1466         }
1467     }
1468
1469     return res;
1470 }
1471
1472 /**
1473  * ram_save_host_page: save a whole host page
1474  *
1475  * Starting at *offset send pages up to the end of the current host
1476  * page. It's valid for the initial offset to point into the middle of
1477  * a host page in which case the remainder of the hostpage is sent.
1478  * Only dirty target pages are sent. Note that the host page size may
1479  * be a huge page for this block.
1480  * The saving stops at the boundary of the used_length of the block
1481  * if the RAMBlock isn't a multiple of the host page size.
1482  *
1483  * Returns the number of pages written or negative on error
1484  *
1485  * @rs: current RAM state
1486  * @ms: current migration state
1487  * @pss: data about the page we want to send
1488  * @last_stage: if we are at the completion stage
1489  */
1490 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1491                               bool last_stage)
1492 {
1493     int tmppages, pages = 0;
1494     size_t pagesize_bits =
1495         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1496
1497     do {
1498         tmppages = ram_save_target_page(rs, pss, last_stage);
1499         if (tmppages < 0) {
1500             return tmppages;
1501         }
1502
1503         pages += tmppages;
1504         pss->page++;
1505     } while ((pss->page & (pagesize_bits - 1)) &&
1506              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1507
1508     /* The offset we leave with is the last one we looked at */
1509     pss->page--;
1510     return pages;
1511 }
1512
1513 /**
1514  * ram_find_and_save_block: finds a dirty page and sends it to f
1515  *
1516  * Called within an RCU critical section.
1517  *
1518  * Returns the number of pages written where zero means no dirty pages
1519  *
1520  * @rs: current RAM state
1521  * @last_stage: if we are at the completion stage
1522  *
1523  * On systems where host-page-size > target-page-size it will send all the
1524  * pages in a host page that are dirty.
1525  */
1526
1527 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1528 {
1529     PageSearchStatus pss;
1530     int pages = 0;
1531     bool again, found;
1532
1533     /* No dirty page as there is zero RAM */
1534     if (!ram_bytes_total()) {
1535         return pages;
1536     }
1537
1538     pss.block = rs->last_seen_block;
1539     pss.page = rs->last_page;
1540     pss.complete_round = false;
1541
1542     if (!pss.block) {
1543         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1544     }
1545
1546     do {
1547         again = true;
1548         found = get_queued_page(rs, &pss);
1549
1550         if (!found) {
1551             /* priority queue empty, so just search for something dirty */
1552             found = find_dirty_block(rs, &pss, &again);
1553         }
1554
1555         if (found) {
1556             pages = ram_save_host_page(rs, &pss, last_stage);
1557         }
1558     } while (!pages && again);
1559
1560     rs->last_seen_block = pss.block;
1561     rs->last_page = pss.page;
1562
1563     return pages;
1564 }
1565
1566 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1567 {
1568     uint64_t pages = size / TARGET_PAGE_SIZE;
1569
1570     if (zero) {
1571         ram_counters.duplicate += pages;
1572     } else {
1573         ram_counters.normal += pages;
1574         ram_counters.transferred += size;
1575         qemu_update_position(f, size);
1576     }
1577 }
1578
1579 uint64_t ram_bytes_total(void)
1580 {
1581     RAMBlock *block;
1582     uint64_t total = 0;
1583
1584     rcu_read_lock();
1585     RAMBLOCK_FOREACH(block) {
1586         total += block->used_length;
1587     }
1588     rcu_read_unlock();
1589     return total;
1590 }
1591
/* Allocate XBZRLE.decoded_buf, a buffer of one target page in size. */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
1596
/* Free XBZRLE.decoded_buf and reset the pointer to NULL. */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
1602
1603 static void ram_state_cleanup(RAMState **rsp)
1604 {
1605     if (*rsp) {
1606         migration_page_queue_free(*rsp);
1607         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1608         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1609         g_free(*rsp);
1610         *rsp = NULL;
1611     }
1612 }
1613
1614 static void xbzrle_cleanup(void)
1615 {
1616     XBZRLE_cache_lock();
1617     if (XBZRLE.cache) {
1618         cache_fini(XBZRLE.cache);
1619         g_free(XBZRLE.encoded_buf);
1620         g_free(XBZRLE.current_buf);
1621         g_free(XBZRLE.zero_target_page);
1622         XBZRLE.cache = NULL;
1623         XBZRLE.encoded_buf = NULL;
1624         XBZRLE.current_buf = NULL;
1625         XBZRLE.zero_target_page = NULL;
1626     }
1627     XBZRLE_cache_unlock();
1628 }
1629
1630 static void ram_save_cleanup(void *opaque)
1631 {
1632     RAMState **rsp = opaque;
1633     RAMBlock *block;
1634
1635     /* caller have hold iothread lock or is in a bh, so there is
1636      * no writing race against this migration_bitmap
1637      */
1638     memory_global_dirty_log_stop();
1639
1640     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1641         g_free(block->bmap);
1642         block->bmap = NULL;
1643         g_free(block->unsentmap);
1644         block->unsentmap = NULL;
1645     }
1646
1647     xbzrle_cleanup();
1648     compress_threads_save_cleanup();
1649     ram_state_cleanup(rsp);
1650 }
1651
1652 static void ram_state_reset(RAMState *rs)
1653 {
1654     rs->last_seen_block = NULL;
1655     rs->last_sent_block = NULL;
1656     rs->last_page = 0;
1657     rs->last_version = ram_list.version;
1658     rs->ram_bulk_stage = true;
1659 }
1660
1661 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1662
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr for debugging
 *
 * The bitmap is printed in lines of up to 128 bits, '1' for a set bit
 * and '.' for a clear one, each line prefixed with the index of its
 * first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'todump' must be a valid bitmap of at least 'pages' bits; it is
 * read unconditionally, so NULL is not accepted.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    /* Unsigned 64-bit indices match both 'pages' and the PRIx64 format */
    uint64_t cur;
    uint64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        uint64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so the terminator fits */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1696
1697 /* **** functions for postcopy ***** */
1698
1699 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1700 {
1701     struct RAMBlock *block;
1702
1703     RAMBLOCK_FOREACH(block) {
1704         unsigned long *bitmap = block->bmap;
1705         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1706         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1707
1708         while (run_start < range) {
1709             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1710             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1711                               (run_end - run_start) << TARGET_PAGE_BITS);
1712             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1713         }
1714     }
1715 }
1716
1717 /**
1718  * postcopy_send_discard_bm_ram: discard a RAMBlock
1719  *
1720  * Returns zero on success
1721  *
1722  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1723  * Note: At this point the 'unsentmap' is the processed bitmap combined
1724  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1725  *
1726  * @ms: current migration state
1727  * @pds: state for postcopy
1728  * @start: RAMBlock starting page
1729  * @length: RAMBlock size
1730  */
1731 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1732                                         PostcopyDiscardState *pds,
1733                                         RAMBlock *block)
1734 {
1735     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1736     unsigned long current;
1737     unsigned long *unsentmap = block->unsentmap;
1738
1739     for (current = 0; current < end; ) {
1740         unsigned long one = find_next_bit(unsentmap, end, current);
1741
1742         if (one <= end) {
1743             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1744             unsigned long discard_length;
1745
1746             if (zero >= end) {
1747                 discard_length = end - one;
1748             } else {
1749                 discard_length = zero - one;
1750             }
1751             if (discard_length) {
1752                 postcopy_discard_send_range(ms, pds, one, discard_length);
1753             }
1754             current = one + discard_length;
1755         } else {
1756             current = one;
1757         }
1758     }
1759
1760     return 0;
1761 }
1762
1763 /**
1764  * postcopy_each_ram_send_discard: discard all RAMBlocks
1765  *
1766  * Returns 0 for success or negative for error
1767  *
1768  * Utility for the outgoing postcopy code.
1769  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1770  *   passing it bitmap indexes and name.
1771  * (qemu_ram_foreach_block ends up passing unscaled lengths
1772  *  which would mean postcopy code would have to deal with target page)
1773  *
1774  * @ms: current migration state
1775  */
1776 static int postcopy_each_ram_send_discard(MigrationState *ms)
1777 {
1778     struct RAMBlock *block;
1779     int ret;
1780
1781     RAMBLOCK_FOREACH(block) {
1782         PostcopyDiscardState *pds =
1783             postcopy_discard_send_init(ms, block->idstr);
1784
1785         /*
1786          * Postcopy sends chunks of bitmap over the wire, but it
1787          * just needs indexes at this point, avoids it having
1788          * target page specific code.
1789          */
1790         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1791         postcopy_discard_send_finish(ms, pds);
1792         if (ret) {
1793             return ret;
1794         }
1795     }
1796
1797     return 0;
1798 }
1799
1800 /**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1802  *
1803  * Helper for postcopy_chunk_hostpages; it's called twice to
1804  * canonicalize the two bitmaps, that are similar, but one is
1805  * inverted.
1806  *
1807  * Postcopy requires that all target pages in a hostpage are dirty or
1808  * clean, not a mix.  This function canonicalizes the bitmaps.
1809  *
1810  * @ms: current migration state
1811  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1812  *               otherwise we need to canonicalize partially dirty host pages
1813  * @block: block that contains the page we want to canonicalize
1814  * @pds: state for postcopy
1815  */
1816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1817                                           RAMBlock *block,
1818                                           PostcopyDiscardState *pds)
1819 {
1820     RAMState *rs = ram_state;
1821     unsigned long *bitmap = block->bmap;
1822     unsigned long *unsentmap = block->unsentmap;
1823     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1824     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1825     unsigned long run_start;
1826
1827     if (block->page_size == TARGET_PAGE_SIZE) {
1828         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1829         return;
1830     }
1831
1832     if (unsent_pass) {
1833         /* Find a sent page */
1834         run_start = find_next_zero_bit(unsentmap, pages, 0);
1835     } else {
1836         /* Find a dirty page */
1837         run_start = find_next_bit(bitmap, pages, 0);
1838     }
1839
1840     while (run_start < pages) {
1841         bool do_fixup = false;
1842         unsigned long fixup_start_addr;
1843         unsigned long host_offset;
1844
1845         /*
1846          * If the start of this run of pages is in the middle of a host
1847          * page, then we need to fixup this host page.
1848          */
1849         host_offset = run_start % host_ratio;
1850         if (host_offset) {
1851             do_fixup = true;
1852             run_start -= host_offset;
1853             fixup_start_addr = run_start;
1854             /* For the next pass */
1855             run_start = run_start + host_ratio;
1856         } else {
1857             /* Find the end of this run */
1858             unsigned long run_end;
1859             if (unsent_pass) {
1860                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1861             } else {
1862                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1863             }
1864             /*
1865              * If the end isn't at the start of a host page, then the
1866              * run doesn't finish at the end of a host page
1867              * and we need to discard.
1868              */
1869             host_offset = run_end % host_ratio;
1870             if (host_offset) {
1871                 do_fixup = true;
1872                 fixup_start_addr = run_end - host_offset;
1873                 /*
1874                  * This host page has gone, the next loop iteration starts
1875                  * from after the fixup
1876                  */
1877                 run_start = fixup_start_addr + host_ratio;
1878             } else {
1879                 /*
1880                  * No discards on this iteration, next loop starts from
1881                  * next sent/dirty page
1882                  */
1883                 run_start = run_end + 1;
1884             }
1885         }
1886
1887         if (do_fixup) {
1888             unsigned long page;
1889
1890             /* Tell the destination to discard this page */
1891             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1892                 /* For the unsent_pass we:
1893                  *     discard partially sent pages
1894                  * For the !unsent_pass (dirty) we:
1895                  *     discard partially dirty pages that were sent
1896                  *     (any partially sent pages were already discarded
1897                  *     by the previous unsent_pass)
1898                  */
1899                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1900                                             host_ratio);
1901             }
1902
1903             /* Clean up the bitmap */
1904             for (page = fixup_start_addr;
1905                  page < fixup_start_addr + host_ratio; page++) {
1906                 /* All pages in this host page are now not sent */
1907                 set_bit(page, unsentmap);
1908
1909                 /*
1910                  * Remark them as dirty, updating the count for any pages
1911                  * that weren't previously dirty.
1912                  */
1913                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1914             }
1915         }
1916
1917         if (unsent_pass) {
1918             /* Find the next sent page for the next iteration */
1919             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1920         } else {
1921             /* Find the next dirty page for the next iteration */
1922             run_start = find_next_bit(bitmap, pages, run_start);
1923         }
1924     }
1925 }
1926
1927 /**
1928  * postcopy_chuck_hostpages: discrad any partially sent host page
1929  *
1930  * Utility for the outgoing postcopy code.
1931  *
1932  * Discard any partially sent host-page size chunks, mark any partially
1933  * dirty host-page size chunks as all dirty.  In this case the host-page
1934  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1935  *
1936  * Returns zero on success
1937  *
1938  * @ms: current migration state
1939  * @block: block we want to work with
1940  */
1941 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1942 {
1943     PostcopyDiscardState *pds =
1944         postcopy_discard_send_init(ms, block->idstr);
1945
1946     /* First pass: Discard all partially sent host pages */
1947     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1948     /*
1949      * Second pass: Ensure that all partially dirty host pages are made
1950      * fully dirty.
1951      */
1952     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1953
1954     postcopy_discard_send_finish(ms, pds);
1955     return 0;
1956 }
1957
1958 /**
1959  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1960  *
1961  * Returns zero on success
1962  *
1963  * Transmit the set of pages to be discarded after precopy to the target
1964  * these are pages that:
1965  *     a) Have been previously transmitted but are now dirty again
1966  *     b) Pages that have never been transmitted, this ensures that
1967  *        any pages on the destination that have been mapped by background
1968  *        tasks get discarded (transparent huge pages is the specific concern)
1969  * Hopefully this is pretty sparse
1970  *
1971  * @ms: current migration state
1972  */
1973 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1974 {
1975     RAMState *rs = ram_state;
1976     RAMBlock *block;
1977     int ret;
1978
1979     rcu_read_lock();
1980
1981     /* This should be our last sync, the src is now paused */
1982     migration_bitmap_sync(rs);
1983
1984     /* Easiest way to make sure we don't resume in the middle of a host-page */
1985     rs->last_seen_block = NULL;
1986     rs->last_sent_block = NULL;
1987     rs->last_page = 0;
1988
1989     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1990         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1991         unsigned long *bitmap = block->bmap;
1992         unsigned long *unsentmap = block->unsentmap;
1993
1994         if (!unsentmap) {
1995             /* We don't have a safe way to resize the sentmap, so
1996              * if the bitmap was resized it will be NULL at this
1997              * point.
1998              */
1999             error_report("migration ram resized during precopy phase");
2000             rcu_read_unlock();
2001             return -EINVAL;
2002         }
2003         /* Deal with TPS != HPS and huge pages */
2004         ret = postcopy_chunk_hostpages(ms, block);
2005         if (ret) {
2006             rcu_read_unlock();
2007             return ret;
2008         }
2009
2010         /*
2011          * Update the unsentmap to be unsentmap = unsentmap | dirty
2012          */
2013         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2014 #ifdef DEBUG_POSTCOPY
2015         ram_debug_dump_bitmap(unsentmap, true, pages);
2016 #endif
2017     }
2018     trace_ram_postcopy_send_discard_bitmap();
2019
2020     ret = postcopy_each_ram_send_discard(ms);
2021     rcu_read_unlock();
2022
2023     return ret;
2024 }
2025
2026 /**
2027  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2028  *
2029  * Returns zero on success
2030  *
2031  * @rbname: name of the RAMBlock of the request. NULL means the
2032  *          same that last one.
2033  * @start: RAMBlock starting page
2034  * @length: RAMBlock size
2035  */
2036 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2037 {
2038     int ret = -1;
2039
2040     trace_ram_discard_range(rbname, start, length);
2041
2042     rcu_read_lock();
2043     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2044
2045     if (!rb) {
2046         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2047         goto err;
2048     }
2049
2050     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2051                  length >> qemu_target_page_bits());
2052     ret = ram_block_discard_range(rb, start, length);
2053
2054 err:
2055     rcu_read_unlock();
2056
2057     return ret;
2058 }
2059
2060 /*
2061  * For every allocation, we will try not to crash the VM if the
2062  * allocation failed.
2063  */
2064 static int xbzrle_init(void)
2065 {
2066     Error *local_err = NULL;
2067
2068     if (!migrate_use_xbzrle()) {
2069         return 0;
2070     }
2071
2072     XBZRLE_cache_lock();
2073
2074     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2075     if (!XBZRLE.zero_target_page) {
2076         error_report("%s: Error allocating zero page", __func__);
2077         goto err_out;
2078     }
2079
2080     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2081                               TARGET_PAGE_SIZE, &local_err);
2082     if (!XBZRLE.cache) {
2083         error_report_err(local_err);
2084         goto free_zero_page;
2085     }
2086
2087     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2088     if (!XBZRLE.encoded_buf) {
2089         error_report("%s: Error allocating encoded_buf", __func__);
2090         goto free_cache;
2091     }
2092
2093     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2094     if (!XBZRLE.current_buf) {
2095         error_report("%s: Error allocating current_buf", __func__);
2096         goto free_encoded_buf;
2097     }
2098
2099     /* We are all good */
2100     XBZRLE_cache_unlock();
2101     return 0;
2102
2103 free_encoded_buf:
2104     g_free(XBZRLE.encoded_buf);
2105     XBZRLE.encoded_buf = NULL;
2106 free_cache:
2107     cache_fini(XBZRLE.cache);
2108     XBZRLE.cache = NULL;
2109 free_zero_page:
2110     g_free(XBZRLE.zero_target_page);
2111     XBZRLE.zero_target_page = NULL;
2112 err_out:
2113     XBZRLE_cache_unlock();
2114     return -ENOMEM;
2115 }
2116
2117 static int ram_state_init(RAMState **rsp)
2118 {
2119     *rsp = g_try_new0(RAMState, 1);
2120
2121     if (!*rsp) {
2122         error_report("%s: Init ramstate fail", __func__);
2123         return -1;
2124     }
2125
2126     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2127     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2128     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2129
2130     /*
2131      * Count the total number of pages used by ram blocks not including any
2132      * gaps due to alignment or unplugs.
2133      */
2134     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2135
2136     ram_state_reset(*rsp);
2137
2138     return 0;
2139 }
2140
2141 static void ram_list_init_bitmaps(void)
2142 {
2143     RAMBlock *block;
2144     unsigned long pages;
2145
2146     /* Skip setting bitmap if there is no RAM */
2147     if (ram_bytes_total()) {
2148         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2149             pages = block->max_length >> TARGET_PAGE_BITS;
2150             block->bmap = bitmap_new(pages);
2151             bitmap_set(block->bmap, 0, pages);
2152             if (migrate_postcopy_ram()) {
2153                 block->unsentmap = bitmap_new(pages);
2154                 bitmap_set(block->unsentmap, 0, pages);
2155             }
2156         }
2157     }
2158 }
2159
2160 static void ram_init_bitmaps(RAMState *rs)
2161 {
2162     /* For memory_global_dirty_log_start below.  */
2163     qemu_mutex_lock_iothread();
2164     qemu_mutex_lock_ramlist();
2165     rcu_read_lock();
2166
2167     ram_list_init_bitmaps();
2168     memory_global_dirty_log_start();
2169     migration_bitmap_sync(rs);
2170
2171     rcu_read_unlock();
2172     qemu_mutex_unlock_ramlist();
2173     qemu_mutex_unlock_iothread();
2174 }
2175
2176 static int ram_init_all(RAMState **rsp)
2177 {
2178     if (ram_state_init(rsp)) {
2179         return -1;
2180     }
2181
2182     if (xbzrle_init()) {
2183         ram_state_cleanup(rsp);
2184         return -1;
2185     }
2186
2187     ram_init_bitmaps(*rsp);
2188
2189     return 0;
2190 }
2191
2192 /*
2193  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2194  * long-running RCU critical section.  When rcu-reclaims in the code
2195  * start to become numerous it will be necessary to reduce the
2196  * granularity of these critical sections.
2197  */
2198
2199 /**
2200  * ram_save_setup: Setup RAM for migration
2201  *
2202  * Returns zero to indicate success and negative for error
2203  *
2204  * @f: QEMUFile where to send the data
2205  * @opaque: RAMState pointer
2206  */
2207 static int ram_save_setup(QEMUFile *f, void *opaque)
2208 {
2209     RAMState **rsp = opaque;
2210     RAMBlock *block;
2211
2212     /* migration has already setup the bitmap, reuse it. */
2213     if (!migration_in_colo_state()) {
2214         if (ram_init_all(rsp) != 0) {
2215             return -1;
2216         }
2217     }
2218     (*rsp)->f = f;
2219
2220     rcu_read_lock();
2221
2222     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2223
2224     RAMBLOCK_FOREACH(block) {
2225         qemu_put_byte(f, strlen(block->idstr));
2226         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2227         qemu_put_be64(f, block->used_length);
2228         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2229             qemu_put_be64(f, block->page_size);
2230         }
2231     }
2232
2233     rcu_read_unlock();
2234     compress_threads_save_setup();
2235
2236     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2237     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2238
2239     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2240
2241     return 0;
2242 }
2243
2244 /**
2245  * ram_save_iterate: iterative stage for migration
2246  *
2247  * Returns zero to indicate success and negative for error
2248  *
2249  * @f: QEMUFile where to send the data
2250  * @opaque: RAMState pointer
2251  */
2252 static int ram_save_iterate(QEMUFile *f, void *opaque)
2253 {
2254     RAMState **temp = opaque;
2255     RAMState *rs = *temp;
2256     int ret;
2257     int i;
2258     int64_t t0;
2259     int done = 0;
2260
2261     if (blk_mig_bulk_active()) {
2262         /* Avoid transferring ram during bulk phase of block migration as
2263          * the bulk phase will usually take a long time and transferring
2264          * ram updates during that time is pointless. */
2265         goto out;
2266     }
2267
2268     rcu_read_lock();
2269     if (ram_list.version != rs->last_version) {
2270         ram_state_reset(rs);
2271     }
2272
2273     /* Read version before ram_list.blocks */
2274     smp_rmb();
2275
2276     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2277
2278     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2279     i = 0;
2280     while ((ret = qemu_file_rate_limit(f)) == 0) {
2281         int pages;
2282
2283         pages = ram_find_and_save_block(rs, false);
2284         /* no more pages to sent */
2285         if (pages == 0) {
2286             done = 1;
2287             break;
2288         }
2289         rs->iterations++;
2290
2291         /* we want to check in the 1st loop, just in case it was the 1st time
2292            and we had to sync the dirty bitmap.
2293            qemu_get_clock_ns() is a bit expensive, so we only check each some
2294            iterations
2295         */
2296         if ((i & 63) == 0) {
2297             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2298             if (t1 > MAX_WAIT) {
2299                 trace_ram_save_iterate_big_wait(t1, i);
2300                 break;
2301             }
2302         }
2303         i++;
2304     }
2305     flush_compressed_data(rs);
2306     rcu_read_unlock();
2307
2308     /*
2309      * Must occur before EOS (or any QEMUFile operation)
2310      * because of RDMA protocol.
2311      */
2312     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2313
2314 out:
2315     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2316     ram_counters.transferred += 8;
2317
2318     ret = qemu_file_get_error(f);
2319     if (ret < 0) {
2320         return ret;
2321     }
2322
2323     return done;
2324 }
2325
2326 /**
2327  * ram_save_complete: function called to send the remaining amount of ram
2328  *
2329  * Returns zero to indicate success
2330  *
2331  * Called with iothread lock
2332  *
2333  * @f: QEMUFile where to send the data
2334  * @opaque: RAMState pointer
2335  */
2336 static int ram_save_complete(QEMUFile *f, void *opaque)
2337 {
2338     RAMState **temp = opaque;
2339     RAMState *rs = *temp;
2340
2341     rcu_read_lock();
2342
2343     if (!migration_in_postcopy()) {
2344         migration_bitmap_sync(rs);
2345     }
2346
2347     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2348
2349     /* try transferring iterative blocks of memory */
2350
2351     /* flush all remaining blocks regardless of rate limiting */
2352     while (true) {
2353         int pages;
2354
2355         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2356         /* no more blocks to sent */
2357         if (pages == 0) {
2358             break;
2359         }
2360     }
2361
2362     flush_compressed_data(rs);
2363     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2364
2365     rcu_read_unlock();
2366
2367     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2368
2369     return 0;
2370 }
2371
2372 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2373                              uint64_t *non_postcopiable_pending,
2374                              uint64_t *postcopiable_pending)
2375 {
2376     RAMState **temp = opaque;
2377     RAMState *rs = *temp;
2378     uint64_t remaining_size;
2379
2380     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2381
2382     if (!migration_in_postcopy() &&
2383         remaining_size < max_size) {
2384         qemu_mutex_lock_iothread();
2385         rcu_read_lock();
2386         migration_bitmap_sync(rs);
2387         rcu_read_unlock();
2388         qemu_mutex_unlock_iothread();
2389         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2390     }
2391
2392     if (migrate_postcopy_ram()) {
2393         /* We can do postcopy, and all the data is postcopiable */
2394         *postcopiable_pending += remaining_size;
2395     } else {
2396         *non_postcopiable_pending += remaining_size;
2397     }
2398 }
2399
2400 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2401 {
2402     unsigned int xh_len;
2403     int xh_flags;
2404     uint8_t *loaded_data;
2405
2406     /* extract RLE header */
2407     xh_flags = qemu_get_byte(f);
2408     xh_len = qemu_get_be16(f);
2409
2410     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2411         error_report("Failed to load XBZRLE page - wrong compression!");
2412         return -1;
2413     }
2414
2415     if (xh_len > TARGET_PAGE_SIZE) {
2416         error_report("Failed to load XBZRLE page - len overflow!");
2417         return -1;
2418     }
2419     loaded_data = XBZRLE.decoded_buf;
2420     /* load data and decode */
2421     /* it can change loaded_data to point to an internal buffer */
2422     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2423
2424     /* decode RLE */
2425     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2426                              TARGET_PAGE_SIZE) == -1) {
2427         error_report("Failed to load XBZRLE page - decode error!");
2428         return -1;
2429     }
2430
2431     return 0;
2432 }
2433
2434 /**
2435  * ram_block_from_stream: read a RAMBlock id from the migration stream
2436  *
2437  * Must be called from within a rcu critical section.
2438  *
2439  * Returns a pointer from within the RCU-protected ram_list.
2440  *
2441  * @f: QEMUFile where to read the data from
2442  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2443  */
2444 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2445 {
2446     static RAMBlock *block = NULL;
2447     char id[256];
2448     uint8_t len;
2449
2450     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2451         if (!block) {
2452             error_report("Ack, bad migration stream!");
2453             return NULL;
2454         }
2455         return block;
2456     }
2457
2458     len = qemu_get_byte(f);
2459     qemu_get_buffer(f, (uint8_t *)id, len);
2460     id[len] = 0;
2461
2462     block = qemu_ram_block_by_name(id);
2463     if (!block) {
2464         error_report("Can't find block %s", id);
2465         return NULL;
2466     }
2467
2468     return block;
2469 }
2470
2471 static inline void *host_from_ram_block_offset(RAMBlock *block,
2472                                                ram_addr_t offset)
2473 {
2474     if (!offset_in_ramblock(block, offset)) {
2475         return NULL;
2476     }
2477
2478     return block->host + offset;
2479 }
2480
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when @ch is zero and the range already
 * reads as all zeroes; in every other case it is filled with @ch.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already zero: skip the write */
        return;
    }

    memset(host, ch, size);
}
2497
2498 static void *do_data_decompress(void *opaque)
2499 {
2500     DecompressParam *param = opaque;
2501     unsigned long pagesize;
2502     uint8_t *des;
2503     int len;
2504
2505     qemu_mutex_lock(&param->mutex);
2506     while (!param->quit) {
2507         if (param->des) {
2508             des = param->des;
2509             len = param->len;
2510             param->des = 0;
2511             qemu_mutex_unlock(&param->mutex);
2512
2513             pagesize = TARGET_PAGE_SIZE;
2514             /* uncompress() will return failed in some case, especially
2515              * when the page is dirted when doing the compression, it's
2516              * not a problem because the dirty page will be retransferred
2517              * and uncompress() won't break the data in other pages.
2518              */
2519             uncompress((Bytef *)des, &pagesize,
2520                        (const Bytef *)param->compbuf, len);
2521
2522             qemu_mutex_lock(&decomp_done_lock);
2523             param->done = true;
2524             qemu_cond_signal(&decomp_done_cond);
2525             qemu_mutex_unlock(&decomp_done_lock);
2526
2527             qemu_mutex_lock(&param->mutex);
2528         } else {
2529             qemu_cond_wait(&param->cond, &param->mutex);
2530         }
2531     }
2532     qemu_mutex_unlock(&param->mutex);
2533
2534     return NULL;
2535 }
2536
2537 static void wait_for_decompress_done(void)
2538 {
2539     int idx, thread_count;
2540
2541     if (!migrate_use_compression()) {
2542         return;
2543     }
2544
2545     thread_count = migrate_decompress_threads();
2546     qemu_mutex_lock(&decomp_done_lock);
2547     for (idx = 0; idx < thread_count; idx++) {
2548         while (!decomp_param[idx].done) {
2549             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2550         }
2551     }
2552     qemu_mutex_unlock(&decomp_done_lock);
2553 }
2554
2555 static void compress_threads_load_setup(void)
2556 {
2557     int i, thread_count;
2558
2559     if (!migrate_use_compression()) {
2560         return;
2561     }
2562     thread_count = migrate_decompress_threads();
2563     decompress_threads = g_new0(QemuThread, thread_count);
2564     decomp_param = g_new0(DecompressParam, thread_count);
2565     qemu_mutex_init(&decomp_done_lock);
2566     qemu_cond_init(&decomp_done_cond);
2567     for (i = 0; i < thread_count; i++) {
2568         qemu_mutex_init(&decomp_param[i].mutex);
2569         qemu_cond_init(&decomp_param[i].cond);
2570         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2571         decomp_param[i].done = true;
2572         decomp_param[i].quit = false;
2573         qemu_thread_create(decompress_threads + i, "decompress",
2574                            do_data_decompress, decomp_param + i,
2575                            QEMU_THREAD_JOINABLE);
2576     }
2577 }
2578
2579 static void compress_threads_load_cleanup(void)
2580 {
2581     int i, thread_count;
2582
2583     if (!migrate_use_compression()) {
2584         return;
2585     }
2586     thread_count = migrate_decompress_threads();
2587     for (i = 0; i < thread_count; i++) {
2588         qemu_mutex_lock(&decomp_param[i].mutex);
2589         decomp_param[i].quit = true;
2590         qemu_cond_signal(&decomp_param[i].cond);
2591         qemu_mutex_unlock(&decomp_param[i].mutex);
2592     }
2593     for (i = 0; i < thread_count; i++) {
2594         qemu_thread_join(decompress_threads + i);
2595         qemu_mutex_destroy(&decomp_param[i].mutex);
2596         qemu_cond_destroy(&decomp_param[i].cond);
2597         g_free(decomp_param[i].compbuf);
2598     }
2599     g_free(decompress_threads);
2600     g_free(decomp_param);
2601     decompress_threads = NULL;
2602     decomp_param = NULL;
2603 }
2604
2605 static void decompress_data_with_multi_threads(QEMUFile *f,
2606                                                void *host, int len)
2607 {
2608     int idx, thread_count;
2609
2610     thread_count = migrate_decompress_threads();
2611     qemu_mutex_lock(&decomp_done_lock);
2612     while (true) {
2613         for (idx = 0; idx < thread_count; idx++) {
2614             if (decomp_param[idx].done) {
2615                 decomp_param[idx].done = false;
2616                 qemu_mutex_lock(&decomp_param[idx].mutex);
2617                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2618                 decomp_param[idx].des = host;
2619                 decomp_param[idx].len = len;
2620                 qemu_cond_signal(&decomp_param[idx].cond);
2621                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2622                 break;
2623             }
2624         }
2625         if (idx < thread_count) {
2626             break;
2627         } else {
2628             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2629         }
2630     }
2631     qemu_mutex_unlock(&decomp_done_lock);
2632 }
2633
2634 /**
2635  * ram_load_setup: Setup RAM for migration incoming side
2636  *
2637  * Returns zero to indicate success and negative for error
2638  *
2639  * @f: QEMUFile where to receive the data
2640  * @opaque: RAMState pointer
2641  */
2642 static int ram_load_setup(QEMUFile *f, void *opaque)
2643 {
2644     xbzrle_load_setup();
2645     compress_threads_load_setup();
2646     ramblock_recv_map_init();
2647     return 0;
2648 }
2649
2650 static int ram_load_cleanup(void *opaque)
2651 {
2652     RAMBlock *rb;
2653     xbzrle_load_cleanup();
2654     compress_threads_load_cleanup();
2655
2656     RAMBLOCK_FOREACH(rb) {
2657         g_free(rb->receivedmap);
2658         rb->receivedmap = NULL;
2659     }
2660     return 0;
2661 }
2662
2663 /**
2664  * ram_postcopy_incoming_init: allocate postcopy data structures
2665  *
2666  * Returns 0 for success and negative if there was one error
2667  *
2668  * @mis: current migration incoming state
2669  *
2670  * Allocate data structures etc needed by incoming migration with
2671  * postcopy-ram. postcopy-ram's similarly names
2672  * postcopy_ram_incoming_init does the work.
2673  */
2674 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2675 {
2676     unsigned long ram_pages = last_ram_page();
2677
2678     return postcopy_ram_incoming_init(mis, ram_pages);
2679 }
2680
2681 /**
2682  * ram_load_postcopy: load a page in postcopy case
2683  *
2684  * Returns 0 for success or -errno in case of error
2685  *
2686  * Called in postcopy mode by ram_load().
2687  * rcu_read_lock is taken prior to this being called.
2688  *
2689  * @f: QEMUFile where to send the data
2690  */
2691 static int ram_load_postcopy(QEMUFile *f)
2692 {
2693     int flags = 0, ret = 0;
2694     bool place_needed = false;
2695     bool matching_page_sizes = false;
2696     MigrationIncomingState *mis = migration_incoming_get_current();
2697     /* Temporary page that is later 'placed' */
2698     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2699     void *last_host = NULL;
2700     bool all_zero = false;
2701
2702     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2703         ram_addr_t addr;
2704         void *host = NULL;
2705         void *page_buffer = NULL;
2706         void *place_source = NULL;
2707         RAMBlock *block = NULL;
2708         uint8_t ch;
2709
2710         addr = qemu_get_be64(f);
2711
2712         /*
2713          * If qemu file error, we should stop here, and then "addr"
2714          * may be invalid
2715          */
2716         ret = qemu_file_get_error(f);
2717         if (ret) {
2718             break;
2719         }
2720
2721         flags = addr & ~TARGET_PAGE_MASK;
2722         addr &= TARGET_PAGE_MASK;
2723
2724         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2725         place_needed = false;
2726         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2727             block = ram_block_from_stream(f, flags);
2728
2729             host = host_from_ram_block_offset(block, addr);
2730             if (!host) {
2731                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2732                 ret = -EINVAL;
2733                 break;
2734             }
2735             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2736             /*
2737              * Postcopy requires that we place whole host pages atomically;
2738              * these may be huge pages for RAMBlocks that are backed by
2739              * hugetlbfs.
2740              * To make it atomic, the data is read into a temporary page
2741              * that's moved into place later.
2742              * The migration protocol uses,  possibly smaller, target-pages
2743              * however the source ensures it always sends all the components
2744              * of a host page in order.
2745              */
2746             page_buffer = postcopy_host_page +
2747                           ((uintptr_t)host & (block->page_size - 1));
2748             /* If all TP are zero then we can optimise the place */
2749             if (!((uintptr_t)host & (block->page_size - 1))) {
2750                 all_zero = true;
2751             } else {
2752                 /* not the 1st TP within the HP */
2753                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2754                     error_report("Non-sequential target page %p/%p",
2755                                   host, last_host);
2756                     ret = -EINVAL;
2757                     break;
2758                 }
2759             }
2760
2761
2762             /*
2763              * If it's the last part of a host page then we place the host
2764              * page
2765              */
2766             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2767                                      (block->page_size - 1)) == 0;
2768             place_source = postcopy_host_page;
2769         }
2770         last_host = host;
2771
2772         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2773         case RAM_SAVE_FLAG_ZERO:
2774             ch = qemu_get_byte(f);
2775             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2776             if (ch) {
2777                 all_zero = false;
2778             }
2779             break;
2780
2781         case RAM_SAVE_FLAG_PAGE:
2782             all_zero = false;
2783             if (!place_needed || !matching_page_sizes) {
2784                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2785             } else {
2786                 /* Avoids the qemu_file copy during postcopy, which is
2787                  * going to do a copy later; can only do it when we
2788                  * do this read in one go (matching page sizes)
2789                  */
2790                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2791                                          TARGET_PAGE_SIZE);
2792             }
2793             break;
2794         case RAM_SAVE_FLAG_EOS:
2795             /* normal exit */
2796             break;
2797         default:
2798             error_report("Unknown combination of migration flags: %#x"
2799                          " (postcopy mode)", flags);
2800             ret = -EINVAL;
2801             break;
2802         }
2803
2804         /* Detect for any possible file errors */
2805         if (!ret && qemu_file_get_error(f)) {
2806             ret = qemu_file_get_error(f);
2807         }
2808
2809         if (!ret && place_needed) {
2810             /* This gets called at the last target page in the host page */
2811             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2812
2813             if (all_zero) {
2814                 ret = postcopy_place_page_zero(mis, place_dest,
2815                                                block);
2816             } else {
2817                 ret = postcopy_place_page(mis, place_dest,
2818                                           place_source, block);
2819             }
2820         }
2821     }
2822
2823     return ret;
2824 }
2825
2826 static bool postcopy_is_advised(void)
2827 {
2828     PostcopyState ps = postcopy_state_get();
2829     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2830 }
2831
2832 static bool postcopy_is_running(void)
2833 {
2834     PostcopyState ps = postcopy_state_get();
2835     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2836 }
2837
2838 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2839 {
2840     int flags = 0, ret = 0, invalid_flags = 0;
2841     static uint64_t seq_iter;
2842     int len = 0;
2843     /*
2844      * If system is running in postcopy mode, page inserts to host memory must
2845      * be atomic
2846      */
2847     bool postcopy_running = postcopy_is_running();
2848     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2849     bool postcopy_advised = postcopy_is_advised();
2850
2851     seq_iter++;
2852
2853     if (version_id != 4) {
2854         ret = -EINVAL;
2855     }
2856
2857     if (!migrate_use_compression()) {
2858         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2859     }
2860     /* This RCU critical section can be very long running.
2861      * When RCU reclaims in the code start to become numerous,
2862      * it will be necessary to reduce the granularity of this
2863      * critical section.
2864      */
2865     rcu_read_lock();
2866
2867     if (postcopy_running) {
2868         ret = ram_load_postcopy(f);
2869     }
2870
2871     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2872         ram_addr_t addr, total_ram_bytes;
2873         void *host = NULL;
2874         uint8_t ch;
2875
2876         addr = qemu_get_be64(f);
2877         flags = addr & ~TARGET_PAGE_MASK;
2878         addr &= TARGET_PAGE_MASK;
2879
2880         if (flags & invalid_flags) {
2881             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2882                 error_report("Received an unexpected compressed page");
2883             }
2884
2885             ret = -EINVAL;
2886             break;
2887         }
2888
2889         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2890                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2891             RAMBlock *block = ram_block_from_stream(f, flags);
2892
2893             host = host_from_ram_block_offset(block, addr);
2894             if (!host) {
2895                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2896                 ret = -EINVAL;
2897                 break;
2898             }
2899             ramblock_recv_bitmap_set(block, host);
2900             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2901         }
2902
2903         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2904         case RAM_SAVE_FLAG_MEM_SIZE:
2905             /* Synchronize RAM block list */
2906             total_ram_bytes = addr;
2907             while (!ret && total_ram_bytes) {
2908                 RAMBlock *block;
2909                 char id[256];
2910                 ram_addr_t length;
2911
2912                 len = qemu_get_byte(f);
2913                 qemu_get_buffer(f, (uint8_t *)id, len);
2914                 id[len] = 0;
2915                 length = qemu_get_be64(f);
2916
2917                 block = qemu_ram_block_by_name(id);
2918                 if (block) {
2919                     if (length != block->used_length) {
2920                         Error *local_err = NULL;
2921
2922                         ret = qemu_ram_resize(block, length,
2923                                               &local_err);
2924                         if (local_err) {
2925                             error_report_err(local_err);
2926                         }
2927                     }
2928                     /* For postcopy we need to check hugepage sizes match */
2929                     if (postcopy_advised &&
2930                         block->page_size != qemu_host_page_size) {
2931                         uint64_t remote_page_size = qemu_get_be64(f);
2932                         if (remote_page_size != block->page_size) {
2933                             error_report("Mismatched RAM page size %s "
2934                                          "(local) %zd != %" PRId64,
2935                                          id, block->page_size,
2936                                          remote_page_size);
2937                             ret = -EINVAL;
2938                         }
2939                     }
2940                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2941                                           block->idstr);
2942                 } else {
2943                     error_report("Unknown ramblock \"%s\", cannot "
2944                                  "accept migration", id);
2945                     ret = -EINVAL;
2946                 }
2947
2948                 total_ram_bytes -= length;
2949             }
2950             break;
2951
2952         case RAM_SAVE_FLAG_ZERO:
2953             ch = qemu_get_byte(f);
2954             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2955             break;
2956
2957         case RAM_SAVE_FLAG_PAGE:
2958             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2959             break;
2960
2961         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2962             len = qemu_get_be32(f);
2963             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2964                 error_report("Invalid compressed data length: %d", len);
2965                 ret = -EINVAL;
2966                 break;
2967             }
2968             decompress_data_with_multi_threads(f, host, len);
2969             break;
2970
2971         case RAM_SAVE_FLAG_XBZRLE:
2972             if (load_xbzrle(f, addr, host) < 0) {
2973                 error_report("Failed to decompress XBZRLE page at "
2974                              RAM_ADDR_FMT, addr);
2975                 ret = -EINVAL;
2976                 break;
2977             }
2978             break;
2979         case RAM_SAVE_FLAG_EOS:
2980             /* normal exit */
2981             break;
2982         default:
2983             if (flags & RAM_SAVE_FLAG_HOOK) {
2984                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2985             } else {
2986                 error_report("Unknown combination of migration flags: %#x",
2987                              flags);
2988                 ret = -EINVAL;
2989             }
2990         }
2991         if (!ret) {
2992             ret = qemu_file_get_error(f);
2993         }
2994     }
2995
2996     wait_for_decompress_done();
2997     rcu_read_unlock();
2998     trace_ram_load_complete(ret, seq_iter);
2999     return ret;
3000 }
3001
/*
 * has_postcopy handler for the RAM section: forwards the answer of
 * migrate_postcopy_ram().  @opaque is not used.
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_ram_enabled = migrate_postcopy_ram();

    return postcopy_ram_enabled;
}
3006
3007 static SaveVMHandlers savevm_ram_handlers = {
3008     .save_setup = ram_save_setup,
3009     .save_live_iterate = ram_save_iterate,
3010     .save_live_complete_postcopy = ram_save_complete,
3011     .save_live_complete_precopy = ram_save_complete,
3012     .has_postcopy = ram_has_postcopy,
3013     .save_live_pending = ram_save_pending,
3014     .load_state = ram_load,
3015     .save_cleanup = ram_save_cleanup,
3016     .load_setup = ram_load_setup,
3017     .load_cleanup = ram_load_cleanup,
3018 };
3019
3020 void ram_mig_init(void)
3021 {
3022     qemu_mutex_init(&XBZRLE.lock);
3023     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3024 }