migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/main-loop.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "migration/page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "migration/block.h"
  54
  55 /***********************************************************/
  56 /* ram save/restore */
  57
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was renamed.
 */
  63
  64 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  65 #define RAM_SAVE_FLAG_ZERO     0x02
  66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  67 #define RAM_SAVE_FLAG_PAGE     0x08
  68 #define RAM_SAVE_FLAG_EOS      0x10
  69 #define RAM_SAVE_FLAG_CONTINUE 0x20
  70 #define RAM_SAVE_FLAG_XBZRLE   0x40
  71 /* 0x80 is reserved in migration.h start with 0x100 next */
  72 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
  79 XBZRLECacheStats xbzrle_counters;
  80
  81 /* struct contains XBZRLE cache and a static page
  82    used by the compression */
  83 static struct {
  84     /* buffer used for XBZRLE encoding */
  85     uint8_t *encoded_buf;
  86     /* buffer for storing page content */
  87     uint8_t *current_buf;
  88     /* Cache for XBZRLE, Protected by lock. */
  89     PageCache *cache;
  90     QemuMutex lock;
  91     /* it will store a page full of zeros */
  92     uint8_t *zero_target_page;
  93     /* buffer used for XBZRLE decoding */
  94     uint8_t *decoded_buf;
  95 } XBZRLE;
  96
  97 static void XBZRLE_cache_lock(void)
  98 {
  99     if (migrate_use_xbzrle())
 100         qemu_mutex_lock(&XBZRLE.lock);
 101 }
 102
 103 static void XBZRLE_cache_unlock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_unlock(&XBZRLE.lock);
 107 }
 108
 109 /**
 110  * xbzrle_cache_resize: resize the xbzrle cache
 111  *
 112  * This function is called from qmp_migrate_set_cache_size in main
 113  * thread, possibly while a migration is in progress.  A running
 114  * migration may be using the cache and might finish during this call,
 115  * hence changes to the cache are protected by XBZRLE.lock().
 116  *
 117  * Returns 0 for success or -1 for error
 118  *
 119  * @new_size: new cache size
 120  * @errp: set *errp if the check failed, with reason
 121  */
 122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 123 {
 124     PageCache *new_cache;
 125     int64_t ret = 0;
 126
 127     /* Check for truncation */
 128     if (new_size != (size_t)new_size) {
 129         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 130                    "exceeding address space");
 131         return -1;
 132     }
 133
 134     if (new_size == migrate_xbzrle_cache_size()) {
 135         /* nothing to do */
 136         return 0;
 137     }
 138
 139     XBZRLE_cache_lock();
 140
 141     if (XBZRLE.cache != NULL) {
 142         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 143         if (!new_cache) {
 144             ret = -1;
 145             goto out;
 146         }
 147
 148         cache_fini(XBZRLE.cache);
 149         XBZRLE.cache = new_cache;
 150     }
 151 out:
 152     XBZRLE_cache_unlock();
 153     return ret;
 154 }
 155
 156 static void ramblock_recv_map_init(void)
 157 {
 158     RAMBlock *rb;
 159
 160     RAMBLOCK_FOREACH(rb) {
 161         assert(!rb->receivedmap);
 162         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 163     }
 164 }
 165
 166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 167 {
 168     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 169                     rb->receivedmap);
 170 }
 171
 172 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 173 {
 174     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 175 }
 176
 177 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 178                                     size_t nr)
 179 {
 180     bitmap_set_atomic(rb->receivedmap,
 181                       ramblock_recv_bitmap_offset(host_addr, rb),
 182                       nr);
 183 }
 184
 185 /*
 186  * An outstanding page request, on the source, having been received
 187  * and queued
 188  */
 189 struct RAMSrcPageRequest {
 190     RAMBlock *rb;
 191     hwaddr    offset;
 192     hwaddr    len;
 193
 194     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 195 };
 196
 197 /* State of RAM for migration */
 198 struct RAMState {
 199     /* QEMUFile used for this migration */
 200     QEMUFile *f;
 201     /* Last block that we have visited searching for dirty pages */
 202     RAMBlock *last_seen_block;
 203     /* Last block from where we have sent data */
 204     RAMBlock *last_sent_block;
 205     /* Last dirty target page we have sent */
 206     ram_addr_t last_page;
 207     /* last ram version we have seen */
 208     uint32_t last_version;
 209     /* We are in the first round */
 210     bool ram_bulk_stage;
 211     /* How many times we have dirty too many pages */
 212     int dirty_rate_high_cnt;
 213     /* these variables are used for bitmap sync */
 214     /* last time we did a full bitmap_sync */
 215     int64_t time_last_bitmap_sync;
 216     /* bytes transferred at start_time */
 217     uint64_t bytes_xfer_prev;
 218     /* number of dirty pages since start_time */
 219     uint64_t num_dirty_pages_period;
 220     /* xbzrle misses since the beginning of the period */
 221     uint64_t xbzrle_cache_miss_prev;
 222     /* number of iterations at the beginning of period */
 223     uint64_t iterations_prev;
 224     /* Iterations since start */
 225     uint64_t iterations;
 226     /* number of dirty bits in the bitmap */
 227     uint64_t migration_dirty_pages;
 228     /* protects modification of the bitmap */
 229     QemuMutex bitmap_mutex;
 230     /* The RAMBlock used in the last src_page_requests */
 231     RAMBlock *last_req_rb;
 232     /* Queue of outstanding page requests from the destination */
 233     QemuMutex src_page_req_mutex;
 234     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 235 };
 236 typedef struct RAMState RAMState;
 237
 238 static RAMState *ram_state;
 239
 240 uint64_t ram_bytes_remaining(void)
 241 {
 242     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 243                        0;
 244 }
 245
 246 MigrationStats ram_counters;
 247
 248 /* used by the search for pages to send */
 249 struct PageSearchStatus {
 250     /* Current block being searched */
 251     RAMBlock    *block;
 252     /* Current page to search from */
 253     unsigned long page;
 254     /* Set once we wrap around */
 255     bool         complete_round;
 256 };
 257 typedef struct PageSearchStatus PageSearchStatus;
 258
 259 struct CompressParam {
 260     bool done;
 261     bool quit;
 262     QEMUFile *file;
 263     QemuMutex mutex;
 264     QemuCond cond;
 265     RAMBlock *block;
 266     ram_addr_t offset;
 267 };
 268 typedef struct CompressParam CompressParam;
 269
 270 struct DecompressParam {
 271     bool done;
 272     bool quit;
 273     QemuMutex mutex;
 274     QemuCond cond;
 275     void *des;
 276     uint8_t *compbuf;
 277     int len;
 278 };
 279 typedef struct DecompressParam DecompressParam;
 280
 281 static CompressParam *comp_param;
 282 static QemuThread *compress_threads;
 283 /* comp_done_cond is used to wake up the migration thread when
 284  * one of the compression threads has finished the compression.
 285  * comp_done_lock is used to co-work with comp_done_cond.
 286  */
 287 static QemuMutex comp_done_lock;
 288 static QemuCond comp_done_cond;
 289 /* The empty QEMUFileOps will be used by file in CompressParam */
 290 static const QEMUFileOps empty_ops = { };
 291
 292 static DecompressParam *decomp_param;
 293 static QemuThread *decompress_threads;
 294 static QemuMutex decomp_done_lock;
 295 static QemuCond decomp_done_cond;
 296
 297 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 298                                 ram_addr_t offset);
 299
 300 static void *do_data_compress(void *opaque)
 301 {
 302     CompressParam *param = opaque;
 303     RAMBlock *block;
 304     ram_addr_t offset;
 305
 306     qemu_mutex_lock(&param->mutex);
 307     while (!param->quit) {
 308         if (param->block) {
 309             block = param->block;
 310             offset = param->offset;
 311             param->block = NULL;
 312             qemu_mutex_unlock(&param->mutex);
 313
 314             do_compress_ram_page(param->file, block, offset);
 315
 316             qemu_mutex_lock(&comp_done_lock);
 317             param->done = true;
 318             qemu_cond_signal(&comp_done_cond);
 319             qemu_mutex_unlock(&comp_done_lock);
 320
 321             qemu_mutex_lock(&param->mutex);
 322         } else {
 323             qemu_cond_wait(&param->cond, &param->mutex);
 324         }
 325     }
 326     qemu_mutex_unlock(&param->mutex);
 327
 328     return NULL;
 329 }
 330
 331 static inline void terminate_compression_threads(void)
 332 {
 333     int idx, thread_count;
 334
 335     thread_count = migrate_compress_threads();
 336
 337     for (idx = 0; idx < thread_count; idx++) {
 338         qemu_mutex_lock(&comp_param[idx].mutex);
 339         comp_param[idx].quit = true;
 340         qemu_cond_signal(&comp_param[idx].cond);
 341         qemu_mutex_unlock(&comp_param[idx].mutex);
 342     }
 343 }
 344
 345 static void compress_threads_save_cleanup(void)
 346 {
 347     int i, thread_count;
 348
 349     if (!migrate_use_compression()) {
 350         return;
 351     }
 352     terminate_compression_threads();
 353     thread_count = migrate_compress_threads();
 354     for (i = 0; i < thread_count; i++) {
 355         qemu_thread_join(compress_threads + i);
 356         qemu_fclose(comp_param[i].file);
 357         qemu_mutex_destroy(&comp_param[i].mutex);
 358         qemu_cond_destroy(&comp_param[i].cond);
 359     }
 360     qemu_mutex_destroy(&comp_done_lock);
 361     qemu_cond_destroy(&comp_done_cond);
 362     g_free(compress_threads);
 363     g_free(comp_param);
 364     compress_threads = NULL;
 365     comp_param = NULL;
 366 }
 367
 368 static void compress_threads_save_setup(void)
 369 {
 370     int i, thread_count;
 371
 372     if (!migrate_use_compression()) {
 373         return;
 374     }
 375     thread_count = migrate_compress_threads();
 376     compress_threads = g_new0(QemuThread, thread_count);
 377     comp_param = g_new0(CompressParam, thread_count);
 378     qemu_cond_init(&comp_done_cond);
 379     qemu_mutex_init(&comp_done_lock);
 380     for (i = 0; i < thread_count; i++) {
 381         /* comp_param[i].file is just used as a dummy buffer to save data,
 382          * set its ops to empty.
 383          */
 384         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 385         comp_param[i].done = true;
 386         comp_param[i].quit = false;
 387         qemu_mutex_init(&comp_param[i].mutex);
 388         qemu_cond_init(&comp_param[i].cond);
 389         qemu_thread_create(compress_threads + i, "compress",
 390                            do_data_compress, comp_param + i,
 391                            QEMU_THREAD_JOINABLE);
 392     }
 393 }
 394
 395 /* Multiple fd's */
 396
 397 struct MultiFDSendParams {
 398     uint8_t id;
 399     char *name;
 400     QemuThread thread;
 401     QemuSemaphore sem;
 402     QemuMutex mutex;
 403     bool quit;
 404 };
 405 typedef struct MultiFDSendParams MultiFDSendParams;
 406
 407 struct {
 408     MultiFDSendParams *params;
 409     /* number of created threads */
 410     int count;
 411 } *multifd_send_state;
 412
 413 static void terminate_multifd_send_threads(Error *errp)
 414 {
 415     int i;
 416
 417     for (i = 0; i < multifd_send_state->count; i++) {
 418         MultiFDSendParams *p = &multifd_send_state->params[i];
 419
 420         qemu_mutex_lock(&p->mutex);
 421         p->quit = true;
 422         qemu_sem_post(&p->sem);
 423         qemu_mutex_unlock(&p->mutex);
 424     }
 425 }
 426
 427 int multifd_save_cleanup(Error **errp)
 428 {
 429     int i;
 430     int ret = 0;
 431
 432     if (!migrate_use_multifd()) {
 433         return 0;
 434     }
 435     terminate_multifd_send_threads(NULL);
 436     for (i = 0; i < multifd_send_state->count; i++) {
 437         MultiFDSendParams *p = &multifd_send_state->params[i];
 438
 439         qemu_thread_join(&p->thread);
 440         qemu_mutex_destroy(&p->mutex);
 441         qemu_sem_destroy(&p->sem);
 442         g_free(p->name);
 443         p->name = NULL;
 444     }
 445     g_free(multifd_send_state->params);
 446     multifd_send_state->params = NULL;
 447     g_free(multifd_send_state);
 448     multifd_send_state = NULL;
 449     return ret;
 450 }
 451
 452 static void *multifd_send_thread(void *opaque)
 453 {
 454     MultiFDSendParams *p = opaque;
 455
 456     while (true) {
 457         qemu_mutex_lock(&p->mutex);
 458         if (p->quit) {
 459             qemu_mutex_unlock(&p->mutex);
 460             break;
 461         }
 462         qemu_mutex_unlock(&p->mutex);
 463         qemu_sem_wait(&p->sem);
 464     }
 465
 466     return NULL;
 467 }
 468
 469 int multifd_save_setup(void)
 470 {
 471     int thread_count;
 472     uint8_t i;
 473
 474     if (!migrate_use_multifd()) {
 475         return 0;
 476     }
 477     thread_count = migrate_multifd_channels();
 478     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 479     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 480     multifd_send_state->count = 0;
 481     for (i = 0; i < thread_count; i++) {
 482         MultiFDSendParams *p = &multifd_send_state->params[i];
 483
 484         qemu_mutex_init(&p->mutex);
 485         qemu_sem_init(&p->sem, 0);
 486         p->quit = false;
 487         p->id = i;
 488         p->name = g_strdup_printf("multifdsend_%d", i);
 489         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 490                            QEMU_THREAD_JOINABLE);
 491
 492         multifd_send_state->count++;
 493     }
 494     return 0;
 495 }
 496
 497 struct MultiFDRecvParams {
 498     uint8_t id;
 499     char *name;
 500     QemuThread thread;
 501     QemuSemaphore sem;
 502     QemuMutex mutex;
 503     bool quit;
 504 };
 505 typedef struct MultiFDRecvParams MultiFDRecvParams;
 506
 507 struct {
 508     MultiFDRecvParams *params;
 509     /* number of created threads */
 510     int count;
 511 } *multifd_recv_state;
 512
 513 static void terminate_multifd_recv_threads(Error *errp)
 514 {
 515     int i;
 516
 517     for (i = 0; i < multifd_recv_state->count; i++) {
 518         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 519
 520         qemu_mutex_lock(&p->mutex);
 521         p->quit = true;
 522         qemu_sem_post(&p->sem);
 523         qemu_mutex_unlock(&p->mutex);
 524     }
 525 }
 526
 527 int multifd_load_cleanup(Error **errp)
 528 {
 529     int i;
 530     int ret = 0;
 531
 532     if (!migrate_use_multifd()) {
 533         return 0;
 534     }
 535     terminate_multifd_recv_threads(NULL);
 536     for (i = 0; i < multifd_recv_state->count; i++) {
 537         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 538
 539         qemu_thread_join(&p->thread);
 540         qemu_mutex_destroy(&p->mutex);
 541         qemu_sem_destroy(&p->sem);
 542         g_free(p->name);
 543         p->name = NULL;
 544     }
 545     g_free(multifd_recv_state->params);
 546     multifd_recv_state->params = NULL;
 547     g_free(multifd_recv_state);
 548     multifd_recv_state = NULL;
 549
 550     return ret;
 551 }
 552
 553 static void *multifd_recv_thread(void *opaque)
 554 {
 555     MultiFDRecvParams *p = opaque;
 556
 557     while (true) {
 558         qemu_mutex_lock(&p->mutex);
 559         if (p->quit) {
 560             qemu_mutex_unlock(&p->mutex);
 561             break;
 562         }
 563         qemu_mutex_unlock(&p->mutex);
 564         qemu_sem_wait(&p->sem);
 565     }
 566
 567     return NULL;
 568 }
 569
 570 int multifd_load_setup(void)
 571 {
 572     int thread_count;
 573     uint8_t i;
 574
 575     if (!migrate_use_multifd()) {
 576         return 0;
 577     }
 578     thread_count = migrate_multifd_channels();
 579     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 580     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 581     multifd_recv_state->count = 0;
 582     for (i = 0; i < thread_count; i++) {
 583         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 584
 585         qemu_mutex_init(&p->mutex);
 586         qemu_sem_init(&p->sem, 0);
 587         p->quit = false;
 588         p->id = i;
 589         p->name = g_strdup_printf("multifdrecv_%d", i);
 590         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 591                            QEMU_THREAD_JOINABLE);
 592         multifd_recv_state->count++;
 593     }
 594     return 0;
 595 }
 596
 597 /**
 598  * save_page_header: write page header to wire
 599  *
 600  * If this is the 1st block, it also writes the block identification
 601  *
 602  * Returns the number of bytes written
 603  *
 604  * @f: QEMUFile where to send the data
 605  * @block: block that contains the page we want to send
 606  * @offset: offset inside the block for the page
 607  *          in the lower bits, it contains flags
 608  */
 609 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 610                                ram_addr_t offset)
 611 {
 612     size_t size, len;
 613
 614     if (block == rs->last_sent_block) {
 615         offset |= RAM_SAVE_FLAG_CONTINUE;
 616     }
 617     qemu_put_be64(f, offset);
 618     size = 8;
 619
 620     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 621         len = strlen(block->idstr);
 622         qemu_put_byte(f, len);
 623         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 624         size += 1 + len;
 625         rs->last_sent_block = block;
 626     }
 627     return size;
 628 }
 629
 630 /**
 631  * mig_throttle_guest_down: throotle down the guest
 632  *
 633  * Reduce amount of guest cpu execution to hopefully slow down memory
 634  * writes. If guest dirty memory rate is reduced below the rate at
 635  * which we can transfer pages to the destination then we should be
 636  * able to complete migration. Some workloads dirty memory way too
 637  * fast and will not effectively converge, even with auto-converge.
 638  */
 639 static void mig_throttle_guest_down(void)
 640 {
 641     MigrationState *s = migrate_get_current();
 642     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 643     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 644
 645     /* We have not started throttling yet. Let's start it. */
 646     if (!cpu_throttle_active()) {
 647         cpu_throttle_set(pct_initial);
 648     } else {
 649         /* Throttling already on, just increase the rate */
 650         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 651     }
 652 }
 653
 654 /**
 655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 656  *
 657  * @rs: current RAM state
 658  * @current_addr: address for the zero page
 659  *
 660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 661  * The important thing is that a stale (not-yet-0'd) page be replaced
 662  * by the new data.
 663  * As a bonus, if the page wasn't in the cache it gets added so that
 664  * when a small write is made into the 0'd page it gets XBZRLE sent.
 665  */
 666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 667 {
 668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 669         return;
 670     }
 671
 672     /* We don't care if this fails to allocate a new cache page
 673      * as long as it updated an old one */
 674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 675                  ram_counters.dirty_sync_count);
 676 }
 677
 678 #define ENCODING_FLAG_XBZRLE 0x1
 679
 680 /**
 681  * save_xbzrle_page: compress and send current page
 682  *
 683  * Returns: 1 means that we wrote the page
 684  *          0 means that page is identical to the one already sent
 685  *          -1 means that xbzrle would be longer than normal
 686  *
 687  * @rs: current RAM state
 688  * @current_data: pointer to the address of the page contents
 689  * @current_addr: addr of the page
 690  * @block: block that contains the page we want to send
 691  * @offset: offset inside the block for the page
 692  * @last_stage: if we are at the completion stage
 693  */
 694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 695                             ram_addr_t current_addr, RAMBlock *block,
 696                             ram_addr_t offset, bool last_stage)
 697 {
 698     int encoded_len = 0, bytes_xbzrle;
 699     uint8_t *prev_cached_page;
 700
 701     if (!cache_is_cached(XBZRLE.cache, current_addr,
 702                          ram_counters.dirty_sync_count)) {
 703         xbzrle_counters.cache_miss++;
 704         if (!last_stage) {
 705             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 706                              ram_counters.dirty_sync_count) == -1) {
 707                 return -1;
 708             } else {
 709                 /* update *current_data when the page has been
 710                    inserted into cache */
 711                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 712             }
 713         }
 714         return -1;
 715     }
 716
 717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 718
 719     /* save current buffer into memory */
 720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 721
 722     /* XBZRLE encoding (if there is no overflow) */
 723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 725                                        TARGET_PAGE_SIZE);
 726     if (encoded_len == 0) {
 727         trace_save_xbzrle_page_skipping();
 728         return 0;
 729     } else if (encoded_len == -1) {
 730         trace_save_xbzrle_page_overflow();
 731         xbzrle_counters.overflow++;
 732         /* update data in the cache */
 733         if (!last_stage) {
 734             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 735             *current_data = prev_cached_page;
 736         }
 737         return -1;
 738     }
 739
 740     /* we need to update the data in the cache, in order to get the same data */
 741     if (!last_stage) {
 742         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 743     }
 744
 745     /* Send XBZRLE based compressed page */
 746     bytes_xbzrle = save_page_header(rs, rs->f, block,
 747                                     offset | RAM_SAVE_FLAG_XBZRLE);
 748     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 749     qemu_put_be16(rs->f, encoded_len);
 750     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 751     bytes_xbzrle += encoded_len + 1 + 2;
 752     xbzrle_counters.pages++;
 753     xbzrle_counters.bytes += bytes_xbzrle;
 754     ram_counters.transferred += bytes_xbzrle;
 755
 756     return 1;
 757 }
 758
 759 /**
 760  * migration_bitmap_find_dirty: find the next dirty page from start
 761  *
 762  * Called with rcu_read_lock() to protect migration_bitmap
 763  *
 764  * Returns the byte offset within memory region of the start of a dirty page
 765  *
 766  * @rs: current RAM state
 767  * @rb: RAMBlock where to search for dirty pages
 768  * @start: page where we start the search
 769  */
 770 static inline
 771 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 772                                           unsigned long start)
 773 {
 774     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 775     unsigned long *bitmap = rb->bmap;
 776     unsigned long next;
 777
 778     if (rs->ram_bulk_stage && start > 0) {
 779         next = start + 1;
 780     } else {
 781         next = find_next_bit(bitmap, size, start);
 782     }
 783
 784     return next;
 785 }
 786
 787 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 788                                                 RAMBlock *rb,
 789                                                 unsigned long page)
 790 {
 791     bool ret;
 792
 793     ret = test_and_clear_bit(page, rb->bmap);
 794
 795     if (ret) {
 796         rs->migration_dirty_pages--;
 797     }
 798     return ret;
 799 }
 800
 801 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 802                                         ram_addr_t start, ram_addr_t length)
 803 {
 804     rs->migration_dirty_pages +=
 805         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 806                                               &rs->num_dirty_pages_period);
 807 }
 808
 809 /**
 810  * ram_pagesize_summary: calculate all the pagesizes of a VM
 811  *
 812  * Returns a summary bitmap of the page sizes of all RAMBlocks
 813  *
 814  * For VMs with just normal pages this is equivalent to the host page
 815  * size. If it's got some huge pages then it's the OR of all the
 816  * different page sizes.
 817  */
 818 uint64_t ram_pagesize_summary(void)
 819 {
 820     RAMBlock *block;
 821     uint64_t summary = 0;
 822
 823     RAMBLOCK_FOREACH(block) {
 824         summary |= block->page_size;
 825     }
 826
 827     return summary;
 828 }
 829
 830 static void migration_bitmap_sync(RAMState *rs)
 831 {
 832     RAMBlock *block;
 833     int64_t end_time;
 834     uint64_t bytes_xfer_now;
 835
 836     ram_counters.dirty_sync_count++;
 837
 838     if (!rs->time_last_bitmap_sync) {
 839         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 840     }
 841
 842     trace_migration_bitmap_sync_start();
 843     memory_global_dirty_log_sync();
 844
 845     qemu_mutex_lock(&rs->bitmap_mutex);
 846     rcu_read_lock();
 847     RAMBLOCK_FOREACH(block) {
 848         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 849     }
 850     rcu_read_unlock();
 851     qemu_mutex_unlock(&rs->bitmap_mutex);
 852
 853     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 854
 855     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 856
 857     /* more than 1 second = 1000 millisecons */
 858     if (end_time > rs->time_last_bitmap_sync + 1000) {
 859         /* calculate period counters */
 860         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 861             / (end_time - rs->time_last_bitmap_sync);
 862         bytes_xfer_now = ram_counters.transferred;
 863
 864         /* During block migration the auto-converge logic incorrectly detects
 865          * that ram migration makes no progress. Avoid this by disabling the
 866          * throttling logic during the bulk phase of block migration. */
 867         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 868             /* The following detection logic can be refined later. For now:
 869                Check to see if the dirtied bytes is 50% more than the approx.
 870                amount of bytes that just got transferred since the last time we
 871                were in this routine. If that happens twice, start or increase
 872                throttling */
 873
 874             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 875                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 876                 (++rs->dirty_rate_high_cnt >= 2)) {
 877                     trace_migration_throttle();
 878                     rs->dirty_rate_high_cnt = 0;
 879                     mig_throttle_guest_down();
 880             }
 881         }
 882
 883         if (migrate_use_xbzrle()) {
 884             if (rs->iterations_prev != rs->iterations) {
 885                 xbzrle_counters.cache_miss_rate =
 886                    (double)(xbzrle_counters.cache_miss -
 887                             rs->xbzrle_cache_miss_prev) /
 888                    (rs->iterations - rs->iterations_prev);
 889             }
 890             rs->iterations_prev = rs->iterations;
 891             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 892         }
 893
 894         /* reset period counters */
 895         rs->time_last_bitmap_sync = end_time;
 896         rs->num_dirty_pages_period = 0;
 897         rs->bytes_xfer_prev = bytes_xfer_now;
 898     }
 899     if (migrate_use_events()) {
 900         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 901     }
 902 }
 903
 904 /**
 905  * save_zero_page: send the zero page to the stream
 906  *
 907  * Returns the number of pages written.
 908  *
 909  * @rs: current RAM state
 910  * @block: block that contains the page we want to send
 911  * @offset: offset inside the block for the page
 912  */
 913 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 914 {
 915     uint8_t *p = block->host + offset;
 916     int pages = -1;
 917
 918     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 919         ram_counters.duplicate++;
 920         ram_counters.transferred +=
 921             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 922         qemu_put_byte(rs->f, 0);
 923         ram_counters.transferred += 1;
 924         pages = 1;
 925     }
 926
 927     return pages;
 928 }
 929
 930 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 931 {
 932     if (!migrate_release_ram() || !migration_in_postcopy()) {
 933         return;
 934     }
 935
 936     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 937 }
 938
 939 /**
 940  * ram_save_page: send the given page to the stream
 941  *
 942  * Returns the number of pages written.
 943  *          < 0 - error
 944  *          >=0 - Number of pages written - this might legally be 0
 945  *                if xbzrle noticed the page was the same.
 946  *
 947  * @rs: current RAM state
 948  * @block: block that contains the page we want to send
 949  * @offset: offset inside the block for the page
 950  * @last_stage: if we are at the completion stage
 951  */
 952 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 953 {
 954     int pages = -1;
 955     uint64_t bytes_xmit;
 956     ram_addr_t current_addr;
 957     uint8_t *p;
 958     int ret;
 959     bool send_async = true;
 960     RAMBlock *block = pss->block;
 961     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 962
 963     p = block->host + offset;
 964     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 965
 966     /* In doubt sent page as normal */
 967     bytes_xmit = 0;
 968     ret = ram_control_save_page(rs->f, block->offset,
 969                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 970     if (bytes_xmit) {
 971         ram_counters.transferred += bytes_xmit;
 972         pages = 1;
 973     }
 974
 975     XBZRLE_cache_lock();
 976
 977     current_addr = block->offset + offset;
 978
 979     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 980         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 981             if (bytes_xmit > 0) {
 982                 ram_counters.normal++;
 983             } else if (bytes_xmit == 0) {
 984                 ram_counters.duplicate++;
 985             }
 986         }
 987     } else {
 988         pages = save_zero_page(rs, block, offset);
 989         if (pages > 0) {
 990             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 991              * page would be stale
 992              */
 993             xbzrle_cache_zero_page(rs, current_addr);
 994             ram_release_pages(block->idstr, offset, pages);
 995         } else if (!rs->ram_bulk_stage &&
 996                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 997             pages = save_xbzrle_page(rs, &p, current_addr, block,
 998                                      offset, last_stage);
 999             if (!last_stage) {
1000                 /* Can't send this cached data async, since the cache page
1001                  * might get updated before it gets to the wire
1002                  */
1003                 send_async = false;
1004             }
1005         }
1006     }
1007
1008     /* XBZRLE overflow or normal page */
1009     if (pages == -1) {
1010         ram_counters.transferred +=
1011             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1012         if (send_async) {
1013             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1014                                   migrate_release_ram() &
1015                                   migration_in_postcopy());
1016         } else {
1017             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1018         }
1019         ram_counters.transferred += TARGET_PAGE_SIZE;
1020         pages = 1;
1021         ram_counters.normal++;
1022     }
1023
1024     XBZRLE_cache_unlock();
1025
1026     return pages;
1027 }
1028
1029 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1030                                 ram_addr_t offset)
1031 {
1032     RAMState *rs = ram_state;
1033     int bytes_sent, blen;
1034     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1035
1036     bytes_sent = save_page_header(rs, f, block, offset |
1037                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1038     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1039                                      migrate_compress_level());
1040     if (blen < 0) {
1041         bytes_sent = 0;
1042         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1043         error_report("compressed data failed!");
1044     } else {
1045         bytes_sent += blen;
1046         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1047     }
1048
1049     return bytes_sent;
1050 }
1051
1052 static void flush_compressed_data(RAMState *rs)
1053 {
1054     int idx, len, thread_count;
1055
1056     if (!migrate_use_compression()) {
1057         return;
1058     }
1059     thread_count = migrate_compress_threads();
1060
1061     qemu_mutex_lock(&comp_done_lock);
1062     for (idx = 0; idx < thread_count; idx++) {
1063         while (!comp_param[idx].done) {
1064             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1065         }
1066     }
1067     qemu_mutex_unlock(&comp_done_lock);
1068
1069     for (idx = 0; idx < thread_count; idx++) {
1070         qemu_mutex_lock(&comp_param[idx].mutex);
1071         if (!comp_param[idx].quit) {
1072             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1073             ram_counters.transferred += len;
1074         }
1075         qemu_mutex_unlock(&comp_param[idx].mutex);
1076     }
1077 }
1078
1079 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1080                                        ram_addr_t offset)
1081 {
1082     param->block = block;
1083     param->offset = offset;
1084 }
1085
1086 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1087                                            ram_addr_t offset)
1088 {
1089     int idx, thread_count, bytes_xmit = -1, pages = -1;
1090
1091     thread_count = migrate_compress_threads();
1092     qemu_mutex_lock(&comp_done_lock);
1093     while (true) {
1094         for (idx = 0; idx < thread_count; idx++) {
1095             if (comp_param[idx].done) {
1096                 comp_param[idx].done = false;
1097                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1098                 qemu_mutex_lock(&comp_param[idx].mutex);
1099                 set_compress_params(&comp_param[idx], block, offset);
1100                 qemu_cond_signal(&comp_param[idx].cond);
1101                 qemu_mutex_unlock(&comp_param[idx].mutex);
1102                 pages = 1;
1103                 ram_counters.normal++;
1104                 ram_counters.transferred += bytes_xmit;
1105                 break;
1106             }
1107         }
1108         if (pages > 0) {
1109             break;
1110         } else {
1111             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1112         }
1113     }
1114     qemu_mutex_unlock(&comp_done_lock);
1115
1116     return pages;
1117 }
1118
1119 /**
1120  * ram_save_compressed_page: compress the given page and send it to the stream
1121  *
1122  * Returns the number of pages written.
1123  *
1124  * @rs: current RAM state
1125  * @block: block that contains the page we want to send
1126  * @offset: offset inside the block for the page
1127  * @last_stage: if we are at the completion stage
1128  */
1129 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1130                                     bool last_stage)
1131 {
1132     int pages = -1;
1133     uint64_t bytes_xmit = 0;
1134     uint8_t *p;
1135     int ret, blen;
1136     RAMBlock *block = pss->block;
1137     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1138
1139     p = block->host + offset;
1140
1141     ret = ram_control_save_page(rs->f, block->offset,
1142                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1143     if (bytes_xmit) {
1144         ram_counters.transferred += bytes_xmit;
1145         pages = 1;
1146     }
1147     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1148         if (ret != RAM_SAVE_CONTROL_DELAYED) {
1149             if (bytes_xmit > 0) {
1150                 ram_counters.normal++;
1151             } else if (bytes_xmit == 0) {
1152                 ram_counters.duplicate++;
1153             }
1154         }
1155     } else {
1156         /* When starting the process of a new block, the first page of
1157          * the block should be sent out before other pages in the same
1158          * block, and all the pages in last block should have been sent
1159          * out, keeping this order is important, because the 'cont' flag
1160          * is used to avoid resending the block name.
1161          */
1162         if (block != rs->last_sent_block) {
1163             flush_compressed_data(rs);
1164             pages = save_zero_page(rs, block, offset);
1165             if (pages == -1) {
1166                 /* Make sure the first page is sent out before other pages */
1167                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1168                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1169                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1170                                                  migrate_compress_level());
1171                 if (blen > 0) {
1172                     ram_counters.transferred += bytes_xmit + blen;
1173                     ram_counters.normal++;
1174                     pages = 1;
1175                 } else {
1176                     qemu_file_set_error(rs->f, blen);
1177                     error_report("compressed data failed!");
1178                 }
1179             }
1180             if (pages > 0) {
1181                 ram_release_pages(block->idstr, offset, pages);
1182             }
1183         } else {
1184             pages = save_zero_page(rs, block, offset);
1185             if (pages == -1) {
1186                 pages = compress_page_with_multi_thread(rs, block, offset);
1187             } else {
1188                 ram_release_pages(block->idstr, offset, pages);
1189             }
1190         }
1191     }
1192
1193     return pages;
1194 }
1195
1196 /**
1197  * find_dirty_block: find the next dirty page and update any state
1198  * associated with the search process.
1199  *
1200  * Returns if a page is found
1201  *
1202  * @rs: current RAM state
1203  * @pss: data about the state of the current dirty page scan
1204  * @again: set to false if the search has scanned the whole of RAM
1205  */
1206 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1207 {
1208     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1209     if (pss->complete_round && pss->block == rs->last_seen_block &&
1210         pss->page >= rs->last_page) {
1211         /*
1212          * We've been once around the RAM and haven't found anything.
1213          * Give up.
1214          */
1215         *again = false;
1216         return false;
1217     }
1218     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1219         /* Didn't find anything in this RAM Block */
1220         pss->page = 0;
1221         pss->block = QLIST_NEXT_RCU(pss->block, next);
1222         if (!pss->block) {
1223             /* Hit the end of the list */
1224             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1225             /* Flag that we've looped */
1226             pss->complete_round = true;
1227             rs->ram_bulk_stage = false;
1228             if (migrate_use_xbzrle()) {
1229                 /* If xbzrle is on, stop using the data compression at this
1230                  * point. In theory, xbzrle can do better than compression.
1231                  */
1232                 flush_compressed_data(rs);
1233             }
1234         }
1235         /* Didn't find anything this time, but try again on the new block */
1236         *again = true;
1237         return false;
1238     } else {
1239         /* Can go around again, but... */
1240         *again = true;
1241         /* We've found something so probably don't need to */
1242         return true;
1243     }
1244 }
1245
1246 /**
1247  * unqueue_page: gets a page of the queue
1248  *
1249  * Helper for 'get_queued_page' - gets a page off the queue
1250  *
1251  * Returns the block of the page (or NULL if none available)
1252  *
1253  * @rs: current RAM state
1254  * @offset: used to return the offset within the RAMBlock
1255  */
1256 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1257 {
1258     RAMBlock *block = NULL;
1259
1260     qemu_mutex_lock(&rs->src_page_req_mutex);
1261     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1262         struct RAMSrcPageRequest *entry =
1263                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1264         block = entry->rb;
1265         *offset = entry->offset;
1266
1267         if (entry->len > TARGET_PAGE_SIZE) {
1268             entry->len -= TARGET_PAGE_SIZE;
1269             entry->offset += TARGET_PAGE_SIZE;
1270         } else {
1271             memory_region_unref(block->mr);
1272             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1273             g_free(entry);
1274         }
1275     }
1276     qemu_mutex_unlock(&rs->src_page_req_mutex);
1277
1278     return block;
1279 }
1280
1281 /**
1282  * get_queued_page: unqueue a page from the postocpy requests
1283  *
1284  * Skips pages that are already sent (!dirty)
1285  *
1286  * Returns if a queued page is found
1287  *
1288  * @rs: current RAM state
1289  * @pss: data about the state of the current dirty page scan
1290  */
1291 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1292 {
1293     RAMBlock  *block;
1294     ram_addr_t offset;
1295     bool dirty;
1296
1297     do {
1298         block = unqueue_page(rs, &offset);
1299         /*
1300          * We're sending this page, and since it's postcopy nothing else
1301          * will dirty it, and we must make sure it doesn't get sent again
1302          * even if this queue request was received after the background
1303          * search already sent it.
1304          */
1305         if (block) {
1306             unsigned long page;
1307
1308             page = offset >> TARGET_PAGE_BITS;
1309             dirty = test_bit(page, block->bmap);
1310             if (!dirty) {
1311                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1312                        page, test_bit(page, block->unsentmap));
1313             } else {
1314                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1315             }
1316         }
1317
1318     } while (block && !dirty);
1319
1320     if (block) {
1321         /*
1322          * As soon as we start servicing pages out of order, then we have
1323          * to kill the bulk stage, since the bulk stage assumes
1324          * in (migration_bitmap_find_and_reset_dirty) that every page is
1325          * dirty, that's no longer true.
1326          */
1327         rs->ram_bulk_stage = false;
1328
1329         /*
1330          * We want the background search to continue from the queued page
1331          * since the guest is likely to want other pages near to the page
1332          * it just requested.
1333          */
1334         pss->block = block;
1335         pss->page = offset >> TARGET_PAGE_BITS;
1336     }
1337
1338     return !!block;
1339 }
1340
1341 /**
1342  * migration_page_queue_free: drop any remaining pages in the ram
1343  * request queue
1344  *
1345  * It should be empty at the end anyway, but in error cases there may
1346  * be some left.  in case that there is any page left, we drop it.
1347  *
1348  */
1349 static void migration_page_queue_free(RAMState *rs)
1350 {
1351     struct RAMSrcPageRequest *mspr, *next_mspr;
1352     /* This queue generally should be empty - but in the case of a failed
1353      * migration might have some droppings in.
1354      */
1355     rcu_read_lock();
1356     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1357         memory_region_unref(mspr->rb->mr);
1358         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1359         g_free(mspr);
1360     }
1361     rcu_read_unlock();
1362 }
1363
1364 /**
1365  * ram_save_queue_pages: queue the page for transmission
1366  *
1367  * A request from postcopy destination for example.
1368  *
1369  * Returns zero on success or negative on error
1370  *
1371  * @rbname: Name of the RAMBLock of the request. NULL means the
1372  *          same that last one.
1373  * @start: starting address from the start of the RAMBlock
1374  * @len: length (in bytes) to send
1375  */
1376 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1377 {
1378     RAMBlock *ramblock;
1379     RAMState *rs = ram_state;
1380
1381     ram_counters.postcopy_requests++;
1382     rcu_read_lock();
1383     if (!rbname) {
1384         /* Reuse last RAMBlock */
1385         ramblock = rs->last_req_rb;
1386
1387         if (!ramblock) {
1388             /*
1389              * Shouldn't happen, we can't reuse the last RAMBlock if
1390              * it's the 1st request.
1391              */
1392             error_report("ram_save_queue_pages no previous block");
1393             goto err;
1394         }
1395     } else {
1396         ramblock = qemu_ram_block_by_name(rbname);
1397
1398         if (!ramblock) {
1399             /* We shouldn't be asked for a non-existent RAMBlock */
1400             error_report("ram_save_queue_pages no block '%s'", rbname);
1401             goto err;
1402         }
1403         rs->last_req_rb = ramblock;
1404     }
1405     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1406     if (start+len > ramblock->used_length) {
1407         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1408                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1409                      __func__, start, len, ramblock->used_length);
1410         goto err;
1411     }
1412
1413     struct RAMSrcPageRequest *new_entry =
1414         g_malloc0(sizeof(struct RAMSrcPageRequest));
1415     new_entry->rb = ramblock;
1416     new_entry->offset = start;
1417     new_entry->len = len;
1418
1419     memory_region_ref(ramblock->mr);
1420     qemu_mutex_lock(&rs->src_page_req_mutex);
1421     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1422     qemu_mutex_unlock(&rs->src_page_req_mutex);
1423     rcu_read_unlock();
1424
1425     return 0;
1426
1427 err:
1428     rcu_read_unlock();
1429     return -1;
1430 }
1431
1432 /**
1433  * ram_save_target_page: save one target page
1434  *
1435  * Returns the number of pages written
1436  *
1437  * @rs: current RAM state
1438  * @ms: current migration state
1439  * @pss: data about the page we want to send
1440  * @last_stage: if we are at the completion stage
1441  */
1442 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1443                                 bool last_stage)
1444 {
1445     int res = 0;
1446
1447     /* Check the pages is dirty and if it is send it */
1448     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1449         /*
1450          * If xbzrle is on, stop using the data compression after first
1451          * round of migration even if compression is enabled. In theory,
1452          * xbzrle can do better than compression.
1453          */
1454         if (migrate_use_compression() &&
1455             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1456             res = ram_save_compressed_page(rs, pss, last_stage);
1457         } else {
1458             res = ram_save_page(rs, pss, last_stage);
1459         }
1460
1461         if (res < 0) {
1462             return res;
1463         }
1464         if (pss->block->unsentmap) {
1465             clear_bit(pss->page, pss->block->unsentmap);
1466         }
1467     }
1468
1469     return res;
1470 }
1471
1472 /**
1473  * ram_save_host_page: save a whole host page
1474  *
1475  * Starting at *offset send pages up to the end of the current host
1476  * page. It's valid for the initial offset to point into the middle of
1477  * a host page in which case the remainder of the hostpage is sent.
1478  * Only dirty target pages are sent. Note that the host page size may
1479  * be a huge page for this block.
1480  * The saving stops at the boundary of the used_length of the block
1481  * if the RAMBlock isn't a multiple of the host page size.
1482  *
1483  * Returns the number of pages written or negative on error
1484  *
1485  * @rs: current RAM state
1486  * @ms: current migration state
1487  * @pss: data about the page we want to send
1488  * @last_stage: if we are at the completion stage
1489  */
1490 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1491                               bool last_stage)
1492 {
1493     int tmppages, pages = 0;
1494     size_t pagesize_bits =
1495         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1496
1497     do {
1498         tmppages = ram_save_target_page(rs, pss, last_stage);
1499         if (tmppages < 0) {
1500             return tmppages;
1501         }
1502
1503         pages += tmppages;
1504         pss->page++;
1505     } while ((pss->page & (pagesize_bits - 1)) &&
1506              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1507
1508     /* The offset we leave with is the last one we looked at */
1509     pss->page--;
1510     return pages;
1511 }
1512
1513 /**
1514  * ram_find_and_save_block: finds a dirty page and sends it to f
1515  *
1516  * Called within an RCU critical section.
1517  *
1518  * Returns the number of pages written where zero means no dirty pages
1519  *
1520  * @rs: current RAM state
1521  * @last_stage: if we are at the completion stage
1522  *
1523  * On systems where host-page-size > target-page-size it will send all the
1524  * pages in a host page that are dirty.
1525  */
1526
1527 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1528 {
1529     PageSearchStatus pss;
1530     int pages = 0;
1531     bool again, found;
1532
1533     /* No dirty page as there is zero RAM */
1534     if (!ram_bytes_total()) {
1535         return pages;
1536     }
1537
1538     pss.block = rs->last_seen_block;
1539     pss.page = rs->last_page;
1540     pss.complete_round = false;
1541
1542     if (!pss.block) {
1543         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1544     }
1545
1546     do {
1547         again = true;
1548         found = get_queued_page(rs, &pss);
1549
1550         if (!found) {
1551             /* priority queue empty, so just search for something dirty */
1552             found = find_dirty_block(rs, &pss, &again);
1553         }
1554
1555         if (found) {
1556             pages = ram_save_host_page(rs, &pss, last_stage);
1557         }
1558     } while (!pages && again);
1559
1560     rs->last_seen_block = pss.block;
1561     rs->last_page = pss.page;
1562
1563     return pages;
1564 }
1565
1566 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1567 {
1568     uint64_t pages = size / TARGET_PAGE_SIZE;
1569
1570     if (zero) {
1571         ram_counters.duplicate += pages;
1572     } else {
1573         ram_counters.normal += pages;
1574         ram_counters.transferred += size;
1575         qemu_update_position(f, size);
1576     }
1577 }
1578
1579 uint64_t ram_bytes_total(void)
1580 {
1581     RAMBlock *block;
1582     uint64_t total = 0;
1583
1584     rcu_read_lock();
1585     RAMBLOCK_FOREACH(block) {
1586         total += block->used_length;
1587     }
1588     rcu_read_unlock();
1589     return total;
1590 }
1591
1592 static void xbzrle_load_setup(void)
1593 {
1594     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1595 }
1596
1597 static void xbzrle_load_cleanup(void)
1598 {
1599     g_free(XBZRLE.decoded_buf);
1600     XBZRLE.decoded_buf = NULL;
1601 }
1602
1603 static void ram_state_cleanup(RAMState **rsp)
1604 {
1605     if (*rsp) {
1606         migration_page_queue_free(*rsp);
1607         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1608         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1609         g_free(*rsp);
1610         *rsp = NULL;
1611     }
1612 }
1613
1614 static void xbzrle_cleanup(void)
1615 {
1616     XBZRLE_cache_lock();
1617     if (XBZRLE.cache) {
1618         cache_fini(XBZRLE.cache);
1619         g_free(XBZRLE.encoded_buf);
1620         g_free(XBZRLE.current_buf);
1621         g_free(XBZRLE.zero_target_page);
1622         XBZRLE.cache = NULL;
1623         XBZRLE.encoded_buf = NULL;
1624         XBZRLE.current_buf = NULL;
1625         XBZRLE.zero_target_page = NULL;
1626     }
1627     XBZRLE_cache_unlock();
1628 }
1629
1630 static void ram_save_cleanup(void *opaque)
1631 {
1632     RAMState **rsp = opaque;
1633     RAMBlock *block;
1634
1635     /* caller have hold iothread lock or is in a bh, so there is
1636      * no writing race against this migration_bitmap
1637      */
1638     memory_global_dirty_log_stop();
1639
1640     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1641         g_free(block->bmap);
1642         block->bmap = NULL;
1643         g_free(block->unsentmap);
1644         block->unsentmap = NULL;
1645     }
1646
1647     xbzrle_cleanup();
1648     compress_threads_save_cleanup();
1649     ram_state_cleanup(rsp);
1650 }
1651
1652 static void ram_state_reset(RAMState *rs)
1653 {
1654     rs->last_seen_block = NULL;
1655     rs->last_sent_block = NULL;
1656     rs->last_page = 0;
1657     rs->last_version = ram_list.version;
1658     rs->ram_bulk_stage = true;
1659 }
1660
1661 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1662
/*
 * Dump the first 'pages' bits of the bitmap 'todump' to stderr, up to
 * 128 bits per line, printing '1' for a set bit and '.' for a clear one.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'todump' must point to a bitmap holding at least 'pages' bits; if it
 * is NULL nothing is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    unsigned long cur;
    unsigned long linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* There is no bitmap to fall back to; avoid dereferencing NULL */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (linelen > pages - cur) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* linelen <= 128, so the terminator always fits in linebuf */
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1696
1697 /* **** functions for postcopy ***** */
1698
1699 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1700 {
1701     struct RAMBlock *block;
1702
1703     RAMBLOCK_FOREACH(block) {
1704         unsigned long *bitmap = block->bmap;
1705         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1706         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1707
1708         while (run_start < range) {
1709             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1710             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1711                               (run_end - run_start) << TARGET_PAGE_BITS);
1712             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1713         }
1714     }
1715 }
1716
1717 /**
1718  * postcopy_send_discard_bm_ram: discard a RAMBlock
1719  *
1720  * Returns zero on success
1721  *
1722  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1723  * Note: At this point the 'unsentmap' is the processed bitmap combined
1724  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1725  *
1726  * @ms: current migration state
1727  * @pds: state for postcopy
1728  * @start: RAMBlock starting page
1729  * @length: RAMBlock size
1730  */
1731 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1732                                         PostcopyDiscardState *pds,
1733                                         RAMBlock *block)
1734 {
1735     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1736     unsigned long current;
1737     unsigned long *unsentmap = block->unsentmap;
1738
1739     for (current = 0; current < end; ) {
1740         unsigned long one = find_next_bit(unsentmap, end, current);
1741
1742         if (one <= end) {
1743             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1744             unsigned long discard_length;
1745
1746             if (zero >= end) {
1747                 discard_length = end - one;
1748             } else {
1749                 discard_length = zero - one;
1750             }
1751             if (discard_length) {
1752                 postcopy_discard_send_range(ms, pds, one, discard_length);
1753             }
1754             current = one + discard_length;
1755         } else {
1756             current = one;
1757         }
1758     }
1759
1760     return 0;
1761 }
1762
1763 /**
1764  * postcopy_each_ram_send_discard: discard all RAMBlocks
1765  *
1766  * Returns 0 for success or negative for error
1767  *
1768  * Utility for the outgoing postcopy code.
1769  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1770  *   passing it bitmap indexes and name.
1771  * (qemu_ram_foreach_block ends up passing unscaled lengths
1772  *  which would mean postcopy code would have to deal with target page)
1773  *
1774  * @ms: current migration state
1775  */
1776 static int postcopy_each_ram_send_discard(MigrationState *ms)
1777 {
1778     struct RAMBlock *block;
1779     int ret;
1780
1781     RAMBLOCK_FOREACH(block) {
1782         PostcopyDiscardState *pds =
1783             postcopy_discard_send_init(ms, block->idstr);
1784
1785         /*
1786          * Postcopy sends chunks of bitmap over the wire, but it
1787          * just needs indexes at this point, avoids it having
1788          * target page specific code.
1789          */
1790         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1791         postcopy_discard_send_finish(ms, pds);
1792         if (ret) {
1793             return ret;
1794         }
1795     }
1796
1797     return 0;
1798 }
1799
1800 /**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1802  *
1803  * Helper for postcopy_chunk_hostpages; it's called twice to
1804  * canonicalize the two bitmaps, that are similar, but one is
1805  * inverted.
1806  *
1807  * Postcopy requires that all target pages in a hostpage are dirty or
1808  * clean, not a mix.  This function canonicalizes the bitmaps.
1809  *
1810  * @ms: current migration state
1811  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1812  *               otherwise we need to canonicalize partially dirty host pages
1813  * @block: block that contains the page we want to canonicalize
1814  * @pds: state for postcopy
1815  */
1816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1817                                           RAMBlock *block,
1818                                           PostcopyDiscardState *pds)
1819 {
1820     RAMState *rs = ram_state;
1821     unsigned long *bitmap = block->bmap;
1822     unsigned long *unsentmap = block->unsentmap;
1823     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1824     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1825     unsigned long run_start;
1826
1827     if (block->page_size == TARGET_PAGE_SIZE) {
1828         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1829         return;
1830     }
1831
1832     if (unsent_pass) {
1833         /* Find a sent page */
1834         run_start = find_next_zero_bit(unsentmap, pages, 0);
1835     } else {
1836         /* Find a dirty page */
1837         run_start = find_next_bit(bitmap, pages, 0);
1838     }
1839
1840     while (run_start < pages) {
1841         bool do_fixup = false;
1842         unsigned long fixup_start_addr;
1843         unsigned long host_offset;
1844
1845         /*
1846          * If the start of this run of pages is in the middle of a host
1847          * page, then we need to fixup this host page.
1848          */
1849         host_offset = run_start % host_ratio;
1850         if (host_offset) {
1851             do_fixup = true;
1852             run_start -= host_offset;
1853             fixup_start_addr = run_start;
1854             /* For the next pass */
1855             run_start = run_start + host_ratio;
1856         } else {
1857             /* Find the end of this run */
1858             unsigned long run_end;
1859             if (unsent_pass) {
1860                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1861             } else {
1862                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1863             }
1864             /*
1865              * If the end isn't at the start of a host page, then the
1866              * run doesn't finish at the end of a host page
1867              * and we need to discard.
1868              */
1869             host_offset = run_end % host_ratio;
1870             if (host_offset) {
1871                 do_fixup = true;
1872                 fixup_start_addr = run_end - host_offset;
1873                 /*
1874                  * This host page has gone, the next loop iteration starts
1875                  * from after the fixup
1876                  */
1877                 run_start = fixup_start_addr + host_ratio;
1878             } else {
1879                 /*
1880                  * No discards on this iteration, next loop starts from
1881                  * next sent/dirty page
1882                  */
1883                 run_start = run_end + 1;
1884             }
1885         }
1886
1887         if (do_fixup) {
1888             unsigned long page;
1889
1890             /* Tell the destination to discard this page */
1891             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1892                 /* For the unsent_pass we:
1893                  *     discard partially sent pages
1894                  * For the !unsent_pass (dirty) we:
1895                  *     discard partially dirty pages that were sent
1896                  *     (any partially sent pages were already discarded
1897                  *     by the previous unsent_pass)
1898                  */
1899                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1900                                             host_ratio);
1901             }
1902
1903             /* Clean up the bitmap */
1904             for (page = fixup_start_addr;
1905                  page < fixup_start_addr + host_ratio; page++) {
1906                 /* All pages in this host page are now not sent */
1907                 set_bit(page, unsentmap);
1908
1909                 /*
1910                  * Remark them as dirty, updating the count for any pages
1911                  * that weren't previously dirty.
1912                  */
1913                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1914             }
1915         }
1916
1917         if (unsent_pass) {
1918             /* Find the next sent page for the next iteration */
1919             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1920         } else {
1921             /* Find the next dirty page for the next iteration */
1922             run_start = find_next_bit(bitmap, pages, run_start);
1923         }
1924     }
1925 }
1926
1927 /**
1928  * postcopy_chuck_hostpages: discrad any partially sent host page
1929  *
1930  * Utility for the outgoing postcopy code.
1931  *
1932  * Discard any partially sent host-page size chunks, mark any partially
1933  * dirty host-page size chunks as all dirty.  In this case the host-page
1934  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1935  *
1936  * Returns zero on success
1937  *
1938  * @ms: current migration state
1939  * @block: block we want to work with
1940  */
1941 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1942 {
1943     PostcopyDiscardState *pds =
1944         postcopy_discard_send_init(ms, block->idstr);
1945
1946     /* First pass: Discard all partially sent host pages */
1947     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1948     /*
1949      * Second pass: Ensure that all partially dirty host pages are made
1950      * fully dirty.
1951      */
1952     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1953
1954     postcopy_discard_send_finish(ms, pds);
1955     return 0;
1956 }
1957
1958 /**
1959  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1960  *
1961  * Returns zero on success
1962  *
1963  * Transmit the set of pages to be discarded after precopy to the target
1964  * these are pages that:
1965  *     a) Have been previously transmitted but are now dirty again
1966  *     b) Pages that have never been transmitted, this ensures that
1967  *        any pages on the destination that have been mapped by background
1968  *        tasks get discarded (transparent huge pages is the specific concern)
1969  * Hopefully this is pretty sparse
1970  *
1971  * @ms: current migration state
1972  */
1973 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1974 {
1975     RAMState *rs = ram_state;
1976     RAMBlock *block;
1977     int ret;
1978
1979     rcu_read_lock();
1980
1981     /* This should be our last sync, the src is now paused */
1982     migration_bitmap_sync(rs);
1983
1984     /* Easiest way to make sure we don't resume in the middle of a host-page */
1985     rs->last_seen_block = NULL;
1986     rs->last_sent_block = NULL;
1987     rs->last_page = 0;
1988
1989     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1990         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1991         unsigned long *bitmap = block->bmap;
1992         unsigned long *unsentmap = block->unsentmap;
1993
1994         if (!unsentmap) {
1995             /* We don't have a safe way to resize the sentmap, so
1996              * if the bitmap was resized it will be NULL at this
1997              * point.
1998              */
1999             error_report("migration ram resized during precopy phase");
2000             rcu_read_unlock();
2001             return -EINVAL;
2002         }
2003         /* Deal with TPS != HPS and huge pages */
2004         ret = postcopy_chunk_hostpages(ms, block);
2005         if (ret) {
2006             rcu_read_unlock();
2007             return ret;
2008         }
2009
2010         /*
2011          * Update the unsentmap to be unsentmap = unsentmap | dirty
2012          */
2013         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2014 #ifdef DEBUG_POSTCOPY
2015         ram_debug_dump_bitmap(unsentmap, true, pages);
2016 #endif
2017     }
2018     trace_ram_postcopy_send_discard_bitmap();
2019
2020     ret = postcopy_each_ram_send_discard(ms);
2021     rcu_read_unlock();
2022
2023     return ret;
2024 }
2025
2026 /**
2027  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2028  *
2029  * Returns zero on success
2030  *
2031  * @rbname: name of the RAMBlock of the request. NULL means the
2032  *          same that last one.
2033  * @start: RAMBlock starting page
2034  * @length: RAMBlock size
2035  */
2036 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2037 {
2038     int ret = -1;
2039
2040     trace_ram_discard_range(rbname, start, length);
2041
2042     rcu_read_lock();
2043     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2044
2045     if (!rb) {
2046         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2047         goto err;
2048     }
2049
2050     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2051                  length >> qemu_target_page_bits());
2052     ret = ram_block_discard_range(rb, start, length);
2053
2054 err:
2055     rcu_read_unlock();
2056
2057     return ret;
2058 }
2059
2060 /*
2061  * For every allocation, we will try not to crash the VM if the
2062  * allocation failed.
2063  */
2064 static int xbzrle_init(void)
2065 {
2066     Error *local_err = NULL;
2067
2068     if (!migrate_use_xbzrle()) {
2069         return 0;
2070     }
2071
2072     XBZRLE_cache_lock();
2073
2074     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2075     if (!XBZRLE.zero_target_page) {
2076         error_report("%s: Error allocating zero page", __func__);
2077         goto err_out;
2078     }
2079
2080     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2081                               TARGET_PAGE_SIZE, &local_err);
2082     if (!XBZRLE.cache) {
2083         error_report_err(local_err);
2084         goto free_zero_page;
2085     }
2086
2087     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2088     if (!XBZRLE.encoded_buf) {
2089         error_report("%s: Error allocating encoded_buf", __func__);
2090         goto free_cache;
2091     }
2092
2093     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2094     if (!XBZRLE.current_buf) {
2095         error_report("%s: Error allocating current_buf", __func__);
2096         goto free_encoded_buf;
2097     }
2098
2099     /* We are all good */
2100     XBZRLE_cache_unlock();
2101     return 0;
2102
2103 free_encoded_buf:
2104     g_free(XBZRLE.encoded_buf);
2105     XBZRLE.encoded_buf = NULL;
2106 free_cache:
2107     cache_fini(XBZRLE.cache);
2108     XBZRLE.cache = NULL;
2109 free_zero_page:
2110     g_free(XBZRLE.zero_target_page);
2111     XBZRLE.zero_target_page = NULL;
2112 err_out:
2113     XBZRLE_cache_unlock();
2114     return -ENOMEM;
2115 }
2116
2117 static int ram_state_init(RAMState **rsp)
2118 {
2119     *rsp = g_try_new0(RAMState, 1);
2120
2121     if (!*rsp) {
2122         error_report("%s: Init ramstate fail", __func__);
2123         return -1;
2124     }
2125
2126     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2127     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2128     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2129
2130     /*
2131      * Count the total number of pages used by ram blocks not including any
2132      * gaps due to alignment or unplugs.
2133      */
2134     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2135
2136     ram_state_reset(*rsp);
2137
2138     return 0;
2139 }
2140
2141 static void ram_list_init_bitmaps(void)
2142 {
2143     RAMBlock *block;
2144     unsigned long pages;
2145
2146     /* Skip setting bitmap if there is no RAM */
2147     if (ram_bytes_total()) {
2148         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2149             pages = block->max_length >> TARGET_PAGE_BITS;
2150             block->bmap = bitmap_new(pages);
2151             bitmap_set(block->bmap, 0, pages);
2152             if (migrate_postcopy_ram()) {
2153                 block->unsentmap = bitmap_new(pages);
2154                 bitmap_set(block->unsentmap, 0, pages);
2155             }
2156         }
2157     }
2158 }
2159
/*
 * Set up the migration bitmaps: allocate them for every RAMBlock, start
 * global dirty logging and perform a first bitmap sync for @rs.
 *
 * All of this is done while holding the iothread lock, the ramlist lock
 * and an RCU read-side critical section; they are released in the reverse
 * order of acquisition.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2175
2176 static int ram_init_all(RAMState **rsp)
2177 {
2178     if (ram_state_init(rsp)) {
2179         return -1;
2180     }
2181
2182     if (xbzrle_init()) {
2183         ram_state_cleanup(rsp);
2184         return -1;
2185     }
2186
2187     ram_init_bitmaps(*rsp);
2188
2189     return 0;
2190 }
2191
2192 /*
2193  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2194  * long-running RCU critical section.  When rcu-reclaims in the code
2195  * start to become numerous it will be necessary to reduce the
2196  * granularity of these critical sections.
2197  */
2198
2199 /**
2200  * ram_save_setup: Setup RAM for migration
2201  *
2202  * Returns zero to indicate success and negative for error
2203  *
2204  * @f: QEMUFile where to send the data
2205  * @opaque: RAMState pointer
2206  */
2207 static int ram_save_setup(QEMUFile *f, void *opaque)
2208 {
2209     RAMState **rsp = opaque;
2210     RAMBlock *block;
2211
2212     /* migration has already setup the bitmap, reuse it. */
2213     if (!migration_in_colo_state()) {
2214         if (ram_init_all(rsp) != 0) {
2215             return -1;
2216         }
2217     }
2218     (*rsp)->f = f;
2219
2220     rcu_read_lock();
2221
2222     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2223
2224     RAMBLOCK_FOREACH(block) {
2225         qemu_put_byte(f, strlen(block->idstr));
2226         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2227         qemu_put_be64(f, block->used_length);
2228         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2229             qemu_put_be64(f, block->page_size);
2230         }
2231     }
2232
2233     rcu_read_unlock();
2234     compress_threads_save_setup();
2235
2236     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2237     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2238
2239     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2240
2241     return 0;
2242 }
2243
2244 /**
2245  * ram_save_iterate: iterative stage for migration
2246  *
2247  * Returns zero to indicate success and negative for error
2248  *
2249  * @f: QEMUFile where to send the data
2250  * @opaque: RAMState pointer
2251  */
2252 static int ram_save_iterate(QEMUFile *f, void *opaque)
2253 {
2254     RAMState **temp = opaque;
2255     RAMState *rs = *temp;
2256     int ret;
2257     int i;
2258     int64_t t0;
2259     int done = 0;
2260
2261     rcu_read_lock();
2262     if (ram_list.version != rs->last_version) {
2263         ram_state_reset(rs);
2264     }
2265
2266     /* Read version before ram_list.blocks */
2267     smp_rmb();
2268
2269     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2270
2271     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2272     i = 0;
2273     while ((ret = qemu_file_rate_limit(f)) == 0) {
2274         int pages;
2275
2276         pages = ram_find_and_save_block(rs, false);
2277         /* no more pages to sent */
2278         if (pages == 0) {
2279             done = 1;
2280             break;
2281         }
2282         rs->iterations++;
2283
2284         /* we want to check in the 1st loop, just in case it was the 1st time
2285            and we had to sync the dirty bitmap.
2286            qemu_get_clock_ns() is a bit expensive, so we only check each some
2287            iterations
2288         */
2289         if ((i & 63) == 0) {
2290             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2291             if (t1 > MAX_WAIT) {
2292                 trace_ram_save_iterate_big_wait(t1, i);
2293                 break;
2294             }
2295         }
2296         i++;
2297     }
2298     flush_compressed_data(rs);
2299     rcu_read_unlock();
2300
2301     /*
2302      * Must occur before EOS (or any QEMUFile operation)
2303      * because of RDMA protocol.
2304      */
2305     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2306
2307     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2308     ram_counters.transferred += 8;
2309
2310     ret = qemu_file_get_error(f);
2311     if (ret < 0) {
2312         return ret;
2313     }
2314
2315     return done;
2316 }
2317
2318 /**
2319  * ram_save_complete: function called to send the remaining amount of ram
2320  *
2321  * Returns zero to indicate success
2322  *
2323  * Called with iothread lock
2324  *
2325  * @f: QEMUFile where to send the data
2326  * @opaque: RAMState pointer
2327  */
2328 static int ram_save_complete(QEMUFile *f, void *opaque)
2329 {
2330     RAMState **temp = opaque;
2331     RAMState *rs = *temp;
2332
2333     rcu_read_lock();
2334
2335     if (!migration_in_postcopy()) {
2336         migration_bitmap_sync(rs);
2337     }
2338
2339     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2340
2341     /* try transferring iterative blocks of memory */
2342
2343     /* flush all remaining blocks regardless of rate limiting */
2344     while (true) {
2345         int pages;
2346
2347         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2348         /* no more blocks to sent */
2349         if (pages == 0) {
2350             break;
2351         }
2352     }
2353
2354     flush_compressed_data(rs);
2355     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2356
2357     rcu_read_unlock();
2358
2359     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2360
2361     return 0;
2362 }
2363
2364 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2365                              uint64_t *non_postcopiable_pending,
2366                              uint64_t *postcopiable_pending)
2367 {
2368     RAMState **temp = opaque;
2369     RAMState *rs = *temp;
2370     uint64_t remaining_size;
2371
2372     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2373
2374     if (!migration_in_postcopy() &&
2375         remaining_size < max_size) {
2376         qemu_mutex_lock_iothread();
2377         rcu_read_lock();
2378         migration_bitmap_sync(rs);
2379         rcu_read_unlock();
2380         qemu_mutex_unlock_iothread();
2381         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2382     }
2383
2384     if (migrate_postcopy_ram()) {
2385         /* We can do postcopy, and all the data is postcopiable */
2386         *postcopiable_pending += remaining_size;
2387     } else {
2388         *non_postcopiable_pending += remaining_size;
2389     }
2390 }
2391
2392 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2393 {
2394     unsigned int xh_len;
2395     int xh_flags;
2396     uint8_t *loaded_data;
2397
2398     /* extract RLE header */
2399     xh_flags = qemu_get_byte(f);
2400     xh_len = qemu_get_be16(f);
2401
2402     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2403         error_report("Failed to load XBZRLE page - wrong compression!");
2404         return -1;
2405     }
2406
2407     if (xh_len > TARGET_PAGE_SIZE) {
2408         error_report("Failed to load XBZRLE page - len overflow!");
2409         return -1;
2410     }
2411     loaded_data = XBZRLE.decoded_buf;
2412     /* load data and decode */
2413     /* it can change loaded_data to point to an internal buffer */
2414     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2415
2416     /* decode RLE */
2417     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2418                              TARGET_PAGE_SIZE) == -1) {
2419         error_report("Failed to load XBZRLE page - decode error!");
2420         return -1;
2421     }
2422
2423     return 0;
2424 }
2425
2426 /**
2427  * ram_block_from_stream: read a RAMBlock id from the migration stream
2428  *
2429  * Must be called from within a rcu critical section.
2430  *
2431  * Returns a pointer from within the RCU-protected ram_list.
2432  *
2433  * @f: QEMUFile where to read the data from
2434  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2435  */
2436 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2437 {
2438     static RAMBlock *block = NULL;
2439     char id[256];
2440     uint8_t len;
2441
2442     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2443         if (!block) {
2444             error_report("Ack, bad migration stream!");
2445             return NULL;
2446         }
2447         return block;
2448     }
2449
2450     len = qemu_get_byte(f);
2451     qemu_get_buffer(f, (uint8_t *)id, len);
2452     id[len] = 0;
2453
2454     block = qemu_ram_block_by_name(id);
2455     if (!block) {
2456         error_report("Can't find block %s", id);
2457         return NULL;
2458     }
2459
2460     return block;
2461 }
2462
2463 static inline void *host_from_ram_block_offset(RAMBlock *block,
2464                                                ram_addr_t offset)
2465 {
2466     if (!offset_in_ramblock(block, offset)) {
2467         return NULL;
2468     }
2469
2470     return block->host + offset;
2471 }
2472
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is only written when that is needed: if @ch is zero and the
 * range is already all zero, it is left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and asked to fill with zero: nothing to do */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2489
2490 static void *do_data_decompress(void *opaque)
2491 {
2492     DecompressParam *param = opaque;
2493     unsigned long pagesize;
2494     uint8_t *des;
2495     int len;
2496
2497     qemu_mutex_lock(&param->mutex);
2498     while (!param->quit) {
2499         if (param->des) {
2500             des = param->des;
2501             len = param->len;
2502             param->des = 0;
2503             qemu_mutex_unlock(&param->mutex);
2504
2505             pagesize = TARGET_PAGE_SIZE;
2506             /* uncompress() will return failed in some case, especially
2507              * when the page is dirted when doing the compression, it's
2508              * not a problem because the dirty page will be retransferred
2509              * and uncompress() won't break the data in other pages.
2510              */
2511             uncompress((Bytef *)des, &pagesize,
2512                        (const Bytef *)param->compbuf, len);
2513
2514             qemu_mutex_lock(&decomp_done_lock);
2515             param->done = true;
2516             qemu_cond_signal(&decomp_done_cond);
2517             qemu_mutex_unlock(&decomp_done_lock);
2518
2519             qemu_mutex_lock(&param->mutex);
2520         } else {
2521             qemu_cond_wait(&param->cond, &param->mutex);
2522         }
2523     }
2524     qemu_mutex_unlock(&param->mutex);
2525
2526     return NULL;
2527 }
2528
2529 static void wait_for_decompress_done(void)
2530 {
2531     int idx, thread_count;
2532
2533     if (!migrate_use_compression()) {
2534         return;
2535     }
2536
2537     thread_count = migrate_decompress_threads();
2538     qemu_mutex_lock(&decomp_done_lock);
2539     for (idx = 0; idx < thread_count; idx++) {
2540         while (!decomp_param[idx].done) {
2541             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2542         }
2543     }
2544     qemu_mutex_unlock(&decomp_done_lock);
2545 }
2546
2547 static void compress_threads_load_setup(void)
2548 {
2549     int i, thread_count;
2550
2551     if (!migrate_use_compression()) {
2552         return;
2553     }
2554     thread_count = migrate_decompress_threads();
2555     decompress_threads = g_new0(QemuThread, thread_count);
2556     decomp_param = g_new0(DecompressParam, thread_count);
2557     qemu_mutex_init(&decomp_done_lock);
2558     qemu_cond_init(&decomp_done_cond);
2559     for (i = 0; i < thread_count; i++) {
2560         qemu_mutex_init(&decomp_param[i].mutex);
2561         qemu_cond_init(&decomp_param[i].cond);
2562         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2563         decomp_param[i].done = true;
2564         decomp_param[i].quit = false;
2565         qemu_thread_create(decompress_threads + i, "decompress",
2566                            do_data_decompress, decomp_param + i,
2567                            QEMU_THREAD_JOINABLE);
2568     }
2569 }
2570
2571 static void compress_threads_load_cleanup(void)
2572 {
2573     int i, thread_count;
2574
2575     if (!migrate_use_compression()) {
2576         return;
2577     }
2578     thread_count = migrate_decompress_threads();
2579     for (i = 0; i < thread_count; i++) {
2580         qemu_mutex_lock(&decomp_param[i].mutex);
2581         decomp_param[i].quit = true;
2582         qemu_cond_signal(&decomp_param[i].cond);
2583         qemu_mutex_unlock(&decomp_param[i].mutex);
2584     }
2585     for (i = 0; i < thread_count; i++) {
2586         qemu_thread_join(decompress_threads + i);
2587         qemu_mutex_destroy(&decomp_param[i].mutex);
2588         qemu_cond_destroy(&decomp_param[i].cond);
2589         g_free(decomp_param[i].compbuf);
2590     }
2591     g_free(decompress_threads);
2592     g_free(decomp_param);
2593     decompress_threads = NULL;
2594     decomp_param = NULL;
2595 }
2596
2597 static void decompress_data_with_multi_threads(QEMUFile *f,
2598                                                void *host, int len)
2599 {
2600     int idx, thread_count;
2601
2602     thread_count = migrate_decompress_threads();
2603     qemu_mutex_lock(&decomp_done_lock);
2604     while (true) {
2605         for (idx = 0; idx < thread_count; idx++) {
2606             if (decomp_param[idx].done) {
2607                 decomp_param[idx].done = false;
2608                 qemu_mutex_lock(&decomp_param[idx].mutex);
2609                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2610                 decomp_param[idx].des = host;
2611                 decomp_param[idx].len = len;
2612                 qemu_cond_signal(&decomp_param[idx].cond);
2613                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2614                 break;
2615             }
2616         }
2617         if (idx < thread_count) {
2618             break;
2619         } else {
2620             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2621         }
2622     }
2623     qemu_mutex_unlock(&decomp_done_lock);
2624 }
2625
2626 /**
2627  * ram_load_setup: Setup RAM for migration incoming side
2628  *
2629  * Returns zero to indicate success and negative for error
2630  *
2631  * @f: QEMUFile where to receive the data
2632  * @opaque: RAMState pointer
2633  */
2634 static int ram_load_setup(QEMUFile *f, void *opaque)
2635 {
2636     xbzrle_load_setup();
2637     compress_threads_load_setup();
2638     ramblock_recv_map_init();
2639     return 0;
2640 }
2641
2642 static int ram_load_cleanup(void *opaque)
2643 {
2644     RAMBlock *rb;
2645     xbzrle_load_cleanup();
2646     compress_threads_load_cleanup();
2647
2648     RAMBLOCK_FOREACH(rb) {
2649         g_free(rb->receivedmap);
2650         rb->receivedmap = NULL;
2651     }
2652     return 0;
2653 }
2654
2655 /**
2656  * ram_postcopy_incoming_init: allocate postcopy data structures
2657  *
2658  * Returns 0 for success and negative if there was one error
2659  *
2660  * @mis: current migration incoming state
2661  *
2662  * Allocate data structures etc needed by incoming migration with
2663  * postcopy-ram. postcopy-ram's similarly names
2664  * postcopy_ram_incoming_init does the work.
2665  */
2666 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2667 {
2668     unsigned long ram_pages = last_ram_page();
2669
2670     return postcopy_ram_incoming_init(mis, ram_pages);
2671 }
2672
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads page records from the stream until RAM_SAVE_FLAG_EOS is seen or an
 * error occurs.  Target pages are gathered in a temporary host page and
 * the whole host page is placed once its last target page has arrived.
 *
 * @f: QEMUFile where to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    /* Host address of the previous record; used for the ordering check */
    void *last_host = NULL;
    /* True while every target page seen for the current host page is zero */
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* The bits below the target page size carry the record flags */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                /* First TP of a host page: start assuming it is all zero */
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                  host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }


            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                                     (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            /* A single fill byte follows; expand it into the temporary page */
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            /* Step back from the last target page to the host page start */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
2817
2818 static bool postcopy_is_advised(void)
2819 {
2820     PostcopyState ps = postcopy_state_get();
2821     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2822 }
2823
2824 static bool postcopy_is_running(void)
2825 {
2826     PostcopyState ps = postcopy_state_get();
2827     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2828 }
2829
2830 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2831 {
2832     int flags = 0, ret = 0, invalid_flags = 0;
2833     static uint64_t seq_iter;
2834     int len = 0;
2835     /*
2836      * If system is running in postcopy mode, page inserts to host memory must
2837      * be atomic
2838      */
2839     bool postcopy_running = postcopy_is_running();
2840     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2841     bool postcopy_advised = postcopy_is_advised();
2842
2843     seq_iter++;
2844
2845     if (version_id != 4) {
2846         ret = -EINVAL;
2847     }
2848
2849     if (!migrate_use_compression()) {
2850         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2851     }
2852     /* This RCU critical section can be very long running.
2853      * When RCU reclaims in the code start to become numerous,
2854      * it will be necessary to reduce the granularity of this
2855      * critical section.
2856      */
2857     rcu_read_lock();
2858
2859     if (postcopy_running) {
2860         ret = ram_load_postcopy(f);
2861     }
2862
2863     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2864         ram_addr_t addr, total_ram_bytes;
2865         void *host = NULL;
2866         uint8_t ch;
2867
2868         addr = qemu_get_be64(f);
2869         flags = addr & ~TARGET_PAGE_MASK;
2870         addr &= TARGET_PAGE_MASK;
2871
2872         if (flags & invalid_flags) {
2873             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2874                 error_report("Received an unexpected compressed page");
2875             }
2876
2877             ret = -EINVAL;
2878             break;
2879         }
2880
2881         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2882                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2883             RAMBlock *block = ram_block_from_stream(f, flags);
2884
2885             host = host_from_ram_block_offset(block, addr);
2886             if (!host) {
2887                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2888                 ret = -EINVAL;
2889                 break;
2890             }
2891             ramblock_recv_bitmap_set(block, host);
2892             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2893         }
2894
2895         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2896         case RAM_SAVE_FLAG_MEM_SIZE:
2897             /* Synchronize RAM block list */
2898             total_ram_bytes = addr;
2899             while (!ret && total_ram_bytes) {
2900                 RAMBlock *block;
2901                 char id[256];
2902                 ram_addr_t length;
2903
2904                 len = qemu_get_byte(f);
2905                 qemu_get_buffer(f, (uint8_t *)id, len);
2906                 id[len] = 0;
2907                 length = qemu_get_be64(f);
2908
2909                 block = qemu_ram_block_by_name(id);
2910                 if (block) {
2911                     if (length != block->used_length) {
2912                         Error *local_err = NULL;
2913
2914                         ret = qemu_ram_resize(block, length,
2915                                               &local_err);
2916                         if (local_err) {
2917                             error_report_err(local_err);
2918                         }
2919                     }
2920                     /* For postcopy we need to check hugepage sizes match */
2921                     if (postcopy_advised &&
2922                         block->page_size != qemu_host_page_size) {
2923                         uint64_t remote_page_size = qemu_get_be64(f);
2924                         if (remote_page_size != block->page_size) {
2925                             error_report("Mismatched RAM page size %s "
2926                                          "(local) %zd != %" PRId64,
2927                                          id, block->page_size,
2928                                          remote_page_size);
2929                             ret = -EINVAL;
2930                         }
2931                     }
2932                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2933                                           block->idstr);
2934                 } else {
2935                     error_report("Unknown ramblock \"%s\", cannot "
2936                                  "accept migration", id);
2937                     ret = -EINVAL;
2938                 }
2939
2940                 total_ram_bytes -= length;
2941             }
2942             break;
2943
2944         case RAM_SAVE_FLAG_ZERO:
2945             ch = qemu_get_byte(f);
2946             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2947             break;
2948
2949         case RAM_SAVE_FLAG_PAGE:
2950             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2951             break;
2952
2953         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2954             len = qemu_get_be32(f);
2955             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2956                 error_report("Invalid compressed data length: %d", len);
2957                 ret = -EINVAL;
2958                 break;
2959             }
2960             decompress_data_with_multi_threads(f, host, len);
2961             break;
2962
2963         case RAM_SAVE_FLAG_XBZRLE:
2964             if (load_xbzrle(f, addr, host) < 0) {
2965                 error_report("Failed to decompress XBZRLE page at "
2966                              RAM_ADDR_FMT, addr);
2967                 ret = -EINVAL;
2968                 break;
2969             }
2970             break;
2971         case RAM_SAVE_FLAG_EOS:
2972             /* normal exit */
2973             break;
2974         default:
2975             if (flags & RAM_SAVE_FLAG_HOOK) {
2976                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2977             } else {
2978                 error_report("Unknown combination of migration flags: %#x",
2979                              flags);
2980                 ret = -EINVAL;
2981             }
2982         }
2983         if (!ret) {
2984             ret = qemu_file_get_error(f);
2985         }
2986     }
2987
2988     wait_for_decompress_done();
2989     rcu_read_unlock();
2990     trace_ram_load_complete(ret, seq_iter);
2991     return ret;
2992 }
2993
/**
 * ram_has_postcopy: has_postcopy handler for the "ram" section
 *
 * Returns the value reported by migrate_postcopy_ram().
 *
 * @opaque: not used by this function
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_ram = migrate_postcopy_ram();

    return postcopy_ram;
}
2999 static SaveVMHandlers savevm_ram_handlers = {
3000     .save_setup = ram_save_setup,
3001     .save_live_iterate = ram_save_iterate,
3002     .save_live_complete_postcopy = ram_save_complete,
3003     .save_live_complete_precopy = ram_save_complete,
3004     .has_postcopy = ram_has_postcopy,
3005     .save_live_pending = ram_save_pending,
3006     .load_state = ram_load,
3007     .save_cleanup = ram_save_cleanup,
3008     .load_setup = ram_load_setup,
3009     .load_cleanup = ram_load_cleanup,
3010 };
3011
3012 void ram_mig_init(void)
3013 {
3014     qemu_mutex_init(&XBZRLE.lock);
3015     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3016 }