migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "xbzrle.h"
  39 #include "ram.h"
  40 #include "migration/migration.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "migration/vmstate.h"
  44 #include "postcopy-ram.h"
  45 #include "exec/address-spaces.h"
  46 #include "migration/page_cache.h"
  47 #include "qemu/error-report.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52
  53 /***********************************************************/
  54 /* ram save/restore */
  55
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  61
/* Flags OR-ed into the low bits of the 64-bit offset word that
 * save_page_header() writes in front of each page.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
/* Page is all zeros; only a single byte follows (see save_zero_page()) */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
/* A full, uncompressed page follows the header (see ram_save_page()) */
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same block as the previous page; the block id string is omitted */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* An XBZRLE-encoded page follows (see save_xbzrle_page()) */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
/* Page data written by qemu_put_compression_data() follows
 * (see do_compress_ram_page()) */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  71
/* Page contents inserted into the XBZRLE cache by xbzrle_cache_zero_page().
 * NOTE(review): presumably a TARGET_PAGE_SIZE buffer of zeros; it is
 * allocated outside this part of the file - confirm.
 */
static uint8_t *ZERO_TARGET_PAGE;
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding; receives the output of
     * xbzrle_encode_buffer() before it is written to the stream */
    uint8_t *encoded_buf;
    /* buffer for storing page content; the page being encoded is
     * copied here before it is compared with the cached copy */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock() / XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;
  90
/* buffer used for XBZRLE decoding (allocated and used outside this
 * part of the file) */
static uint8_t *xbzrle_decoded_buf;
  93
  94 static void XBZRLE_cache_lock(void)
  95 {
  96     if (migrate_use_xbzrle())
  97         qemu_mutex_lock(&XBZRLE.lock);
  98 }
  99
 100 static void XBZRLE_cache_unlock(void)
 101 {
 102     if (migrate_use_xbzrle())
 103         qemu_mutex_unlock(&XBZRLE.lock);
 104 }
 105
 106 /**
 107  * xbzrle_cache_resize: resize the xbzrle cache
 108  *
 109  * This function is called from qmp_migrate_set_cache_size in main
 110  * thread, possibly while a migration is in progress.  A running
 111  * migration may be using the cache and might finish during this call,
 112  * hence changes to the cache are protected by XBZRLE.lock().
 113  *
 114  * Returns the new_size or negative in case of error.
 115  *
 116  * @new_size: new cache size
 117  */
 118 int64_t xbzrle_cache_resize(int64_t new_size)
 119 {
 120     PageCache *new_cache;
 121     int64_t ret;
 122
 123     if (new_size < TARGET_PAGE_SIZE) {
 124         return -1;
 125     }
 126
 127     XBZRLE_cache_lock();
 128
 129     if (XBZRLE.cache != NULL) {
 130         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 131             goto out_new_size;
 132         }
 133         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 134                                         TARGET_PAGE_SIZE);
 135         if (!new_cache) {
 136             error_report("Error creating cache");
 137             ret = -1;
 138             goto out;
 139         }
 140
 141         cache_fini(XBZRLE.cache);
 142         XBZRLE.cache = new_cache;
 143     }
 144
 145 out_new_size:
 146     ret = pow2floor(new_size);
 147 out:
 148     XBZRLE_cache_unlock();
 149     return ret;
 150 }
 151
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block the requested range lives in,
     * with offset/len describing the range inside it - the code that
     * fills these in is outside this part of the file; confirm. */
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 163
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses
     * it to decide whether the block id can be omitted */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* How many times we have synchronized the bitmap */
    uint64_t bitmap_sync_count;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Accounting fields */
    /* number of zero pages.  It used to be pages filled by the same char. */
    uint64_t zero_pages;
    /* number of normal transferred pages */
    uint64_t norm_pages;
    /* Iterations since start */
    uint64_t iterations;
    /* xbzrle transmitted bytes.  Notice that this is with
     * compression, they can't be calculated from the pages */
    uint64_t xbzrle_bytes;
    /* xbzrle transmitted pages */
    uint64_t xbzrle_pages;
    /* xbzrle number of cache misses */
    uint64_t xbzrle_cache_miss;
    /* xbzrle miss rate */
    double xbzrle_cache_miss_rate;
    /* xbzrle number of overflows */
    uint64_t xbzrle_overflows;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* total number of bytes transferred */
    uint64_t bytes_transferred;
    /* number of dirtied pages in the last second */
    uint64_t dirty_pages_rate;
    /* Count of requests incoming from destination */
    uint64_t postcopy_requests;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 228
/* The single RAM migration state of this file; read by the statistics
 * accessors and by do_compress_ram_page(). */
static RAMState ram_state;
 230
 231 uint64_t dup_mig_pages_transferred(void)
 232 {
 233     return ram_state.zero_pages;
 234 }
 235
 236 uint64_t norm_mig_pages_transferred(void)
 237 {
 238     return ram_state.norm_pages;
 239 }
 240
 241 uint64_t xbzrle_mig_bytes_transferred(void)
 242 {
 243     return ram_state.xbzrle_bytes;
 244 }
 245
 246 uint64_t xbzrle_mig_pages_transferred(void)
 247 {
 248     return ram_state.xbzrle_pages;
 249 }
 250
 251 uint64_t xbzrle_mig_pages_cache_miss(void)
 252 {
 253     return ram_state.xbzrle_cache_miss;
 254 }
 255
 256 double xbzrle_mig_cache_miss_rate(void)
 257 {
 258     return ram_state.xbzrle_cache_miss_rate;
 259 }
 260
 261 uint64_t xbzrle_mig_pages_overflow(void)
 262 {
 263     return ram_state.xbzrle_overflows;
 264 }
 265
 266 uint64_t ram_bytes_transferred(void)
 267 {
 268     return ram_state.bytes_transferred;
 269 }
 270
 271 uint64_t ram_bytes_remaining(void)
 272 {
 273     return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
 274 }
 275
 276 uint64_t ram_dirty_sync_count(void)
 277 {
 278     return ram_state.bitmap_sync_count;
 279 }
 280
 281 uint64_t ram_dirty_pages_rate(void)
 282 {
 283     return ram_state.dirty_pages_rate;
 284 }
 285
 286 uint64_t ram_postcopy_requests(void)
 287 {
 288     return ram_state.postcopy_requests;
 289 }
 290
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from; a page index inside @block
     * (ram_save_page() shifts it by TARGET_PAGE_BITS to get the offset) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 301
/* Per-thread state of one compression thread (see do_data_compress()) */
struct CompressParam {
    /* True when the thread has no compression in flight.  Set by the
     * thread and cleared by the migration thread, both while holding
     * comp_done_lock. */
    bool done;
    /* Asks the thread to leave its loop; set under @mutex */
    bool quit;
    /* Dummy QEMUFile that buffers this thread's output until the
     * migration thread copies it out with qemu_put_qemu_file() */
    QEMUFile *file;
    /* @mutex and @cond hand a new @block/@offset over to the thread */
    QemuMutex mutex;
    QemuCond cond;
    /* Page to compress; @block is NULL when no work is pending */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 312
/* Per-thread state of one decompression thread.
 * NOTE(review): the code using this structure is outside this part of
 * the file; the field notes below are inferred from the names and from
 * the parallel CompressParam - confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 323
/* One entry per compression thread; allocated in
 * migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join() */
static CompressParam *comp_param;
/* Thread handles, indexed like comp_param */
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts; their users are outside this part
 * of the file */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined further down; needed by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 342
/*
 * Body of a compression thread.
 *
 * Waits on param->cond until a page is handed over through
 * param->block/param->offset, compresses it into param->file and then
 * reports completion through param->done and comp_done_cond.  Loops
 * until param->quit is set.
 *
 * @opaque: the CompressParam owned by this thread
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the request and mark the slot empty while still
             * holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            do_compress_ram_page(param->file, block, offset);

            /* param->done is protected by comp_done_lock, not by
             * param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 373
 374 static inline void terminate_compression_threads(void)
 375 {
 376     int idx, thread_count;
 377
 378     thread_count = migrate_compress_threads();
 379
 380     for (idx = 0; idx < thread_count; idx++) {
 381         qemu_mutex_lock(&comp_param[idx].mutex);
 382         comp_param[idx].quit = true;
 383         qemu_cond_signal(&comp_param[idx].cond);
 384         qemu_mutex_unlock(&comp_param[idx].mutex);
 385     }
 386 }
 387
 388 void migrate_compress_threads_join(void)
 389 {
 390     int i, thread_count;
 391
 392     if (!migrate_use_compression()) {
 393         return;
 394     }
 395     terminate_compression_threads();
 396     thread_count = migrate_compress_threads();
 397     for (i = 0; i < thread_count; i++) {
 398         qemu_thread_join(compress_threads + i);
 399         qemu_fclose(comp_param[i].file);
 400         qemu_mutex_destroy(&comp_param[i].mutex);
 401         qemu_cond_destroy(&comp_param[i].cond);
 402     }
 403     qemu_mutex_destroy(&comp_done_lock);
 404     qemu_cond_destroy(&comp_done_cond);
 405     g_free(compress_threads);
 406     g_free(comp_param);
 407     compress_threads = NULL;
 408     comp_param = NULL;
 409 }
 410
 411 void migrate_compress_threads_create(void)
 412 {
 413     int i, thread_count;
 414
 415     if (!migrate_use_compression()) {
 416         return;
 417     }
 418     thread_count = migrate_compress_threads();
 419     compress_threads = g_new0(QemuThread, thread_count);
 420     comp_param = g_new0(CompressParam, thread_count);
 421     qemu_cond_init(&comp_done_cond);
 422     qemu_mutex_init(&comp_done_lock);
 423     for (i = 0; i < thread_count; i++) {
 424         /* comp_param[i].file is just used as a dummy buffer to save data,
 425          * set its ops to empty.
 426          */
 427         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 428         comp_param[i].done = true;
 429         comp_param[i].quit = false;
 430         qemu_mutex_init(&comp_param[i].mutex);
 431         qemu_cond_init(&comp_param[i].cond);
 432         qemu_thread_create(compress_threads + i, "compress",
 433                            do_data_compress, comp_param + i,
 434                            QEMU_THREAD_JOINABLE);
 435     }
 436 }
 437
 438 /**
 439  * save_page_header: write page header to wire
 440  *
 441  * If this is the 1st block, it also writes the block identification
 442  *
 443  * Returns the number of bytes written
 444  *
 445  * @f: QEMUFile where to send the data
 446  * @block: block that contains the page we want to send
 447  * @offset: offset inside the block for the page
 448  *          in the lower bits, it contains flags
 449  */
 450 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 451                                ram_addr_t offset)
 452 {
 453     size_t size, len;
 454
 455     if (block == rs->last_sent_block) {
 456         offset |= RAM_SAVE_FLAG_CONTINUE;
 457     }
 458     qemu_put_be64(f, offset);
 459     size = 8;
 460
 461     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 462         len = strlen(block->idstr);
 463         qemu_put_byte(f, len);
 464         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 465         size += 1 + len;
 466         rs->last_sent_block = block;
 467     }
 468     return size;
 469 }
 470
 471 /**
 472  * mig_throttle_guest_down: throotle down the guest
 473  *
 474  * Reduce amount of guest cpu execution to hopefully slow down memory
 475  * writes. If guest dirty memory rate is reduced below the rate at
 476  * which we can transfer pages to the destination then we should be
 477  * able to complete migration. Some workloads dirty memory way too
 478  * fast and will not effectively converge, even with auto-converge.
 479  */
 480 static void mig_throttle_guest_down(void)
 481 {
 482     MigrationState *s = migrate_get_current();
 483     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 484     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 485
 486     /* We have not started throttling yet. Let's start it. */
 487     if (!cpu_throttle_active()) {
 488         cpu_throttle_set(pct_initial);
 489     } else {
 490         /* Throttling already on, just increase the rate */
 491         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 492     }
 493 }
 494
 495 /**
 496  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 497  *
 498  * @rs: current RAM state
 499  * @current_addr: address for the zero page
 500  *
 501  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 502  * The important thing is that a stale (not-yet-0'd) page be replaced
 503  * by the new data.
 504  * As a bonus, if the page wasn't in the cache it gets added so that
 505  * when a small write is made into the 0'd page it gets XBZRLE sent.
 506  */
 507 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 508 {
 509     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 510         return;
 511     }
 512
 513     /* We don't care if this fails to allocate a new cache page
 514      * as long as it updated an old one */
 515     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 516                  rs->bitmap_sync_count);
 517 }
 518
/* Encoding byte written right after the page header of an XBZRLE page
 * (see save_xbzrle_page()) */
#define ENCODING_FLAG_XBZRLE 0x1
 520
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that the page was not written here: either it was
 *             not in the cache, or xbzrle would be longer than normal
 *
 * Unless @last_stage is set, the cache is brought up to date with the
 * page contents, and on the -1 paths *@current_data may be redirected
 * to the cached copy of the page.
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to diff against */
    if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
        rs->xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             rs->bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        /* No difference from the cached copy: nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        rs->xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* account for the encoding byte and the 16-bit length as well */
    bytes_xbzrle += encoded_len + 1 + 2;
    rs->xbzrle_pages++;
    rs->xbzrle_bytes += bytes_xbzrle;
    rs->bytes_transferred += bytes_xbzrle;

    return 1;
}
 598
 599 /**
 600  * migration_bitmap_find_dirty: find the next dirty page from start
 601  *
 602  * Called with rcu_read_lock() to protect migration_bitmap
 603  *
 604  * Returns the byte offset within memory region of the start of a dirty page
 605  *
 606  * @rs: current RAM state
 607  * @rb: RAMBlock where to search for dirty pages
 608  * @start: page where we start the search
 609  */
 610 static inline
 611 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 612                                           unsigned long start)
 613 {
 614     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 615     unsigned long *bitmap = rb->bmap;
 616     unsigned long next;
 617
 618     if (rs->ram_bulk_stage && start > 0) {
 619         next = start + 1;
 620     } else {
 621         next = find_next_bit(bitmap, size, start);
 622     }
 623
 624     return next;
 625 }
 626
 627 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 628                                                 RAMBlock *rb,
 629                                                 unsigned long page)
 630 {
 631     bool ret;
 632
 633     ret = test_and_clear_bit(page, rb->bmap);
 634
 635     if (ret) {
 636         rs->migration_dirty_pages--;
 637     }
 638     return ret;
 639 }
 640
 641 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 642                                         ram_addr_t start, ram_addr_t length)
 643 {
 644     rs->migration_dirty_pages +=
 645         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 646                                               &rs->num_dirty_pages_period);
 647 }
 648
 649 /**
 650  * ram_pagesize_summary: calculate all the pagesizes of a VM
 651  *
 652  * Returns a summary bitmap of the page sizes of all RAMBlocks
 653  *
 654  * For VMs with just normal pages this is equivalent to the host page
 655  * size. If it's got some huge pages then it's the OR of all the
 656  * different page sizes.
 657  */
 658 uint64_t ram_pagesize_summary(void)
 659 {
 660     RAMBlock *block;
 661     uint64_t summary = 0;
 662
 663     RAMBLOCK_FOREACH(block) {
 664         summary |= block->page_size;
 665     }
 666
 667     return summary;
 668 }
 669
/*
 * Synchronize the migration dirty bitmaps of all RAMBlocks and, about
 * once per second, refresh the period statistics (dirty page rate,
 * XBZRLE cache miss rate) and the auto-converge throttling decision.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    rs->bitmap_sync_count++;

    /* First call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_bytes_transferred();

        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes exceed 50% of the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            /* Misses per iteration over the period; skipped when no
             * iteration happened, to avoid dividing by zero */
            if (rs->iterations_prev != rs->iterations) {
                rs->xbzrle_cache_miss_rate =
                   (double)(rs->xbzrle_cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
    }
}
 740
 741 /**
 742  * save_zero_page: send the zero page to the stream
 743  *
 744  * Returns the number of pages written.
 745  *
 746  * @rs: current RAM state
 747  * @block: block that contains the page we want to send
 748  * @offset: offset inside the block for the page
 749  * @p: pointer to the page
 750  */
 751 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 752                           uint8_t *p)
 753 {
 754     int pages = -1;
 755
 756     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 757         rs->zero_pages++;
 758         rs->bytes_transferred +=
 759             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 760         qemu_put_byte(rs->f, 0);
 761         rs->bytes_transferred += 1;
 762         pages = 1;
 763     }
 764
 765     return pages;
 766 }
 767
 768 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 769 {
 770     if (!migrate_release_ram() || !migration_in_postcopy()) {
 771         return;
 772     }
 773
 774     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 775 }
 776
 777 /**
 778  * ram_save_page: send the given page to the stream
 779  *
 780  * Returns the number of pages written.
 781  *          < 0 - error
 782  *          >=0 - Number of pages written - this might legally be 0
 783  *                if xbzrle noticed the page was the same.
 784  *
 785  * @rs: current RAM state
 786  * @block: block that contains the page we want to send
 787  * @offset: offset inside the block for the page
 788  * @last_stage: if we are at the completion stage
 789  */
 790 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 791 {
 792     int pages = -1;
 793     uint64_t bytes_xmit;
 794     ram_addr_t current_addr;
 795     uint8_t *p;
 796     int ret;
 797     bool send_async = true;
 798     RAMBlock *block = pss->block;
 799     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 800
 801     p = block->host + offset;
 802     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 803
 804     /* In doubt sent page as normal */
 805     bytes_xmit = 0;
 806     ret = ram_control_save_page(rs->f, block->offset,
 807                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 808     if (bytes_xmit) {
 809         rs->bytes_transferred += bytes_xmit;
 810         pages = 1;
 811     }
 812
 813     XBZRLE_cache_lock();
 814
 815     current_addr = block->offset + offset;
 816
 817     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 818         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 819             if (bytes_xmit > 0) {
 820                 rs->norm_pages++;
 821             } else if (bytes_xmit == 0) {
 822                 rs->zero_pages++;
 823             }
 824         }
 825     } else {
 826         pages = save_zero_page(rs, block, offset, p);
 827         if (pages > 0) {
 828             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 829              * page would be stale
 830              */
 831             xbzrle_cache_zero_page(rs, current_addr);
 832             ram_release_pages(block->idstr, offset, pages);
 833         } else if (!rs->ram_bulk_stage &&
 834                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 835             pages = save_xbzrle_page(rs, &p, current_addr, block,
 836                                      offset, last_stage);
 837             if (!last_stage) {
 838                 /* Can't send this cached data async, since the cache page
 839                  * might get updated before it gets to the wire
 840                  */
 841                 send_async = false;
 842             }
 843         }
 844     }
 845
 846     /* XBZRLE overflow or normal page */
 847     if (pages == -1) {
 848         rs->bytes_transferred += save_page_header(rs, rs->f, block,
 849                                                   offset | RAM_SAVE_FLAG_PAGE);
 850         if (send_async) {
 851             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 852                                   migrate_release_ram() &
 853                                   migration_in_postcopy());
 854         } else {
 855             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 856         }
 857         rs->bytes_transferred += TARGET_PAGE_SIZE;
 858         pages = 1;
 859         rs->norm_pages++;
 860     }
 861
 862     XBZRLE_cache_unlock();
 863
 864     return pages;
 865 }
 866
 867 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 868                                 ram_addr_t offset)
 869 {
 870     RAMState *rs = &ram_state;
 871     int bytes_sent, blen;
 872     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 873
 874     bytes_sent = save_page_header(rs, f, block, offset |
 875                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 876     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 877                                      migrate_compress_level());
 878     if (blen < 0) {
 879         bytes_sent = 0;
 880         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 881         error_report("compressed data failed!");
 882     } else {
 883         bytes_sent += blen;
 884         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 885     }
 886
 887     return bytes_sent;
 888 }
 889
 890 static void flush_compressed_data(RAMState *rs)
 891 {
 892     int idx, len, thread_count;
 893
 894     if (!migrate_use_compression()) {
 895         return;
 896     }
 897     thread_count = migrate_compress_threads();
 898
 899     qemu_mutex_lock(&comp_done_lock);
 900     for (idx = 0; idx < thread_count; idx++) {
 901         while (!comp_param[idx].done) {
 902             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 903         }
 904     }
 905     qemu_mutex_unlock(&comp_done_lock);
 906
 907     for (idx = 0; idx < thread_count; idx++) {
 908         qemu_mutex_lock(&comp_param[idx].mutex);
 909         if (!comp_param[idx].quit) {
 910             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 911             rs->bytes_transferred += len;
 912         }
 913         qemu_mutex_unlock(&comp_param[idx].mutex);
 914     }
 915 }
 916
 917 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 918                                        ram_addr_t offset)
 919 {
 920     param->block = block;
 921     param->offset = offset;
 922 }
 923
 924 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 925                                            ram_addr_t offset)
 926 {
 927     int idx, thread_count, bytes_xmit = -1, pages = -1;
 928
 929     thread_count = migrate_compress_threads();
 930     qemu_mutex_lock(&comp_done_lock);
 931     while (true) {
 932         for (idx = 0; idx < thread_count; idx++) {
 933             if (comp_param[idx].done) {
 934                 comp_param[idx].done = false;
 935                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 936                 qemu_mutex_lock(&comp_param[idx].mutex);
 937                 set_compress_params(&comp_param[idx], block, offset);
 938                 qemu_cond_signal(&comp_param[idx].cond);
 939                 qemu_mutex_unlock(&comp_param[idx].mutex);
 940                 pages = 1;
 941                 rs->norm_pages++;
 942                 rs->bytes_transferred += bytes_xmit;
 943                 break;
 944             }
 945         }
 946         if (pages > 0) {
 947             break;
 948         } else {
 949             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 950         }
 951     }
 952     qemu_mutex_unlock(&comp_done_lock);
 953
 954     return pages;
 955 }
 956
 957 /**
 958  * ram_save_compressed_page: compress the given page and send it to the stream
 959  *
 960  * Returns the number of pages written.
 961  *
 962  * @rs: current RAM state
 963  * @block: block that contains the page we want to send
 964  * @offset: offset inside the block for the page
 965  * @last_stage: if we are at the completion stage
 966  */
 967 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 968                                     bool last_stage)
 969 {
 970     int pages = -1;
 971     uint64_t bytes_xmit = 0;
 972     uint8_t *p;
 973     int ret, blen;
 974     RAMBlock *block = pss->block;
 975     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 976
 977     p = block->host + offset;
 978
 979     ret = ram_control_save_page(rs->f, block->offset,
 980                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 981     if (bytes_xmit) {
 982         rs->bytes_transferred += bytes_xmit;
 983         pages = 1;
 984     }
 985     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 986         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 987             if (bytes_xmit > 0) {
 988                 rs->norm_pages++;
 989             } else if (bytes_xmit == 0) {
 990                 rs->zero_pages++;
 991             }
 992         }
 993     } else {
 994         /* When starting the process of a new block, the first page of
 995          * the block should be sent out before other pages in the same
 996          * block, and all the pages in last block should have been sent
 997          * out, keeping this order is important, because the 'cont' flag
 998          * is used to avoid resending the block name.
 999          */
1000         if (block != rs->last_sent_block) {
1001             flush_compressed_data(rs);
1002             pages = save_zero_page(rs, block, offset, p);
1003             if (pages == -1) {
1004                 /* Make sure the first page is sent out before other pages */
1005                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1006                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1007                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1008                                                  migrate_compress_level());
1009                 if (blen > 0) {
1010                     rs->bytes_transferred += bytes_xmit + blen;
1011                     rs->norm_pages++;
1012                     pages = 1;
1013                 } else {
1014                     qemu_file_set_error(rs->f, blen);
1015                     error_report("compressed data failed!");
1016                 }
1017             }
1018             if (pages > 0) {
1019                 ram_release_pages(block->idstr, offset, pages);
1020             }
1021         } else {
1022             pages = save_zero_page(rs, block, offset, p);
1023             if (pages == -1) {
1024                 pages = compress_page_with_multi_thread(rs, block, offset);
1025             } else {
1026                 ram_release_pages(block->idstr, offset, pages);
1027             }
1028         }
1029     }
1030
1031     return pages;
1032 }
1033
1034 /**
1035  * find_dirty_block: find the next dirty page and update any state
1036  * associated with the search process.
1037  *
1038  * Returns if a page is found
1039  *
1040  * @rs: current RAM state
1041  * @pss: data about the state of the current dirty page scan
1042  * @again: set to false if the search has scanned the whole of RAM
1043  */
1044 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1045 {
1046     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1047     if (pss->complete_round && pss->block == rs->last_seen_block &&
1048         pss->page >= rs->last_page) {
1049         /*
1050          * We've been once around the RAM and haven't found anything.
1051          * Give up.
1052          */
1053         *again = false;
1054         return false;
1055     }
1056     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1057         /* Didn't find anything in this RAM Block */
1058         pss->page = 0;
1059         pss->block = QLIST_NEXT_RCU(pss->block, next);
1060         if (!pss->block) {
1061             /* Hit the end of the list */
1062             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1063             /* Flag that we've looped */
1064             pss->complete_round = true;
1065             rs->ram_bulk_stage = false;
1066             if (migrate_use_xbzrle()) {
1067                 /* If xbzrle is on, stop using the data compression at this
1068                  * point. In theory, xbzrle can do better than compression.
1069                  */
1070                 flush_compressed_data(rs);
1071             }
1072         }
1073         /* Didn't find anything this time, but try again on the new block */
1074         *again = true;
1075         return false;
1076     } else {
1077         /* Can go around again, but... */
1078         *again = true;
1079         /* We've found something so probably don't need to */
1080         return true;
1081     }
1082 }
1083
1084 /**
1085  * unqueue_page: gets a page of the queue
1086  *
1087  * Helper for 'get_queued_page' - gets a page off the queue
1088  *
1089  * Returns the block of the page (or NULL if none available)
1090  *
1091  * @rs: current RAM state
1092  * @offset: used to return the offset within the RAMBlock
1093  */
1094 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1095 {
1096     RAMBlock *block = NULL;
1097
1098     qemu_mutex_lock(&rs->src_page_req_mutex);
1099     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1100         struct RAMSrcPageRequest *entry =
1101                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1102         block = entry->rb;
1103         *offset = entry->offset;
1104
1105         if (entry->len > TARGET_PAGE_SIZE) {
1106             entry->len -= TARGET_PAGE_SIZE;
1107             entry->offset += TARGET_PAGE_SIZE;
1108         } else {
1109             memory_region_unref(block->mr);
1110             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1111             g_free(entry);
1112         }
1113     }
1114     qemu_mutex_unlock(&rs->src_page_req_mutex);
1115
1116     return block;
1117 }
1118
1119 /**
1120  * get_queued_page: unqueue a page from the postocpy requests
1121  *
1122  * Skips pages that are already sent (!dirty)
1123  *
1124  * Returns if a queued page is found
1125  *
1126  * @rs: current RAM state
1127  * @pss: data about the state of the current dirty page scan
1128  */
1129 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1130 {
1131     RAMBlock  *block;
1132     ram_addr_t offset;
1133     bool dirty;
1134
1135     do {
1136         block = unqueue_page(rs, &offset);
1137         /*
1138          * We're sending this page, and since it's postcopy nothing else
1139          * will dirty it, and we must make sure it doesn't get sent again
1140          * even if this queue request was received after the background
1141          * search already sent it.
1142          */
1143         if (block) {
1144             unsigned long page;
1145
1146             page = offset >> TARGET_PAGE_BITS;
1147             dirty = test_bit(page, block->bmap);
1148             if (!dirty) {
1149                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1150                        page, test_bit(page, block->unsentmap));
1151             } else {
1152                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1153             }
1154         }
1155
1156     } while (block && !dirty);
1157
1158     if (block) {
1159         /*
1160          * As soon as we start servicing pages out of order, then we have
1161          * to kill the bulk stage, since the bulk stage assumes
1162          * in (migration_bitmap_find_and_reset_dirty) that every page is
1163          * dirty, that's no longer true.
1164          */
1165         rs->ram_bulk_stage = false;
1166
1167         /*
1168          * We want the background search to continue from the queued page
1169          * since the guest is likely to want other pages near to the page
1170          * it just requested.
1171          */
1172         pss->block = block;
1173         pss->page = offset >> TARGET_PAGE_BITS;
1174     }
1175
1176     return !!block;
1177 }
1178
1179 /**
1180  * migration_page_queue_free: drop any remaining pages in the ram
1181  * request queue
1182  *
1183  * It should be empty at the end anyway, but in error cases there may
1184  * be some left.  in case that there is any page left, we drop it.
1185  *
1186  */
1187 void migration_page_queue_free(void)
1188 {
1189     struct RAMSrcPageRequest *mspr, *next_mspr;
1190     RAMState *rs = &ram_state;
1191     /* This queue generally should be empty - but in the case of a failed
1192      * migration might have some droppings in.
1193      */
1194     rcu_read_lock();
1195     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1196         memory_region_unref(mspr->rb->mr);
1197         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1198         g_free(mspr);
1199     }
1200     rcu_read_unlock();
1201 }
1202
1203 /**
1204  * ram_save_queue_pages: queue the page for transmission
1205  *
1206  * A request from postcopy destination for example.
1207  *
1208  * Returns zero on success or negative on error
1209  *
1210  * @rbname: Name of the RAMBLock of the request. NULL means the
1211  *          same that last one.
1212  * @start: starting address from the start of the RAMBlock
1213  * @len: length (in bytes) to send
1214  */
1215 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1216 {
1217     RAMBlock *ramblock;
1218     RAMState *rs = &ram_state;
1219
1220     rs->postcopy_requests++;
1221     rcu_read_lock();
1222     if (!rbname) {
1223         /* Reuse last RAMBlock */
1224         ramblock = rs->last_req_rb;
1225
1226         if (!ramblock) {
1227             /*
1228              * Shouldn't happen, we can't reuse the last RAMBlock if
1229              * it's the 1st request.
1230              */
1231             error_report("ram_save_queue_pages no previous block");
1232             goto err;
1233         }
1234     } else {
1235         ramblock = qemu_ram_block_by_name(rbname);
1236
1237         if (!ramblock) {
1238             /* We shouldn't be asked for a non-existent RAMBlock */
1239             error_report("ram_save_queue_pages no block '%s'", rbname);
1240             goto err;
1241         }
1242         rs->last_req_rb = ramblock;
1243     }
1244     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1245     if (start+len > ramblock->used_length) {
1246         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1247                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1248                      __func__, start, len, ramblock->used_length);
1249         goto err;
1250     }
1251
1252     struct RAMSrcPageRequest *new_entry =
1253         g_malloc0(sizeof(struct RAMSrcPageRequest));
1254     new_entry->rb = ramblock;
1255     new_entry->offset = start;
1256     new_entry->len = len;
1257
1258     memory_region_ref(ramblock->mr);
1259     qemu_mutex_lock(&rs->src_page_req_mutex);
1260     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1261     qemu_mutex_unlock(&rs->src_page_req_mutex);
1262     rcu_read_unlock();
1263
1264     return 0;
1265
1266 err:
1267     rcu_read_unlock();
1268     return -1;
1269 }
1270
1271 /**
1272  * ram_save_target_page: save one target page
1273  *
1274  * Returns the number of pages written
1275  *
1276  * @rs: current RAM state
1277  * @ms: current migration state
1278  * @pss: data about the page we want to send
1279  * @last_stage: if we are at the completion stage
1280  */
1281 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1282                                 bool last_stage)
1283 {
1284     int res = 0;
1285
1286     /* Check the pages is dirty and if it is send it */
1287     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1288         /*
1289          * If xbzrle is on, stop using the data compression after first
1290          * round of migration even if compression is enabled. In theory,
1291          * xbzrle can do better than compression.
1292          */
1293         if (migrate_use_compression() &&
1294             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1295             res = ram_save_compressed_page(rs, pss, last_stage);
1296         } else {
1297             res = ram_save_page(rs, pss, last_stage);
1298         }
1299
1300         if (res < 0) {
1301             return res;
1302         }
1303         if (pss->block->unsentmap) {
1304             clear_bit(pss->page, pss->block->unsentmap);
1305         }
1306     }
1307
1308     return res;
1309 }
1310
1311 /**
1312  * ram_save_host_page: save a whole host page
1313  *
1314  * Starting at *offset send pages up to the end of the current host
1315  * page. It's valid for the initial offset to point into the middle of
1316  * a host page in which case the remainder of the hostpage is sent.
1317  * Only dirty target pages are sent. Note that the host page size may
1318  * be a huge page for this block.
1319  * The saving stops at the boundary of the used_length of the block
1320  * if the RAMBlock isn't a multiple of the host page size.
1321  *
1322  * Returns the number of pages written or negative on error
1323  *
1324  * @rs: current RAM state
1325  * @ms: current migration state
1326  * @pss: data about the page we want to send
1327  * @last_stage: if we are at the completion stage
1328  */
1329 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1330                               bool last_stage)
1331 {
1332     int tmppages, pages = 0;
1333     size_t pagesize_bits =
1334         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1335
1336     do {
1337         tmppages = ram_save_target_page(rs, pss, last_stage);
1338         if (tmppages < 0) {
1339             return tmppages;
1340         }
1341
1342         pages += tmppages;
1343         pss->page++;
1344     } while ((pss->page & (pagesize_bits - 1)) &&
1345              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1346
1347     /* The offset we leave with is the last one we looked at */
1348     pss->page--;
1349     return pages;
1350 }
1351
1352 /**
1353  * ram_find_and_save_block: finds a dirty page and sends it to f
1354  *
1355  * Called within an RCU critical section.
1356  *
1357  * Returns the number of pages written where zero means no dirty pages
1358  *
1359  * @rs: current RAM state
1360  * @last_stage: if we are at the completion stage
1361  *
1362  * On systems where host-page-size > target-page-size it will send all the
1363  * pages in a host page that are dirty.
1364  */
1365
1366 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1367 {
1368     PageSearchStatus pss;
1369     int pages = 0;
1370     bool again, found;
1371
1372     /* No dirty page as there is zero RAM */
1373     if (!ram_bytes_total()) {
1374         return pages;
1375     }
1376
1377     pss.block = rs->last_seen_block;
1378     pss.page = rs->last_page;
1379     pss.complete_round = false;
1380
1381     if (!pss.block) {
1382         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1383     }
1384
1385     do {
1386         again = true;
1387         found = get_queued_page(rs, &pss);
1388
1389         if (!found) {
1390             /* priority queue empty, so just search for something dirty */
1391             found = find_dirty_block(rs, &pss, &again);
1392         }
1393
1394         if (found) {
1395             pages = ram_save_host_page(rs, &pss, last_stage);
1396         }
1397     } while (!pages && again);
1398
1399     rs->last_seen_block = pss.block;
1400     rs->last_page = pss.page;
1401
1402     return pages;
1403 }
1404
1405 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1406 {
1407     uint64_t pages = size / TARGET_PAGE_SIZE;
1408     RAMState *rs = &ram_state;
1409
1410     if (zero) {
1411         rs->zero_pages += pages;
1412     } else {
1413         rs->norm_pages += pages;
1414         rs->bytes_transferred += size;
1415         qemu_update_position(f, size);
1416     }
1417 }
1418
1419 uint64_t ram_bytes_total(void)
1420 {
1421     RAMBlock *block;
1422     uint64_t total = 0;
1423
1424     rcu_read_lock();
1425     RAMBLOCK_FOREACH(block) {
1426         total += block->used_length;
1427     }
1428     rcu_read_unlock();
1429     return total;
1430 }
1431
1432 void free_xbzrle_decoded_buf(void)
1433 {
1434     g_free(xbzrle_decoded_buf);
1435     xbzrle_decoded_buf = NULL;
1436 }
1437
1438 static void ram_migration_cleanup(void *opaque)
1439 {
1440     RAMBlock *block;
1441
1442     /* caller have hold iothread lock or is in a bh, so there is
1443      * no writing race against this migration_bitmap
1444      */
1445     memory_global_dirty_log_stop();
1446
1447     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1448         g_free(block->bmap);
1449         block->bmap = NULL;
1450         g_free(block->unsentmap);
1451         block->unsentmap = NULL;
1452     }
1453
1454     XBZRLE_cache_lock();
1455     if (XBZRLE.cache) {
1456         cache_fini(XBZRLE.cache);
1457         g_free(XBZRLE.encoded_buf);
1458         g_free(XBZRLE.current_buf);
1459         g_free(ZERO_TARGET_PAGE);
1460         XBZRLE.cache = NULL;
1461         XBZRLE.encoded_buf = NULL;
1462         XBZRLE.current_buf = NULL;
1463     }
1464     XBZRLE_cache_unlock();
1465 }
1466
1467 static void ram_state_reset(RAMState *rs)
1468 {
1469     rs->last_seen_block = NULL;
1470     rs->last_sent_block = NULL;
1471     rs->last_page = 0;
1472     rs->last_version = ram_list.version;
1473     rs->ram_bulk_stage = true;
1474 }
1475
/* Wait limit in milliseconds (its users are outside this part of the file) */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
1477
1478 /*
1479  * 'expected' is the value you expect the bitmap mostly to be full
1480  * of; it won't bother printing lines that are all this value.
1481  * If 'todump' is null the migration bitmap is dumped.
1482  */
1483 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1484                            unsigned long pages)
1485 {
1486     int64_t cur;
1487     int64_t linelen = 128;
1488     char linebuf[129];
1489
1490     for (cur = 0; cur < pages; cur += linelen) {
1491         int64_t curb;
1492         bool found = false;
1493         /*
1494          * Last line; catch the case where the line length
1495          * is longer than remaining ram
1496          */
1497         if (cur + linelen > pages) {
1498             linelen = pages - cur;
1499         }
1500         for (curb = 0; curb < linelen; curb++) {
1501             bool thisbit = test_bit(cur + curb, todump);
1502             linebuf[curb] = thisbit ? '1' : '.';
1503             found = found || (thisbit != expected);
1504         }
1505         if (found) {
1506             linebuf[curb] = '\0';
1507             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1508         }
1509     }
1510 }
1511
1512 /* **** functions for postcopy ***** */
1513
1514 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1515 {
1516     struct RAMBlock *block;
1517
1518     RAMBLOCK_FOREACH(block) {
1519         unsigned long *bitmap = block->bmap;
1520         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1521         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1522
1523         while (run_start < range) {
1524             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1525             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1526                               (run_end - run_start) << TARGET_PAGE_BITS);
1527             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1528         }
1529     }
1530 }
1531
1532 /**
1533  * postcopy_send_discard_bm_ram: discard a RAMBlock
1534  *
1535  * Returns zero on success
1536  *
1537  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1538  * Note: At this point the 'unsentmap' is the processed bitmap combined
1539  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1540  *
1541  * @ms: current migration state
1542  * @pds: state for postcopy
1543  * @start: RAMBlock starting page
1544  * @length: RAMBlock size
1545  */
1546 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1547                                         PostcopyDiscardState *pds,
1548                                         RAMBlock *block)
1549 {
1550     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1551     unsigned long current;
1552     unsigned long *unsentmap = block->unsentmap;
1553
1554     for (current = 0; current < end; ) {
1555         unsigned long one = find_next_bit(unsentmap, end, current);
1556
1557         if (one <= end) {
1558             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1559             unsigned long discard_length;
1560
1561             if (zero >= end) {
1562                 discard_length = end - one;
1563             } else {
1564                 discard_length = zero - one;
1565             }
1566             if (discard_length) {
1567                 postcopy_discard_send_range(ms, pds, one, discard_length);
1568             }
1569             current = one + discard_length;
1570         } else {
1571             current = one;
1572         }
1573     }
1574
1575     return 0;
1576 }
1577
1578 /**
1579  * postcopy_each_ram_send_discard: discard all RAMBlocks
1580  *
1581  * Returns 0 for success or negative for error
1582  *
1583  * Utility for the outgoing postcopy code.
1584  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1585  *   passing it bitmap indexes and name.
1586  * (qemu_ram_foreach_block ends up passing unscaled lengths
1587  *  which would mean postcopy code would have to deal with target page)
1588  *
1589  * @ms: current migration state
1590  */
1591 static int postcopy_each_ram_send_discard(MigrationState *ms)
1592 {
1593     struct RAMBlock *block;
1594     int ret;
1595
1596     RAMBLOCK_FOREACH(block) {
1597         PostcopyDiscardState *pds =
1598             postcopy_discard_send_init(ms, block->idstr);
1599
1600         /*
1601          * Postcopy sends chunks of bitmap over the wire, but it
1602          * just needs indexes at this point, avoids it having
1603          * target page specific code.
1604          */
1605         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1606         postcopy_discard_send_finish(ms, pds);
1607         if (ret) {
1608             return ret;
1609         }
1610     }
1611
1612     return 0;
1613 }
1614
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * Any host page found to be only partially sent (unsent pass) or only
 * partially dirty (dirty pass) is marked entirely unsent and entirely
 * dirty, and a discard for it is sent where needed.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = &ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of this host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Resume the search from the run_start chosen above */
        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
1741
1742 /**
1743  * postcopy_chuck_hostpages: discrad any partially sent host page
1744  *
1745  * Utility for the outgoing postcopy code.
1746  *
1747  * Discard any partially sent host-page size chunks, mark any partially
1748  * dirty host-page size chunks as all dirty.  In this case the host-page
1749  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1750  *
1751  * Returns zero on success
1752  *
1753  * @ms: current migration state
1754  * @block: block we want to work with
1755  */
1756 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1757 {
1758     PostcopyDiscardState *pds =
1759         postcopy_discard_send_init(ms, block->idstr);
1760
1761     /* First pass: Discard all partially sent host pages */
1762     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1763     /*
1764      * Second pass: Ensure that all partially dirty host pages are made
1765      * fully dirty.
1766      */
1767     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1768
1769     postcopy_discard_send_finish(ms, pds);
1770     return 0;
1771 }
1772
1773 /**
1774  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1775  *
1776  * Returns zero on success
1777  *
1778  * Transmit the set of pages to be discarded after precopy to the target
1779  * these are pages that:
1780  *     a) Have been previously transmitted but are now dirty again
1781  *     b) Pages that have never been transmitted, this ensures that
1782  *        any pages on the destination that have been mapped by background
1783  *        tasks get discarded (transparent huge pages is the specific concern)
1784  * Hopefully this is pretty sparse
1785  *
1786  * @ms: current migration state
1787  */
1788 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1789 {
1790     RAMState *rs = &ram_state;
1791     RAMBlock *block;
1792     int ret;
1793
1794     rcu_read_lock();
1795
1796     /* This should be our last sync, the src is now paused */
1797     migration_bitmap_sync(rs);
1798
1799     /* Easiest way to make sure we don't resume in the middle of a host-page */
1800     rs->last_seen_block = NULL;
1801     rs->last_sent_block = NULL;
1802     rs->last_page = 0;
1803
1804     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1805         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1806         unsigned long *bitmap = block->bmap;
1807         unsigned long *unsentmap = block->unsentmap;
1808
1809         if (!unsentmap) {
1810             /* We don't have a safe way to resize the sentmap, so
1811              * if the bitmap was resized it will be NULL at this
1812              * point.
1813              */
1814             error_report("migration ram resized during precopy phase");
1815             rcu_read_unlock();
1816             return -EINVAL;
1817         }
1818         /* Deal with TPS != HPS and huge pages */
1819         ret = postcopy_chunk_hostpages(ms, block);
1820         if (ret) {
1821             rcu_read_unlock();
1822             return ret;
1823         }
1824
1825         /*
1826          * Update the unsentmap to be unsentmap = unsentmap | dirty
1827          */
1828         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1829 #ifdef DEBUG_POSTCOPY
1830         ram_debug_dump_bitmap(unsentmap, true, pages);
1831 #endif
1832     }
1833     trace_ram_postcopy_send_discard_bitmap();
1834
1835     ret = postcopy_each_ram_send_discard(ms);
1836     rcu_read_unlock();
1837
1838     return ret;
1839 }
1840
1841 /**
1842  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1843  *
1844  * Returns zero on success
1845  *
1846  * @rbname: name of the RAMBlock of the request. NULL means the
1847  *          same that last one.
1848  * @start: RAMBlock starting page
1849  * @length: RAMBlock size
1850  */
1851 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1852 {
1853     int ret = -1;
1854
1855     trace_ram_discard_range(rbname, start, length);
1856
1857     rcu_read_lock();
1858     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1859
1860     if (!rb) {
1861         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1862         goto err;
1863     }
1864
1865     ret = ram_block_discard_range(rb, start, length);
1866
1867 err:
1868     rcu_read_unlock();
1869
1870     return ret;
1871 }
1872
1873 static int ram_state_init(RAMState *rs)
1874 {
1875     memset(rs, 0, sizeof(*rs));
1876     qemu_mutex_init(&rs->bitmap_mutex);
1877     qemu_mutex_init(&rs->src_page_req_mutex);
1878     QSIMPLEQ_INIT(&rs->src_page_requests);
1879
1880     if (migrate_use_xbzrle()) {
1881         XBZRLE_cache_lock();
1882         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1883         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1884                                   TARGET_PAGE_SIZE,
1885                                   TARGET_PAGE_SIZE);
1886         if (!XBZRLE.cache) {
1887             XBZRLE_cache_unlock();
1888             error_report("Error creating cache");
1889             return -1;
1890         }
1891         XBZRLE_cache_unlock();
1892
1893         /* We prefer not to abort if there is no memory */
1894         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1895         if (!XBZRLE.encoded_buf) {
1896             error_report("Error allocating encoded_buf");
1897             return -1;
1898         }
1899
1900         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1901         if (!XBZRLE.current_buf) {
1902             error_report("Error allocating current_buf");
1903             g_free(XBZRLE.encoded_buf);
1904             XBZRLE.encoded_buf = NULL;
1905             return -1;
1906         }
1907     }
1908
1909     /* For memory_global_dirty_log_start below.  */
1910     qemu_mutex_lock_iothread();
1911
1912     qemu_mutex_lock_ramlist();
1913     rcu_read_lock();
1914     ram_state_reset(rs);
1915
1916     /* Skip setting bitmap if there is no RAM */
1917     if (ram_bytes_total()) {
1918         RAMBlock *block;
1919
1920         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1921             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1922
1923             block->bmap = bitmap_new(pages);
1924             bitmap_set(block->bmap, 0, pages);
1925             if (migrate_postcopy_ram()) {
1926                 block->unsentmap = bitmap_new(pages);
1927                 bitmap_set(block->unsentmap, 0, pages);
1928             }
1929         }
1930     }
1931
1932     /*
1933      * Count the total number of pages used by ram blocks not including any
1934      * gaps due to alignment or unplugs.
1935      */
1936     rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1937
1938     memory_global_dirty_log_start();
1939     migration_bitmap_sync(rs);
1940     qemu_mutex_unlock_ramlist();
1941     qemu_mutex_unlock_iothread();
1942     rcu_read_unlock();
1943
1944     return 0;
1945 }
1946
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
1953
1954 /**
1955  * ram_save_setup: Setup RAM for migration
1956  *
1957  * Returns zero to indicate success and negative for error
1958  *
1959  * @f: QEMUFile where to send the data
1960  * @opaque: RAMState pointer
1961  */
1962 static int ram_save_setup(QEMUFile *f, void *opaque)
1963 {
1964     RAMState *rs = opaque;
1965     RAMBlock *block;
1966
1967     /* migration has already setup the bitmap, reuse it. */
1968     if (!migration_in_colo_state()) {
1969         if (ram_state_init(rs) < 0) {
1970             return -1;
1971          }
1972     }
1973     rs->f = f;
1974
1975     rcu_read_lock();
1976
1977     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1978
1979     RAMBLOCK_FOREACH(block) {
1980         qemu_put_byte(f, strlen(block->idstr));
1981         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1982         qemu_put_be64(f, block->used_length);
1983         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1984             qemu_put_be64(f, block->page_size);
1985         }
1986     }
1987
1988     rcu_read_unlock();
1989
1990     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1991     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1992
1993     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1994
1995     return 0;
1996 }
1997
1998 /**
1999  * ram_save_iterate: iterative stage for migration
2000  *
2001  * Returns zero to indicate success and negative for error
2002  *
2003  * @f: QEMUFile where to send the data
2004  * @opaque: RAMState pointer
2005  */
2006 static int ram_save_iterate(QEMUFile *f, void *opaque)
2007 {
2008     RAMState *rs = opaque;
2009     int ret;
2010     int i;
2011     int64_t t0;
2012     int done = 0;
2013
2014     rcu_read_lock();
2015     if (ram_list.version != rs->last_version) {
2016         ram_state_reset(rs);
2017     }
2018
2019     /* Read version before ram_list.blocks */
2020     smp_rmb();
2021
2022     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2023
2024     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2025     i = 0;
2026     while ((ret = qemu_file_rate_limit(f)) == 0) {
2027         int pages;
2028
2029         pages = ram_find_and_save_block(rs, false);
2030         /* no more pages to sent */
2031         if (pages == 0) {
2032             done = 1;
2033             break;
2034         }
2035         rs->iterations++;
2036
2037         /* we want to check in the 1st loop, just in case it was the 1st time
2038            and we had to sync the dirty bitmap.
2039            qemu_get_clock_ns() is a bit expensive, so we only check each some
2040            iterations
2041         */
2042         if ((i & 63) == 0) {
2043             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2044             if (t1 > MAX_WAIT) {
2045                 trace_ram_save_iterate_big_wait(t1, i);
2046                 break;
2047             }
2048         }
2049         i++;
2050     }
2051     flush_compressed_data(rs);
2052     rcu_read_unlock();
2053
2054     /*
2055      * Must occur before EOS (or any QEMUFile operation)
2056      * because of RDMA protocol.
2057      */
2058     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2059
2060     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2061     rs->bytes_transferred += 8;
2062
2063     ret = qemu_file_get_error(f);
2064     if (ret < 0) {
2065         return ret;
2066     }
2067
2068     return done;
2069 }
2070
2071 /**
2072  * ram_save_complete: function called to send the remaining amount of ram
2073  *
2074  * Returns zero to indicate success
2075  *
2076  * Called with iothread lock
2077  *
2078  * @f: QEMUFile where to send the data
2079  * @opaque: RAMState pointer
2080  */
2081 static int ram_save_complete(QEMUFile *f, void *opaque)
2082 {
2083     RAMState *rs = opaque;
2084
2085     rcu_read_lock();
2086
2087     if (!migration_in_postcopy()) {
2088         migration_bitmap_sync(rs);
2089     }
2090
2091     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2092
2093     /* try transferring iterative blocks of memory */
2094
2095     /* flush all remaining blocks regardless of rate limiting */
2096     while (true) {
2097         int pages;
2098
2099         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2100         /* no more blocks to sent */
2101         if (pages == 0) {
2102             break;
2103         }
2104     }
2105
2106     flush_compressed_data(rs);
2107     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2108
2109     rcu_read_unlock();
2110
2111     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2112
2113     return 0;
2114 }
2115
2116 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2117                              uint64_t *non_postcopiable_pending,
2118                              uint64_t *postcopiable_pending)
2119 {
2120     RAMState *rs = opaque;
2121     uint64_t remaining_size;
2122
2123     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2124
2125     if (!migration_in_postcopy() &&
2126         remaining_size < max_size) {
2127         qemu_mutex_lock_iothread();
2128         rcu_read_lock();
2129         migration_bitmap_sync(rs);
2130         rcu_read_unlock();
2131         qemu_mutex_unlock_iothread();
2132         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2133     }
2134
2135     /* We can do postcopy, and all the data is postcopiable */
2136     *postcopiable_pending += remaining_size;
2137 }
2138
2139 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2140 {
2141     unsigned int xh_len;
2142     int xh_flags;
2143     uint8_t *loaded_data;
2144
2145     if (!xbzrle_decoded_buf) {
2146         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2147     }
2148     loaded_data = xbzrle_decoded_buf;
2149
2150     /* extract RLE header */
2151     xh_flags = qemu_get_byte(f);
2152     xh_len = qemu_get_be16(f);
2153
2154     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2155         error_report("Failed to load XBZRLE page - wrong compression!");
2156         return -1;
2157     }
2158
2159     if (xh_len > TARGET_PAGE_SIZE) {
2160         error_report("Failed to load XBZRLE page - len overflow!");
2161         return -1;
2162     }
2163     /* load data and decode */
2164     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2165
2166     /* decode RLE */
2167     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2168                              TARGET_PAGE_SIZE) == -1) {
2169         error_report("Failed to load XBZRLE page - decode error!");
2170         return -1;
2171     }
2172
2173     return 0;
2174 }
2175
2176 /**
2177  * ram_block_from_stream: read a RAMBlock id from the migration stream
2178  *
2179  * Must be called from within a rcu critical section.
2180  *
2181  * Returns a pointer from within the RCU-protected ram_list.
2182  *
2183  * @f: QEMUFile where to read the data from
2184  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2185  */
2186 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2187 {
2188     static RAMBlock *block = NULL;
2189     char id[256];
2190     uint8_t len;
2191
2192     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2193         if (!block) {
2194             error_report("Ack, bad migration stream!");
2195             return NULL;
2196         }
2197         return block;
2198     }
2199
2200     len = qemu_get_byte(f);
2201     qemu_get_buffer(f, (uint8_t *)id, len);
2202     id[len] = 0;
2203
2204     block = qemu_ram_block_by_name(id);
2205     if (!block) {
2206         error_report("Can't find block %s", id);
2207         return NULL;
2208     }
2209
2210     return block;
2211 }
2212
2213 static inline void *host_from_ram_block_offset(RAMBlock *block,
2214                                                ram_addr_t offset)
2215 {
2216     if (!offset_in_ramblock(block, offset)) {
2217         return NULL;
2218     }
2219
2220     return block->host + offset;
2221 }
2222
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been determined to be zero,
 * then zap it.  The memory is left untouched when @ch is zero and the
 * range already reads as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if we want zeroes and the range is already zero */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2239
/*
 * do_data_decompress: body of one decompression worker thread
 *
 * Loops until param->quit is set.  While no request is pending
 * (param->des is NULL) it sleeps on param->cond.  When a request is
 * present it takes the destination and length, clears param->des, drops
 * param->mutex while calling uncompress() on param->compbuf, then sets
 * param->done and signals decomp_done_cond under decomp_done_lock.
 *
 * @opaque: the DecompressParam this worker services
 *
 * Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the request and clear the slot before unlocking */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            /* uncompress() takes the destination capacity in pagesize */
            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() may fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is
             * not a problem because the dirty page will be retransferred
             * and uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            /* Mark this worker idle again and wake whoever waits for it */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take our own mutex before re-checking quit/des */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
2278
2279 static void wait_for_decompress_done(void)
2280 {
2281     int idx, thread_count;
2282
2283     if (!migrate_use_compression()) {
2284         return;
2285     }
2286
2287     thread_count = migrate_decompress_threads();
2288     qemu_mutex_lock(&decomp_done_lock);
2289     for (idx = 0; idx < thread_count; idx++) {
2290         while (!decomp_param[idx].done) {
2291             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2292         }
2293     }
2294     qemu_mutex_unlock(&decomp_done_lock);
2295 }
2296
2297 void migrate_decompress_threads_create(void)
2298 {
2299     int i, thread_count;
2300
2301     thread_count = migrate_decompress_threads();
2302     decompress_threads = g_new0(QemuThread, thread_count);
2303     decomp_param = g_new0(DecompressParam, thread_count);
2304     qemu_mutex_init(&decomp_done_lock);
2305     qemu_cond_init(&decomp_done_cond);
2306     for (i = 0; i < thread_count; i++) {
2307         qemu_mutex_init(&decomp_param[i].mutex);
2308         qemu_cond_init(&decomp_param[i].cond);
2309         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2310         decomp_param[i].done = true;
2311         decomp_param[i].quit = false;
2312         qemu_thread_create(decompress_threads + i, "decompress",
2313                            do_data_decompress, decomp_param + i,
2314                            QEMU_THREAD_JOINABLE);
2315     }
2316 }
2317
2318 void migrate_decompress_threads_join(void)
2319 {
2320     int i, thread_count;
2321
2322     thread_count = migrate_decompress_threads();
2323     for (i = 0; i < thread_count; i++) {
2324         qemu_mutex_lock(&decomp_param[i].mutex);
2325         decomp_param[i].quit = true;
2326         qemu_cond_signal(&decomp_param[i].cond);
2327         qemu_mutex_unlock(&decomp_param[i].mutex);
2328     }
2329     for (i = 0; i < thread_count; i++) {
2330         qemu_thread_join(decompress_threads + i);
2331         qemu_mutex_destroy(&decomp_param[i].mutex);
2332         qemu_cond_destroy(&decomp_param[i].cond);
2333         g_free(decomp_param[i].compbuf);
2334     }
2335     g_free(decompress_threads);
2336     g_free(decomp_param);
2337     decompress_threads = NULL;
2338     decomp_param = NULL;
2339 }
2340
/*
 * decompress_data_with_multi_threads: hand one compressed page to a worker
 *
 * Scans the workers for one whose done flag is set; if none is found it
 * waits on decomp_done_cond and scans again.  The chosen worker is marked
 * busy, @len bytes are read from @f into its compbuf, its destination and
 * length are filled in and its condition variable is signalled.
 *
 * @f: QEMUFile where to read the compressed data from
 * @host: destination the worker decompresses into
 * @len: number of compressed bytes to read from @f
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                /* Claim the idle worker while still holding the done lock */
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                /* Setting des is what the worker takes as a new request */
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            /* A worker accepted the page */
            break;
        } else {
            /* All workers busy: wait until one reports completion */
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
2369
2370 /**
2371  * ram_postcopy_incoming_init: allocate postcopy data structures
2372  *
2373  * Returns 0 for success and negative if there was one error
2374  *
2375  * @mis: current migration incoming state
2376  *
2377  * Allocate data structures etc needed by incoming migration with
2378  * postcopy-ram. postcopy-ram's similarly names
2379  * postcopy_ram_incoming_init does the work.
2380  */
2381 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2382 {
2383     unsigned long ram_pages = last_ram_page();
2384
2385     return postcopy_ram_incoming_init(mis, ram_pages);
2386 }
2387
2388 /**
2389  * ram_load_postcopy: load a page in postcopy case
2390  *
2391  * Returns 0 for success or -errno in case of error
2392  *
2393  * Called in postcopy mode by ram_load().
2394  * rcu_read_lock is taken prior to this being called.
2395  *
2396  * @f: QEMUFile where to send the data
2397  */
2398 static int ram_load_postcopy(QEMUFile *f)
2399 {
2400     int flags = 0, ret = 0;
2401     bool place_needed = false;
2402     bool matching_page_sizes = false;
2403     MigrationIncomingState *mis = migration_incoming_get_current();
2404     /* Temporary page that is later 'placed' */
2405     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2406     void *last_host = NULL;
2407     bool all_zero = false;
2408
2409     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2410         ram_addr_t addr;
2411         void *host = NULL;
2412         void *page_buffer = NULL;
2413         void *place_source = NULL;
2414         RAMBlock *block = NULL;
2415         uint8_t ch;
2416
2417         addr = qemu_get_be64(f);
2418         flags = addr & ~TARGET_PAGE_MASK;
2419         addr &= TARGET_PAGE_MASK;
2420
2421         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2422         place_needed = false;
2423         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2424             block = ram_block_from_stream(f, flags);
2425
2426             host = host_from_ram_block_offset(block, addr);
2427             if (!host) {
2428                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2429                 ret = -EINVAL;
2430                 break;
2431             }
2432             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2433             /*
2434              * Postcopy requires that we place whole host pages atomically;
2435              * these may be huge pages for RAMBlocks that are backed by
2436              * hugetlbfs.
2437              * To make it atomic, the data is read into a temporary page
2438              * that's moved into place later.
2439              * The migration protocol uses,  possibly smaller, target-pages
2440              * however the source ensures it always sends all the components
2441              * of a host page in order.
2442              */
2443             page_buffer = postcopy_host_page +
2444                           ((uintptr_t)host & (block->page_size - 1));
2445             /* If all TP are zero then we can optimise the place */
2446             if (!((uintptr_t)host & (block->page_size - 1))) {
2447                 all_zero = true;
2448             } else {
2449                 /* not the 1st TP within the HP */
2450                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2451                     error_report("Non-sequential target page %p/%p",
2452                                   host, last_host);
2453                     ret = -EINVAL;
2454                     break;
2455                 }
2456             }
2457
2458
2459             /*
2460              * If it's the last part of a host page then we place the host
2461              * page
2462              */
2463             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2464                                      (block->page_size - 1)) == 0;
2465             place_source = postcopy_host_page;
2466         }
2467         last_host = host;
2468
2469         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2470         case RAM_SAVE_FLAG_ZERO:
2471             ch = qemu_get_byte(f);
2472             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2473             if (ch) {
2474                 all_zero = false;
2475             }
2476             break;
2477
2478         case RAM_SAVE_FLAG_PAGE:
2479             all_zero = false;
2480             if (!place_needed || !matching_page_sizes) {
2481                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2482             } else {
2483                 /* Avoids the qemu_file copy during postcopy, which is
2484                  * going to do a copy later; can only do it when we
2485                  * do this read in one go (matching page sizes)
2486                  */
2487                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2488                                          TARGET_PAGE_SIZE);
2489             }
2490             break;
2491         case RAM_SAVE_FLAG_EOS:
2492             /* normal exit */
2493             break;
2494         default:
2495             error_report("Unknown combination of migration flags: %#x"
2496                          " (postcopy mode)", flags);
2497             ret = -EINVAL;
2498         }
2499
2500         if (place_needed) {
2501             /* This gets called at the last target page in the host page */
2502             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2503
2504             if (all_zero) {
2505                 ret = postcopy_place_page_zero(mis, place_dest,
2506                                                block->page_size);
2507             } else {
2508                 ret = postcopy_place_page(mis, place_dest,
2509                                           place_source, block->page_size);
2510             }
2511         }
2512         if (!ret) {
2513             ret = qemu_file_get_error(f);
2514         }
2515     }
2516
2517     return ret;
2518 }
2519
2520 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2521 {
2522     int flags = 0, ret = 0;
2523     static uint64_t seq_iter;
2524     int len = 0;
2525     /*
2526      * If system is running in postcopy mode, page inserts to host memory must
2527      * be atomic
2528      */
2529     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2530     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2531     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2532
2533     seq_iter++;
2534
2535     if (version_id != 4) {
2536         ret = -EINVAL;
2537     }
2538
2539     /* This RCU critical section can be very long running.
2540      * When RCU reclaims in the code start to become numerous,
2541      * it will be necessary to reduce the granularity of this
2542      * critical section.
2543      */
2544     rcu_read_lock();
2545
2546     if (postcopy_running) {
2547         ret = ram_load_postcopy(f);
2548     }
2549
2550     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2551         ram_addr_t addr, total_ram_bytes;
2552         void *host = NULL;
2553         uint8_t ch;
2554
2555         addr = qemu_get_be64(f);
2556         flags = addr & ~TARGET_PAGE_MASK;
2557         addr &= TARGET_PAGE_MASK;
2558
2559         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2560                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2561             RAMBlock *block = ram_block_from_stream(f, flags);
2562
2563             host = host_from_ram_block_offset(block, addr);
2564             if (!host) {
2565                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2566                 ret = -EINVAL;
2567                 break;
2568             }
2569             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2570         }
2571
2572         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2573         case RAM_SAVE_FLAG_MEM_SIZE:
2574             /* Synchronize RAM block list */
2575             total_ram_bytes = addr;
2576             while (!ret && total_ram_bytes) {
2577                 RAMBlock *block;
2578                 char id[256];
2579                 ram_addr_t length;
2580
2581                 len = qemu_get_byte(f);
2582                 qemu_get_buffer(f, (uint8_t *)id, len);
2583                 id[len] = 0;
2584                 length = qemu_get_be64(f);
2585
2586                 block = qemu_ram_block_by_name(id);
2587                 if (block) {
2588                     if (length != block->used_length) {
2589                         Error *local_err = NULL;
2590
2591                         ret = qemu_ram_resize(block, length,
2592                                               &local_err);
2593                         if (local_err) {
2594                             error_report_err(local_err);
2595                         }
2596                     }
2597                     /* For postcopy we need to check hugepage sizes match */
2598                     if (postcopy_advised &&
2599                         block->page_size != qemu_host_page_size) {
2600                         uint64_t remote_page_size = qemu_get_be64(f);
2601                         if (remote_page_size != block->page_size) {
2602                             error_report("Mismatched RAM page size %s "
2603                                          "(local) %zd != %" PRId64,
2604                                          id, block->page_size,
2605                                          remote_page_size);
2606                             ret = -EINVAL;
2607                         }
2608                     }
2609                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2610                                           block->idstr);
2611                 } else {
2612                     error_report("Unknown ramblock \"%s\", cannot "
2613                                  "accept migration", id);
2614                     ret = -EINVAL;
2615                 }
2616
2617                 total_ram_bytes -= length;
2618             }
2619             break;
2620
2621         case RAM_SAVE_FLAG_ZERO:
2622             ch = qemu_get_byte(f);
2623             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2624             break;
2625
2626         case RAM_SAVE_FLAG_PAGE:
2627             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2628             break;
2629
2630         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2631             len = qemu_get_be32(f);
2632             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2633                 error_report("Invalid compressed data length: %d", len);
2634                 ret = -EINVAL;
2635                 break;
2636             }
2637             decompress_data_with_multi_threads(f, host, len);
2638             break;
2639
2640         case RAM_SAVE_FLAG_XBZRLE:
2641             if (load_xbzrle(f, addr, host) < 0) {
2642                 error_report("Failed to decompress XBZRLE page at "
2643                              RAM_ADDR_FMT, addr);
2644                 ret = -EINVAL;
2645                 break;
2646             }
2647             break;
2648         case RAM_SAVE_FLAG_EOS:
2649             /* normal exit */
2650             break;
2651         default:
2652             if (flags & RAM_SAVE_FLAG_HOOK) {
2653                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2654             } else {
2655                 error_report("Unknown combination of migration flags: %#x",
2656                              flags);
2657                 ret = -EINVAL;
2658             }
2659         }
2660         if (!ret) {
2661             ret = qemu_file_get_error(f);
2662         }
2663     }
2664
2665     wait_for_decompress_done();
2666     rcu_read_unlock();
2667     trace_ram_load_complete(ret, seq_iter);
2668     return ret;
2669 }
2670
/*
 * Save/load callbacks for the "ram" section; registered by ram_mig_init().
 * ram_save_complete serves as the completion handler for both the precopy
 * and the postcopy case.
 */
static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cleanup = ram_migration_cleanup,
};
2680
/*
 * ram_mig_init: one-time setup of RAM migration
 *
 * Initialises the XBZRLE lock and registers the "ram" live section with
 * savevm_ram_handlers, passing ram_state as the opaque pointer.  The 4
 * passed here matches the version_id check in ram_load().
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}