1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "xbzrle.h"
39 #include "migration/migration.h"
40 #include "postcopy-ram.h"
41 #include "exec/address-spaces.h"
42 #include "migration/page_cache.h"
43 #include "qemu/error-report.h"
44 #include "trace.h"
45 #include "exec/ram_addr.h"
46 #include "qemu/rcu_queue.h"
47 #include "migration/colo.h"
49 /***********************************************************/
50 /* ram save/restore */
52 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
53 * worked for pages that were filled with the same char. We switched
54 * it to only search for the zero value, and to avoid confusion with
55 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
56 */
58 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
59 #define RAM_SAVE_FLAG_ZERO 0x02
60 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
61 #define RAM_SAVE_FLAG_PAGE 0x08
62 #define RAM_SAVE_FLAG_EOS 0x10
63 #define RAM_SAVE_FLAG_CONTINUE 0x20
64 #define RAM_SAVE_FLAG_XBZRLE 0x40
65 /* 0x80 is reserved in migration.h start with 0x100 next */
66 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
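/*
* Note: on the wire each page is preceded by a be64 word that combines the
* page's offset inside its RAMBlock (target-page aligned, so the low bits
* are free) with the RAM_SAVE_FLAG_* bits above. For example, a normal page
* at offset 0x3000 goes out as 0x3000 | RAM_SAVE_FLAG_PAGE (= 0x3008),
* optionally with RAM_SAVE_FLAG_CONTINUE set when it belongs to the same
* block as the previous page.
*/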
68 static uint8_t *ZERO_TARGET_PAGE;
70 static inline bool is_zero_range(uint8_t *p, uint64_t size)
72 return buffer_is_zero(p, size);
75 /* struct contains XBZRLE cache and a static page
76 used by the compression */
77 static struct {
78 /* buffer used for XBZRLE encoding */
79 uint8_t *encoded_buf;
80 /* buffer for storing page content */
81 uint8_t *current_buf;
82 /* Cache for XBZRLE, Protected by lock. */
83 PageCache *cache;
84 QemuMutex lock;
85 } XBZRLE;
87 /* buffer used for XBZRLE decoding */
88 static uint8_t *xbzrle_decoded_buf;
90 static void XBZRLE_cache_lock(void)
92 if (migrate_use_xbzrle())
93 qemu_mutex_lock(&XBZRLE.lock);
96 static void XBZRLE_cache_unlock(void)
98 if (migrate_use_xbzrle())
99 qemu_mutex_unlock(&XBZRLE.lock);
103 * xbzrle_cache_resize: resize the xbzrle cache
105 * This function is called from qmp_migrate_set_cache_size in the main
106 * thread, possibly while a migration is in progress. A running
107 * migration may be using the cache and might finish during this call,
108 * hence changes to the cache are protected by XBZRLE.lock.
110 * Returns the new_size or negative in case of error.
112 * @new_size: new cache size
114 int64_t xbzrle_cache_resize(int64_t new_size)
116 PageCache *new_cache;
117 int64_t ret;
119 if (new_size < TARGET_PAGE_SIZE) {
120 return -1;
123 XBZRLE_cache_lock();
125 if (XBZRLE.cache != NULL) {
126 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
127 goto out_new_size;
129 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
130 TARGET_PAGE_SIZE);
131 if (!new_cache) {
132 error_report("Error creating cache");
133 ret = -1;
134 goto out;
137 cache_fini(XBZRLE.cache);
138 XBZRLE.cache = new_cache;
141 out_new_size:
142 ret = pow2floor(new_size);
143 out:
144 XBZRLE_cache_unlock();
145 return ret;
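/*
* Rough illustration: the returned size is rounded down to a power of two,
* so asking for a 600 MiB cache ends up reporting pow2floor(600 MiB) =
* 512 MiB, and any request smaller than one target page is rejected with -1.
*/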
149 * An outstanding page request, on the source, having been received
150 * and queued
152 struct RAMSrcPageRequest {
153 RAMBlock *rb;
154 hwaddr offset;
155 hwaddr len;
157 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
160 /* State of RAM for migration */
161 struct RAMState {
162 /* QEMUFile used for this migration */
163 QEMUFile *f;
164 /* Last block that we have visited searching for dirty pages */
165 RAMBlock *last_seen_block;
166 /* Last block from where we have sent data */
167 RAMBlock *last_sent_block;
168 /* Last dirty target page we have sent */
169 ram_addr_t last_page;
170 /* last ram version we have seen */
171 uint32_t last_version;
172 /* We are in the first round */
173 bool ram_bulk_stage;
174 /* How many times we have dirty too many pages */
175 int dirty_rate_high_cnt;
176 /* How many times we have synchronized the bitmap */
177 uint64_t bitmap_sync_count;
178 /* these variables are used for bitmap sync */
179 /* last time we did a full bitmap_sync */
180 int64_t time_last_bitmap_sync;
181 /* bytes transferred at start_time */
182 uint64_t bytes_xfer_prev;
183 /* number of dirty pages since start_time */
184 uint64_t num_dirty_pages_period;
185 /* xbzrle misses since the beginning of the period */
186 uint64_t xbzrle_cache_miss_prev;
187 /* number of iterations at the beginning of period */
188 uint64_t iterations_prev;
189 /* Accounting fields */
190 /* number of zero pages. It used to be pages filled by the same char. */
191 uint64_t zero_pages;
192 /* number of normal transferred pages */
193 uint64_t norm_pages;
194 /* Iterations since start */
195 uint64_t iterations;
196 /* xbzrle transmitted bytes. Note that these are compressed, so
197 * they can't be calculated from the page counts */
198 uint64_t xbzrle_bytes;
199 /* xbzrle transmitted pages */
200 uint64_t xbzrle_pages;
201 /* xbzrle number of cache miss */
202 uint64_t xbzrle_cache_miss;
203 /* xbzrle miss rate */
204 double xbzrle_cache_miss_rate;
205 /* xbzrle number of overflows */
206 uint64_t xbzrle_overflows;
207 /* number of dirty bits in the bitmap */
208 uint64_t migration_dirty_pages;
209 /* total number of bytes transferred */
210 uint64_t bytes_transferred;
211 /* number of dirtied pages in the last second */
212 uint64_t dirty_pages_rate;
213 /* Count of requests incoming from destination */
214 uint64_t postcopy_requests;
215 /* protects modification of the bitmap */
216 QemuMutex bitmap_mutex;
217 /* The RAMBlock used in the last src_page_requests */
218 RAMBlock *last_req_rb;
219 /* Queue of outstanding page requests from the destination */
220 QemuMutex src_page_req_mutex;
221 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
223 typedef struct RAMState RAMState;
225 static RAMState ram_state;
227 uint64_t dup_mig_pages_transferred(void)
229 return ram_state.zero_pages;
232 uint64_t norm_mig_pages_transferred(void)
234 return ram_state.norm_pages;
237 uint64_t xbzrle_mig_bytes_transferred(void)
239 return ram_state.xbzrle_bytes;
242 uint64_t xbzrle_mig_pages_transferred(void)
244 return ram_state.xbzrle_pages;
247 uint64_t xbzrle_mig_pages_cache_miss(void)
249 return ram_state.xbzrle_cache_miss;
252 double xbzrle_mig_cache_miss_rate(void)
254 return ram_state.xbzrle_cache_miss_rate;
257 uint64_t xbzrle_mig_pages_overflow(void)
259 return ram_state.xbzrle_overflows;
262 uint64_t ram_bytes_transferred(void)
264 return ram_state.bytes_transferred;
267 uint64_t ram_bytes_remaining(void)
269 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
272 uint64_t ram_dirty_sync_count(void)
274 return ram_state.bitmap_sync_count;
277 uint64_t ram_dirty_pages_rate(void)
279 return ram_state.dirty_pages_rate;
282 uint64_t ram_postcopy_requests(void)
284 return ram_state.postcopy_requests;
287 /* used by the search for pages to send */
288 struct PageSearchStatus {
289 /* Current block being searched */
290 RAMBlock *block;
291 /* Current page to search from */
292 unsigned long page;
293 /* Set once we wrap around */
294 bool complete_round;
296 typedef struct PageSearchStatus PageSearchStatus;
298 struct CompressParam {
299 bool done;
300 bool quit;
301 QEMUFile *file;
302 QemuMutex mutex;
303 QemuCond cond;
304 RAMBlock *block;
305 ram_addr_t offset;
307 typedef struct CompressParam CompressParam;
309 struct DecompressParam {
310 bool done;
311 bool quit;
312 QemuMutex mutex;
313 QemuCond cond;
314 void *des;
315 uint8_t *compbuf;
316 int len;
318 typedef struct DecompressParam DecompressParam;
320 static CompressParam *comp_param;
321 static QemuThread *compress_threads;
322 /* comp_done_cond is used to wake up the migration thread when
323 * one of the compression threads has finished the compression.
324 * comp_done_lock is used to co-work with comp_done_cond.
326 static QemuMutex comp_done_lock;
327 static QemuCond comp_done_cond;
328 /* The empty QEMUFileOps will be used by file in CompressParam */
329 static const QEMUFileOps empty_ops = { };
331 static DecompressParam *decomp_param;
332 static QemuThread *decompress_threads;
333 static QemuMutex decomp_done_lock;
334 static QemuCond decomp_done_cond;
336 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
337 ram_addr_t offset);
339 static void *do_data_compress(void *opaque)
341 CompressParam *param = opaque;
342 RAMBlock *block;
343 ram_addr_t offset;
345 qemu_mutex_lock(&param->mutex);
346 while (!param->quit) {
347 if (param->block) {
348 block = param->block;
349 offset = param->offset;
350 param->block = NULL;
351 qemu_mutex_unlock(&param->mutex);
353 do_compress_ram_page(param->file, block, offset);
355 qemu_mutex_lock(&comp_done_lock);
356 param->done = true;
357 qemu_cond_signal(&comp_done_cond);
358 qemu_mutex_unlock(&comp_done_lock);
360 qemu_mutex_lock(&param->mutex);
361 } else {
362 qemu_cond_wait(&param->cond, &param->mutex);
365 qemu_mutex_unlock(&param->mutex);
367 return NULL;
370 static inline void terminate_compression_threads(void)
372 int idx, thread_count;
374 thread_count = migrate_compress_threads();
376 for (idx = 0; idx < thread_count; idx++) {
377 qemu_mutex_lock(&comp_param[idx].mutex);
378 comp_param[idx].quit = true;
379 qemu_cond_signal(&comp_param[idx].cond);
380 qemu_mutex_unlock(&comp_param[idx].mutex);
384 void migrate_compress_threads_join(void)
386 int i, thread_count;
388 if (!migrate_use_compression()) {
389 return;
391 terminate_compression_threads();
392 thread_count = migrate_compress_threads();
393 for (i = 0; i < thread_count; i++) {
394 qemu_thread_join(compress_threads + i);
395 qemu_fclose(comp_param[i].file);
396 qemu_mutex_destroy(&comp_param[i].mutex);
397 qemu_cond_destroy(&comp_param[i].cond);
399 qemu_mutex_destroy(&comp_done_lock);
400 qemu_cond_destroy(&comp_done_cond);
401 g_free(compress_threads);
402 g_free(comp_param);
403 compress_threads = NULL;
404 comp_param = NULL;
407 void migrate_compress_threads_create(void)
409 int i, thread_count;
411 if (!migrate_use_compression()) {
412 return;
414 thread_count = migrate_compress_threads();
415 compress_threads = g_new0(QemuThread, thread_count);
416 comp_param = g_new0(CompressParam, thread_count);
417 qemu_cond_init(&comp_done_cond);
418 qemu_mutex_init(&comp_done_lock);
419 for (i = 0; i < thread_count; i++) {
420 /* comp_param[i].file is just used as a dummy buffer to save data,
421 * set its ops to empty.
423 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
424 comp_param[i].done = true;
425 comp_param[i].quit = false;
426 qemu_mutex_init(&comp_param[i].mutex);
427 qemu_cond_init(&comp_param[i].cond);
428 qemu_thread_create(compress_threads + i, "compress",
429 do_data_compress, comp_param + i,
430 QEMU_THREAD_JOINABLE);
435 * save_page_header: write page header to wire
437 * If this is the 1st block, it also writes the block identification
439 * Returns the number of bytes written
441 * @f: QEMUFile where to send the data
442 * @block: block that contains the page we want to send
443 * @offset: offset inside the block for the page
444 * in the lower bits, it contains flags
446 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
447 ram_addr_t offset)
449 size_t size, len;
451 if (block == rs->last_sent_block) {
452 offset |= RAM_SAVE_FLAG_CONTINUE;
454 qemu_put_be64(f, offset);
455 size = 8;
457 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
458 len = strlen(block->idstr);
459 qemu_put_byte(f, len);
460 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
461 size += 1 + len;
462 rs->last_sent_block = block;
464 return size;
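/*
* For reference, the header written above is: 8 bytes of be64 offset|flags,
* optionally followed by a one-byte idstr length and the idstr itself when
* the page starts a new RAMBlock (i.e. RAM_SAVE_FLAG_CONTINUE is not set).
* The returned size is therefore either 8 or 8 + 1 + strlen(idstr).
*/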
468 * mig_throttle_guest_down: throttle down the guest
470 * Reduce amount of guest cpu execution to hopefully slow down memory
471 * writes. If guest dirty memory rate is reduced below the rate at
472 * which we can transfer pages to the destination then we should be
473 * able to complete migration. Some workloads dirty memory way too
474 * fast and will not effectively converge, even with auto-converge.
476 static void mig_throttle_guest_down(void)
478 MigrationState *s = migrate_get_current();
479 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
480 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
482 /* We have not started throttling yet. Let's start it. */
483 if (!cpu_throttle_active()) {
484 cpu_throttle_set(pct_initial);
485 } else {
486 /* Throttling already on, just increase the rate */
487 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
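/*
* Sketch of the effect (hypothetical parameter values): with
* cpu_throttle_initial = 20 and cpu_throttle_increment = 10, successive
* calls throttle the vCPUs at 20%, 30%, 40%, ... until the migration
* converges or the throttle hits its upper limit.
*/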
492 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
494 * @rs: current RAM state
495 * @current_addr: address for the zero page
497 * Update the xbzrle cache to reflect a page that's been sent as all 0.
498 * The important thing is that a stale (not-yet-0'd) page be replaced
499 * by the new data.
500 * As a bonus, if the page wasn't in the cache it gets added so that
501 * when a small write is made into the 0'd page it gets XBZRLE sent.
503 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
505 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
506 return;
509 /* We don't care if this fails to allocate a new cache page
510 * as long as it updated an old one */
511 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
512 rs->bitmap_sync_count);
515 #define ENCODING_FLAG_XBZRLE 0x1
518 * save_xbzrle_page: compress and send current page
520 * Returns: 1 means that we wrote the page
521 * 0 means that page is identical to the one already sent
522 * -1 means that xbzrle would be longer than normal
524 * @rs: current RAM state
525 * @current_data: pointer to the address of the page contents
526 * @current_addr: addr of the page
527 * @block: block that contains the page we want to send
528 * @offset: offset inside the block for the page
529 * @last_stage: if we are at the completion stage
531 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
532 ram_addr_t current_addr, RAMBlock *block,
533 ram_addr_t offset, bool last_stage)
535 int encoded_len = 0, bytes_xbzrle;
536 uint8_t *prev_cached_page;
538 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
539 rs->xbzrle_cache_miss++;
540 if (!last_stage) {
541 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
542 rs->bitmap_sync_count) == -1) {
543 return -1;
544 } else {
545 /* update *current_data when the page has been
546 inserted into cache */
547 *current_data = get_cached_data(XBZRLE.cache, current_addr);
550 return -1;
553 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
555 /* save current buffer into memory */
556 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
558 /* XBZRLE encoding (if there is no overflow) */
559 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
560 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
561 TARGET_PAGE_SIZE);
562 if (encoded_len == 0) {
563 trace_save_xbzrle_page_skipping();
564 return 0;
565 } else if (encoded_len == -1) {
566 trace_save_xbzrle_page_overflow();
567 rs->xbzrle_overflows++;
568 /* update data in the cache */
569 if (!last_stage) {
570 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
571 *current_data = prev_cached_page;
573 return -1;
576 /* we need to update the data in the cache, in order to get the same data */
577 if (!last_stage) {
578 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
581 /* Send XBZRLE based compressed page */
582 bytes_xbzrle = save_page_header(rs, rs->f, block,
583 offset | RAM_SAVE_FLAG_XBZRLE);
584 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
585 qemu_put_be16(rs->f, encoded_len);
586 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
587 bytes_xbzrle += encoded_len + 1 + 2;
588 rs->xbzrle_pages++;
589 rs->xbzrle_bytes += bytes_xbzrle;
590 rs->bytes_transferred += bytes_xbzrle;
592 return 1;
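/*
* For reference, an XBZRLE page as sent above consists of: the usual page
* header (offset | RAM_SAVE_FLAG_XBZRLE), one byte of ENCODING_FLAG_XBZRLE,
* a be16 encoded length and the encoded buffer itself -- hence the
* "encoded_len + 1 + 2" added to bytes_xbzrle.
*/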
596 * migration_bitmap_find_dirty: find the next dirty page from start
598 * Called with rcu_read_lock() to protect migration_bitmap
600 * Returns the byte offset within memory region of the start of a dirty page
602 * @rs: current RAM state
603 * @rb: RAMBlock where to search for dirty pages
604 * @start: page where we start the search
606 static inline
607 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
608 unsigned long start)
610 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
611 unsigned long *bitmap = rb->bmap;
612 unsigned long next;
614 if (rs->ram_bulk_stage && start > 0) {
615 next = start + 1;
616 } else {
617 next = find_next_bit(bitmap, size, start);
620 return next;
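/*
* Note the bulk-stage shortcut above: during the first pass over RAM every
* page is assumed dirty, so instead of scanning the bitmap we can simply
* return start + 1.
*/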
623 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
624 RAMBlock *rb,
625 unsigned long page)
627 bool ret;
629 ret = test_and_clear_bit(page, rb->bmap);
631 if (ret) {
632 rs->migration_dirty_pages--;
634 return ret;
637 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
638 ram_addr_t start, ram_addr_t length)
640 rs->migration_dirty_pages +=
641 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
642 &rs->num_dirty_pages_period);
646 * ram_pagesize_summary: calculate all the pagesizes of a VM
648 * Returns a summary bitmap of the page sizes of all RAMBlocks
650 * For VMs with just normal pages this is equivalent to the host page
651 * size. If it's got some huge pages then it's the OR of all the
652 * different page sizes.
654 uint64_t ram_pagesize_summary(void)
656 RAMBlock *block;
657 uint64_t summary = 0;
659 RAMBLOCK_FOREACH(block) {
660 summary |= block->page_size;
663 return summary;
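/*
* Worked example (illustrative sizes): a guest with ordinary 4 KiB pages
* plus one 2 MiB hugepage-backed RAMBlock yields a summary of
* 0x1000 | 0x200000 = 0x201000.
*/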
666 static void migration_bitmap_sync(RAMState *rs)
668 RAMBlock *block;
669 int64_t end_time;
670 uint64_t bytes_xfer_now;
672 rs->bitmap_sync_count++;
674 if (!rs->bytes_xfer_prev) {
675 rs->bytes_xfer_prev = ram_bytes_transferred();
678 if (!rs->time_last_bitmap_sync) {
679 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
682 trace_migration_bitmap_sync_start();
683 memory_global_dirty_log_sync();
685 qemu_mutex_lock(&rs->bitmap_mutex);
686 rcu_read_lock();
687 RAMBLOCK_FOREACH(block) {
688 migration_bitmap_sync_range(rs, block, 0, block->used_length);
690 rcu_read_unlock();
691 qemu_mutex_unlock(&rs->bitmap_mutex);
693 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
695 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
697 /* more than 1 second = 1000 milliseconds */
698 if (end_time > rs->time_last_bitmap_sync + 1000) {
699 if (migrate_auto_converge()) {
700 /* The following detection logic can be refined later. For now:
701 Check to see if the dirtied bytes are 50% more than the approx.
702 amount of bytes that just got transferred since the last time we
703 were in this routine. If that happens twice, start or increase
704 throttling */
705 bytes_xfer_now = ram_bytes_transferred();
707 if (rs->dirty_pages_rate &&
708 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
709 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
710 (rs->dirty_rate_high_cnt++ >= 2)) {
711 trace_migration_throttle();
712 rs->dirty_rate_high_cnt = 0;
713 mig_throttle_guest_down();
715 rs->bytes_xfer_prev = bytes_xfer_now;
718 if (migrate_use_xbzrle()) {
719 if (rs->iterations_prev != rs->iterations) {
720 rs->xbzrle_cache_miss_rate =
721 (double)(rs->xbzrle_cache_miss -
722 rs->xbzrle_cache_miss_prev) /
723 (rs->iterations - rs->iterations_prev);
725 rs->iterations_prev = rs->iterations;
726 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
728 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
729 / (end_time - rs->time_last_bitmap_sync);
730 rs->time_last_bitmap_sync = end_time;
731 rs->num_dirty_pages_period = 0;
733 if (migrate_use_events()) {
734 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
739 * save_zero_page: send the zero page to the stream
741 * Returns the number of pages written.
743 * @rs: current RAM state
744 * @block: block that contains the page we want to send
745 * @offset: offset inside the block for the page
746 * @p: pointer to the page
748 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
749 uint8_t *p)
751 int pages = -1;
753 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
754 rs->zero_pages++;
755 rs->bytes_transferred +=
756 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
757 qemu_put_byte(rs->f, 0);
758 rs->bytes_transferred += 1;
759 pages = 1;
762 return pages;
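/*
* A zero page therefore costs only the page header plus a single zero byte
* on the wire, instead of a full TARGET_PAGE_SIZE of data.
*/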
765 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
767 if (!migrate_release_ram() || !migration_in_postcopy()) {
768 return;
771 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
775 * ram_save_page: send the given page to the stream
777 * Returns the number of pages written.
778 * < 0 - error
779 * >=0 - Number of pages written - this might legally be 0
780 * if xbzrle noticed the page was the same.
782 * @rs: current RAM state
783 * @block: block that contains the page we want to send
784 * @offset: offset inside the block for the page
785 * @last_stage: if we are at the completion stage
787 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
789 int pages = -1;
790 uint64_t bytes_xmit;
791 ram_addr_t current_addr;
792 uint8_t *p;
793 int ret;
794 bool send_async = true;
795 RAMBlock *block = pss->block;
796 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
798 p = block->host + offset;
799 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
801 /* When in doubt, send the page as a normal page */
802 bytes_xmit = 0;
803 ret = ram_control_save_page(rs->f, block->offset,
804 offset, TARGET_PAGE_SIZE, &bytes_xmit);
805 if (bytes_xmit) {
806 rs->bytes_transferred += bytes_xmit;
807 pages = 1;
810 XBZRLE_cache_lock();
812 current_addr = block->offset + offset;
814 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
815 if (ret != RAM_SAVE_CONTROL_DELAYED) {
816 if (bytes_xmit > 0) {
817 rs->norm_pages++;
818 } else if (bytes_xmit == 0) {
819 rs->zero_pages++;
822 } else {
823 pages = save_zero_page(rs, block, offset, p);
824 if (pages > 0) {
825 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
826 * page would be stale
828 xbzrle_cache_zero_page(rs, current_addr);
829 ram_release_pages(block->idstr, offset, pages);
830 } else if (!rs->ram_bulk_stage &&
831 !migration_in_postcopy() && migrate_use_xbzrle()) {
832 pages = save_xbzrle_page(rs, &p, current_addr, block,
833 offset, last_stage);
834 if (!last_stage) {
835 /* Can't send this cached data async, since the cache page
836 * might get updated before it gets to the wire
838 send_async = false;
843 /* XBZRLE overflow or normal page */
844 if (pages == -1) {
845 rs->bytes_transferred += save_page_header(rs, rs->f, block,
846 offset | RAM_SAVE_FLAG_PAGE);
847 if (send_async) {
848 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
849 migrate_release_ram() &
850 migration_in_postcopy());
851 } else {
852 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
854 rs->bytes_transferred += TARGET_PAGE_SIZE;
855 pages = 1;
856 rs->norm_pages++;
859 XBZRLE_cache_unlock();
861 return pages;
864 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
865 ram_addr_t offset)
867 RAMState *rs = &ram_state;
868 int bytes_sent, blen;
869 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
871 bytes_sent = save_page_header(rs, f, block, offset |
872 RAM_SAVE_FLAG_COMPRESS_PAGE);
873 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
874 migrate_compress_level());
875 if (blen < 0) {
876 bytes_sent = 0;
877 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
878 error_report("compressed data failed!");
879 } else {
880 bytes_sent += blen;
881 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
884 return bytes_sent;
887 static void flush_compressed_data(RAMState *rs)
889 int idx, len, thread_count;
891 if (!migrate_use_compression()) {
892 return;
894 thread_count = migrate_compress_threads();
896 qemu_mutex_lock(&comp_done_lock);
897 for (idx = 0; idx < thread_count; idx++) {
898 while (!comp_param[idx].done) {
899 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
902 qemu_mutex_unlock(&comp_done_lock);
904 for (idx = 0; idx < thread_count; idx++) {
905 qemu_mutex_lock(&comp_param[idx].mutex);
906 if (!comp_param[idx].quit) {
907 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
908 rs->bytes_transferred += len;
910 qemu_mutex_unlock(&comp_param[idx].mutex);
914 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
915 ram_addr_t offset)
917 param->block = block;
918 param->offset = offset;
921 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
922 ram_addr_t offset)
924 int idx, thread_count, bytes_xmit = -1, pages = -1;
926 thread_count = migrate_compress_threads();
927 qemu_mutex_lock(&comp_done_lock);
928 while (true) {
929 for (idx = 0; idx < thread_count; idx++) {
930 if (comp_param[idx].done) {
931 comp_param[idx].done = false;
932 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
933 qemu_mutex_lock(&comp_param[idx].mutex);
934 set_compress_params(&comp_param[idx], block, offset);
935 qemu_cond_signal(&comp_param[idx].cond);
936 qemu_mutex_unlock(&comp_param[idx].mutex);
937 pages = 1;
938 rs->norm_pages++;
939 rs->bytes_transferred += bytes_xmit;
940 break;
943 if (pages > 0) {
944 break;
945 } else {
946 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
949 qemu_mutex_unlock(&comp_done_lock);
951 return pages;
955 * ram_save_compressed_page: compress the given page and send it to the stream
957 * Returns the number of pages written.
959 * @rs: current RAM state
960 * @block: block that contains the page we want to send
961 * @offset: offset inside the block for the page
962 * @last_stage: if we are at the completion stage
964 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
965 bool last_stage)
967 int pages = -1;
968 uint64_t bytes_xmit = 0;
969 uint8_t *p;
970 int ret, blen;
971 RAMBlock *block = pss->block;
972 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
974 p = block->host + offset;
976 ret = ram_control_save_page(rs->f, block->offset,
977 offset, TARGET_PAGE_SIZE, &bytes_xmit);
978 if (bytes_xmit) {
979 rs->bytes_transferred += bytes_xmit;
980 pages = 1;
982 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
983 if (ret != RAM_SAVE_CONTROL_DELAYED) {
984 if (bytes_xmit > 0) {
985 rs->norm_pages++;
986 } else if (bytes_xmit == 0) {
987 rs->zero_pages++;
990 } else {
991 /* When starting the process of a new block, the first page of
992 * the block should be sent out before other pages in the same
993 * block, and all the pages in last block should have been sent
994 * out, keeping this order is important, because the 'cont' flag
995 * is used to avoid resending the block name.
997 if (block != rs->last_sent_block) {
998 flush_compressed_data(rs);
999 pages = save_zero_page(rs, block, offset, p);
1000 if (pages == -1) {
1001 /* Make sure the first page is sent out before other pages */
1002 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1003 RAM_SAVE_FLAG_COMPRESS_PAGE);
1004 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1005 migrate_compress_level());
1006 if (blen > 0) {
1007 rs->bytes_transferred += bytes_xmit + blen;
1008 rs->norm_pages++;
1009 pages = 1;
1010 } else {
1011 qemu_file_set_error(rs->f, blen);
1012 error_report("compressed data failed!");
1015 if (pages > 0) {
1016 ram_release_pages(block->idstr, offset, pages);
1018 } else {
1019 pages = save_zero_page(rs, block, offset, p);
1020 if (pages == -1) {
1021 pages = compress_page_with_multi_thread(rs, block, offset);
1022 } else {
1023 ram_release_pages(block->idstr, offset, pages);
1028 return pages;
1032 * find_dirty_block: find the next dirty page and update any state
1033 * associated with the search process.
1035 * Returns whether a page was found
1037 * @rs: current RAM state
1038 * @pss: data about the state of the current dirty page scan
1039 * @again: set to false if the search has scanned the whole of RAM
1041 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1043 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1044 if (pss->complete_round && pss->block == rs->last_seen_block &&
1045 pss->page >= rs->last_page) {
1047 * We've been once around the RAM and haven't found anything.
1048 * Give up.
1050 *again = false;
1051 return false;
1053 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1054 /* Didn't find anything in this RAM Block */
1055 pss->page = 0;
1056 pss->block = QLIST_NEXT_RCU(pss->block, next);
1057 if (!pss->block) {
1058 /* Hit the end of the list */
1059 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1060 /* Flag that we've looped */
1061 pss->complete_round = true;
1062 rs->ram_bulk_stage = false;
1063 if (migrate_use_xbzrle()) {
1064 /* If xbzrle is on, stop using the data compression at this
1065 * point. In theory, xbzrle can do better than compression.
1067 flush_compressed_data(rs);
1070 /* Didn't find anything this time, but try again on the new block */
1071 *again = true;
1072 return false;
1073 } else {
1074 /* Can go around again, but... */
1075 *again = true;
1076 /* We've found something so probably don't need to */
1077 return true;
1082 * unqueue_page: gets a page off the queue
1084 * Helper for 'get_queued_page' - gets a page off the queue
1086 * Returns the block of the page (or NULL if none available)
1088 * @rs: current RAM state
1089 * @offset: used to return the offset within the RAMBlock
1091 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1093 RAMBlock *block = NULL;
1095 qemu_mutex_lock(&rs->src_page_req_mutex);
1096 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1097 struct RAMSrcPageRequest *entry =
1098 QSIMPLEQ_FIRST(&rs->src_page_requests);
1099 block = entry->rb;
1100 *offset = entry->offset;
1102 if (entry->len > TARGET_PAGE_SIZE) {
1103 entry->len -= TARGET_PAGE_SIZE;
1104 entry->offset += TARGET_PAGE_SIZE;
1105 } else {
1106 memory_region_unref(block->mr);
1107 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1108 g_free(entry);
1111 qemu_mutex_unlock(&rs->src_page_req_mutex);
1113 return block;
1117 * get_queued_page: unqueue a page from the postcopy requests
1119 * Skips pages that are already sent (!dirty)
1121 * Returns whether a queued page was found
1123 * @rs: current RAM state
1124 * @pss: data about the state of the current dirty page scan
1126 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1128 RAMBlock *block;
1129 ram_addr_t offset;
1130 bool dirty;
1132 do {
1133 block = unqueue_page(rs, &offset);
1135 * We're sending this page, and since it's postcopy nothing else
1136 * will dirty it, and we must make sure it doesn't get sent again
1137 * even if this queue request was received after the background
1138 * search already sent it.
1140 if (block) {
1141 unsigned long page;
1143 page = offset >> TARGET_PAGE_BITS;
1144 dirty = test_bit(page, block->bmap);
1145 if (!dirty) {
1146 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1147 page, test_bit(page, block->unsentmap));
1148 } else {
1149 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1153 } while (block && !dirty);
1155 if (block) {
1157 * As soon as we start servicing pages out of order, then we have
1158 * to kill the bulk stage, since the bulk stage assumes
1159 * in (migration_bitmap_find_and_reset_dirty) that every page is
1160 * dirty, that's no longer true.
1162 rs->ram_bulk_stage = false;
1165 * We want the background search to continue from the queued page
1166 * since the guest is likely to want other pages near to the page
1167 * it just requested.
1169 pss->block = block;
1170 pss->page = offset >> TARGET_PAGE_BITS;
1173 return !!block;
1177 * migration_page_queue_free: drop any remaining pages in the ram
1178 * request queue
1180 * It should be empty at the end anyway, but in error cases there may
1181 * be some left; if any pages are left, we drop them.
1182 */
1184 void migration_page_queue_free(void)
1186 struct RAMSrcPageRequest *mspr, *next_mspr;
1187 RAMState *rs = &ram_state;
1188 /* This queue generally should be empty - but in the case of a failed
1189 * migration might have some droppings in.
1191 rcu_read_lock();
1192 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1193 memory_region_unref(mspr->rb->mr);
1194 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1195 g_free(mspr);
1197 rcu_read_unlock();
1201 * ram_save_queue_pages: queue the page for transmission
1203 * A request from postcopy destination for example.
1205 * Returns zero on success or negative on error
1207 * @rbname: Name of the RAMBlock of the request. NULL means the
1208 * same as the last one.
1209 * @start: starting address from the start of the RAMBlock
1210 * @len: length (in bytes) to send
1212 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1214 RAMBlock *ramblock;
1215 RAMState *rs = &ram_state;
1217 rs->postcopy_requests++;
1218 rcu_read_lock();
1219 if (!rbname) {
1220 /* Reuse last RAMBlock */
1221 ramblock = rs->last_req_rb;
1223 if (!ramblock) {
1225 * Shouldn't happen, we can't reuse the last RAMBlock if
1226 * it's the 1st request.
1228 error_report("ram_save_queue_pages no previous block");
1229 goto err;
1231 } else {
1232 ramblock = qemu_ram_block_by_name(rbname);
1234 if (!ramblock) {
1235 /* We shouldn't be asked for a non-existent RAMBlock */
1236 error_report("ram_save_queue_pages no block '%s'", rbname);
1237 goto err;
1239 rs->last_req_rb = ramblock;
1241 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1242 if (start+len > ramblock->used_length) {
1243 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1244 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1245 __func__, start, len, ramblock->used_length);
1246 goto err;
1249 struct RAMSrcPageRequest *new_entry =
1250 g_malloc0(sizeof(struct RAMSrcPageRequest));
1251 new_entry->rb = ramblock;
1252 new_entry->offset = start;
1253 new_entry->len = len;
1255 memory_region_ref(ramblock->mr);
1256 qemu_mutex_lock(&rs->src_page_req_mutex);
1257 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1258 qemu_mutex_unlock(&rs->src_page_req_mutex);
1259 rcu_read_unlock();
1261 return 0;
1263 err:
1264 rcu_read_unlock();
1265 return -1;
1269 * ram_save_target_page: save one target page
1271 * Returns the number of pages written
1273 * @rs: current RAM state
1274 * @ms: current migration state
1275 * @pss: data about the page we want to send
1276 * @last_stage: if we are at the completion stage
1278 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1279 bool last_stage)
1281 int res = 0;
1283 /* Check whether the page is dirty and, if so, send it */
1284 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1286 * If xbzrle is on, stop using the data compression after first
1287 * round of migration even if compression is enabled. In theory,
1288 * xbzrle can do better than compression.
1290 if (migrate_use_compression() &&
1291 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1292 res = ram_save_compressed_page(rs, pss, last_stage);
1293 } else {
1294 res = ram_save_page(rs, pss, last_stage);
1297 if (res < 0) {
1298 return res;
1300 if (pss->block->unsentmap) {
1301 clear_bit(pss->page, pss->block->unsentmap);
1305 return res;
1309 * ram_save_host_page: save a whole host page
1311 * Starting at *offset send pages up to the end of the current host
1312 * page. It's valid for the initial offset to point into the middle of
1313 * a host page in which case the remainder of the hostpage is sent.
1314 * Only dirty target pages are sent. Note that the host page size may
1315 * be a huge page for this block.
1316 * The saving stops at the boundary of the used_length of the block
1317 * if the RAMBlock isn't a multiple of the host page size.
1319 * Returns the number of pages written or negative on error
1321 * @rs: current RAM state
1322 * @ms: current migration state
1323 * @pss: data about the page we want to send
1324 * @last_stage: if we are at the completion stage
1326 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1327 bool last_stage)
1329 int tmppages, pages = 0;
1330 size_t pagesize_bits =
1331 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1333 do {
1334 tmppages = ram_save_target_page(rs, pss, last_stage);
1335 if (tmppages < 0) {
1336 return tmppages;
1339 pages += tmppages;
1340 pss->page++;
1341 } while ((pss->page & (pagesize_bits - 1)) &&
1342 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1344 /* The offset we leave with is the last one we looked at */
1345 pss->page--;
1346 return pages;
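/*
* Rough example: for a RAMBlock backed by 2 MiB huge pages with a 4 KiB
* target page size, pagesize_bits is 512, so a single call can send up to
* 512 target pages (the dirty ones within that host page).
*/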
1350 * ram_find_and_save_block: finds a dirty page and sends it to f
1352 * Called within an RCU critical section.
1354 * Returns the number of pages written where zero means no dirty pages
1356 * @rs: current RAM state
1357 * @last_stage: if we are at the completion stage
1359 * On systems where host-page-size > target-page-size it will send all the
1360 * pages in a host page that are dirty.
1363 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1365 PageSearchStatus pss;
1366 int pages = 0;
1367 bool again, found;
1369 /* No dirty page as there is zero RAM */
1370 if (!ram_bytes_total()) {
1371 return pages;
1374 pss.block = rs->last_seen_block;
1375 pss.page = rs->last_page;
1376 pss.complete_round = false;
1378 if (!pss.block) {
1379 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1382 do {
1383 again = true;
1384 found = get_queued_page(rs, &pss);
1386 if (!found) {
1387 /* priority queue empty, so just search for something dirty */
1388 found = find_dirty_block(rs, &pss, &again);
1391 if (found) {
1392 pages = ram_save_host_page(rs, &pss, last_stage);
1394 } while (!pages && again);
1396 rs->last_seen_block = pss.block;
1397 rs->last_page = pss.page;
1399 return pages;
1402 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1404 uint64_t pages = size / TARGET_PAGE_SIZE;
1405 RAMState *rs = &ram_state;
1407 if (zero) {
1408 rs->zero_pages += pages;
1409 } else {
1410 rs->norm_pages += pages;
1411 rs->bytes_transferred += size;
1412 qemu_update_position(f, size);
1416 uint64_t ram_bytes_total(void)
1418 RAMBlock *block;
1419 uint64_t total = 0;
1421 rcu_read_lock();
1422 RAMBLOCK_FOREACH(block) {
1423 total += block->used_length;
1425 rcu_read_unlock();
1426 return total;
1429 void free_xbzrle_decoded_buf(void)
1431 g_free(xbzrle_decoded_buf);
1432 xbzrle_decoded_buf = NULL;
1435 static void ram_migration_cleanup(void *opaque)
1437 RAMBlock *block;
1439 /* The caller holds the iothread lock or is in a bottom half, so there is
1440 * no write race against this migration bitmap
1441 */
1442 memory_global_dirty_log_stop();
1444 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1445 g_free(block->bmap);
1446 block->bmap = NULL;
1447 g_free(block->unsentmap);
1448 block->unsentmap = NULL;
1451 XBZRLE_cache_lock();
1452 if (XBZRLE.cache) {
1453 cache_fini(XBZRLE.cache);
1454 g_free(XBZRLE.encoded_buf);
1455 g_free(XBZRLE.current_buf);
1456 g_free(ZERO_TARGET_PAGE);
1457 XBZRLE.cache = NULL;
1458 XBZRLE.encoded_buf = NULL;
1459 XBZRLE.current_buf = NULL;
1461 XBZRLE_cache_unlock();
1464 static void ram_state_reset(RAMState *rs)
1466 rs->last_seen_block = NULL;
1467 rs->last_sent_block = NULL;
1468 rs->last_page = 0;
1469 rs->last_version = ram_list.version;
1470 rs->ram_bulk_stage = true;
1473 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1476 * 'expected' is the value you expect the bitmap mostly to be full
1477 * of; it won't bother printing lines that are all this value.
1478 * If 'todump' is null the migration bitmap is dumped.
1480 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1481 unsigned long pages)
1483 int64_t cur;
1484 int64_t linelen = 128;
1485 char linebuf[129];
1487 for (cur = 0; cur < pages; cur += linelen) {
1488 int64_t curb;
1489 bool found = false;
1491 * Last line; catch the case where the line length
1492 * is longer than remaining ram
1494 if (cur + linelen > pages) {
1495 linelen = pages - cur;
1497 for (curb = 0; curb < linelen; curb++) {
1498 bool thisbit = test_bit(cur + curb, todump);
1499 linebuf[curb] = thisbit ? '1' : '.';
1500 found = found || (thisbit != expected);
1502 if (found) {
1503 linebuf[curb] = '\0';
1504 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1509 /* **** functions for postcopy ***** */
1511 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1513 struct RAMBlock *block;
1515 RAMBLOCK_FOREACH(block) {
1516 unsigned long *bitmap = block->bmap;
1517 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1518 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1520 while (run_start < range) {
1521 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1522 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1523 (run_end - run_start) << TARGET_PAGE_BITS);
1524 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1530 * postcopy_send_discard_bm_ram: discard a RAMBlock
1532 * Returns zero on success
1534 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1535 * Note: At this point the 'unsentmap' is the processed bitmap combined
1536 * with the dirtymap; so a '1' means it's either dirty or unsent.
1538 * @ms: current migration state
1539 * @pds: state for postcopy
1540 * @block: RAMBlock whose unsent/dirty pages are discarded
1543 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1544 PostcopyDiscardState *pds,
1545 RAMBlock *block)
1547 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1548 unsigned long current;
1549 unsigned long *unsentmap = block->unsentmap;
1551 for (current = 0; current < end; ) {
1552 unsigned long one = find_next_bit(unsentmap, end, current);
1554 if (one <= end) {
1555 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1556 unsigned long discard_length;
1558 if (zero >= end) {
1559 discard_length = end - one;
1560 } else {
1561 discard_length = zero - one;
1563 if (discard_length) {
1564 postcopy_discard_send_range(ms, pds, one, discard_length);
1566 current = one + discard_length;
1567 } else {
1568 current = one;
1572 return 0;
1576 * postcopy_each_ram_send_discard: discard all RAMBlocks
1578 * Returns 0 for success or negative for error
1580 * Utility for the outgoing postcopy code.
1581 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1582 * passing it bitmap indexes and name.
1583 * (qemu_ram_foreach_block ends up passing unscaled lengths
1584 * which would mean postcopy code would have to deal with target page)
1586 * @ms: current migration state
1588 static int postcopy_each_ram_send_discard(MigrationState *ms)
1590 struct RAMBlock *block;
1591 int ret;
1593 RAMBLOCK_FOREACH(block) {
1594 PostcopyDiscardState *pds =
1595 postcopy_discard_send_init(ms, block->idstr);
1598 * Postcopy sends chunks of bitmap over the wire, but it
1599 * just needs indexes at this point, avoids it having
1600 * target page specific code.
1602 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1603 postcopy_discard_send_finish(ms, pds);
1604 if (ret) {
1605 return ret;
1609 return 0;
1613 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1615 * Helper for postcopy_chunk_hostpages; it's called twice to
1616 * canonicalize the two bitmaps, which are similar but one is
1617 * inverted.
1619 * Postcopy requires that all target pages in a hostpage are dirty or
1620 * clean, not a mix. This function canonicalizes the bitmaps.
1622 * @ms: current migration state
1623 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1624 * otherwise we need to canonicalize partially dirty host pages
1625 * @block: block that contains the page we want to canonicalize
1626 * @pds: state for postcopy
1628 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1629 RAMBlock *block,
1630 PostcopyDiscardState *pds)
1632 RAMState *rs = &ram_state;
1633 unsigned long *bitmap = block->bmap;
1634 unsigned long *unsentmap = block->unsentmap;
1635 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1636 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1637 unsigned long run_start;
1639 if (block->page_size == TARGET_PAGE_SIZE) {
1640 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1641 return;
1644 if (unsent_pass) {
1645 /* Find a sent page */
1646 run_start = find_next_zero_bit(unsentmap, pages, 0);
1647 } else {
1648 /* Find a dirty page */
1649 run_start = find_next_bit(bitmap, pages, 0);
1652 while (run_start < pages) {
1653 bool do_fixup = false;
1654 unsigned long fixup_start_addr;
1655 unsigned long host_offset;
1658 * If the start of this run of pages is in the middle of a host
1659 * page, then we need to fixup this host page.
1661 host_offset = run_start % host_ratio;
1662 if (host_offset) {
1663 do_fixup = true;
1664 run_start -= host_offset;
1665 fixup_start_addr = run_start;
1666 /* For the next pass */
1667 run_start = run_start + host_ratio;
1668 } else {
1669 /* Find the end of this run */
1670 unsigned long run_end;
1671 if (unsent_pass) {
1672 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1673 } else {
1674 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1677 * If the end isn't at the start of a host page, then the
1678 * run doesn't finish at the end of a host page
1679 * and we need to discard.
1681 host_offset = run_end % host_ratio;
1682 if (host_offset) {
1683 do_fixup = true;
1684 fixup_start_addr = run_end - host_offset;
1686 * This host page has gone, the next loop iteration starts
1687 * from after the fixup
1689 run_start = fixup_start_addr + host_ratio;
1690 } else {
1692 * No discards on this iteration, next loop starts from
1693 * next sent/dirty page
1695 run_start = run_end + 1;
1699 if (do_fixup) {
1700 unsigned long page;
1702 /* Tell the destination to discard this page */
1703 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1704 /* For the unsent_pass we:
1705 * discard partially sent pages
1706 * For the !unsent_pass (dirty) we:
1707 * discard partially dirty pages that were sent
1708 * (any partially sent pages were already discarded
1709 * by the previous unsent_pass)
1711 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1712 host_ratio);
1715 /* Clean up the bitmap */
1716 for (page = fixup_start_addr;
1717 page < fixup_start_addr + host_ratio; page++) {
1718 /* All pages in this host page are now not sent */
1719 set_bit(page, unsentmap);
1722 * Remark them as dirty, updating the count for any pages
1723 * that weren't previously dirty.
1725 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1729 if (unsent_pass) {
1730 /* Find the next sent page for the next iteration */
1731 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1732 } else {
1733 /* Find the next dirty page for the next iteration */
1734 run_start = find_next_bit(bitmap, pages, run_start);
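/*
* Illustration (assuming 2 MiB host pages and 4 KiB target pages, so
* host_ratio == 512): if a run of dirty pages starts in the middle of a
* host page, the whole 512-page host page is discarded on the destination
* and re-marked dirty/unsent here, so postcopy never has to deal with a
* partially populated huge page.
*/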
1740 * postcopy_chunk_hostpages: discard any partially sent host page
1742 * Utility for the outgoing postcopy code.
1744 * Discard any partially sent host-page size chunks, mark any partially
1745 * dirty host-page size chunks as all dirty. In this case the host-page
1746 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1748 * Returns zero on success
1750 * @ms: current migration state
1751 * @block: block we want to work with
1753 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1755 PostcopyDiscardState *pds =
1756 postcopy_discard_send_init(ms, block->idstr);
1758 /* First pass: Discard all partially sent host pages */
1759 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1761 * Second pass: Ensure that all partially dirty host pages are made
1762 * fully dirty.
1764 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1766 postcopy_discard_send_finish(ms, pds);
1767 return 0;
1771 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1773 * Returns zero on success
1775 * Transmit the set of pages to be discarded after precopy to the target;
1776 * these are pages that:
1777 * a) Have been previously transmitted but are now dirty again
1778 * b) Pages that have never been transmitted, this ensures that
1779 * any pages on the destination that have been mapped by background
1780 * tasks get discarded (transparent huge pages is the specific concern)
1781 * Hopefully this is pretty sparse
1783 * @ms: current migration state
1785 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1787 RAMState *rs = &ram_state;
1788 RAMBlock *block;
1789 int ret;
1791 rcu_read_lock();
1793 /* This should be our last sync, the src is now paused */
1794 migration_bitmap_sync(rs);
1796 /* Easiest way to make sure we don't resume in the middle of a host-page */
1797 rs->last_seen_block = NULL;
1798 rs->last_sent_block = NULL;
1799 rs->last_page = 0;
1801 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1802 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1803 unsigned long *bitmap = block->bmap;
1804 unsigned long *unsentmap = block->unsentmap;
1806 if (!unsentmap) {
1807 /* We don't have a safe way to resize the sentmap, so
1808 * if the bitmap was resized it will be NULL at this
1809 * point.
1811 error_report("migration ram resized during precopy phase");
1812 rcu_read_unlock();
1813 return -EINVAL;
1815 /* Deal with TPS != HPS and huge pages */
1816 ret = postcopy_chunk_hostpages(ms, block);
1817 if (ret) {
1818 rcu_read_unlock();
1819 return ret;
1823 * Update the unsentmap to be unsentmap = unsentmap | dirty
1825 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1826 #ifdef DEBUG_POSTCOPY
1827 ram_debug_dump_bitmap(unsentmap, true, pages);
1828 #endif
1830 trace_ram_postcopy_send_discard_bitmap();
1832 ret = postcopy_each_ram_send_discard(ms);
1833 rcu_read_unlock();
1835 return ret;
1839 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1841 * Returns zero on success
1843 * @rbname: name of the RAMBlock of the request. NULL means the
1844 * same as the last one.
1845 * @start: RAMBlock starting page
1846 * @length: RAMBlock size
1848 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1850 int ret = -1;
1852 trace_ram_discard_range(rbname, start, length);
1854 rcu_read_lock();
1855 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1857 if (!rb) {
1858 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1859 goto err;
1862 ret = ram_block_discard_range(rb, start, length);
1864 err:
1865 rcu_read_unlock();
1867 return ret;
1870 static int ram_state_init(RAMState *rs)
1872 memset(rs, 0, sizeof(*rs));
1873 qemu_mutex_init(&rs->bitmap_mutex);
1874 qemu_mutex_init(&rs->src_page_req_mutex);
1875 QSIMPLEQ_INIT(&rs->src_page_requests);
1877 if (migrate_use_xbzrle()) {
1878 XBZRLE_cache_lock();
1879 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1880 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1881 TARGET_PAGE_SIZE,
1882 TARGET_PAGE_SIZE);
1883 if (!XBZRLE.cache) {
1884 XBZRLE_cache_unlock();
1885 error_report("Error creating cache");
1886 return -1;
1888 XBZRLE_cache_unlock();
1890 /* We prefer not to abort if there is no memory */
1891 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1892 if (!XBZRLE.encoded_buf) {
1893 error_report("Error allocating encoded_buf");
1894 return -1;
1897 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1898 if (!XBZRLE.current_buf) {
1899 error_report("Error allocating current_buf");
1900 g_free(XBZRLE.encoded_buf);
1901 XBZRLE.encoded_buf = NULL;
1902 return -1;
1906 /* For memory_global_dirty_log_start below. */
1907 qemu_mutex_lock_iothread();
1909 qemu_mutex_lock_ramlist();
1910 rcu_read_lock();
1911 ram_state_reset(rs);
1913 /* Skip setting bitmap if there is no RAM */
1914 if (ram_bytes_total()) {
1915 RAMBlock *block;
1917 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1918 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1920 block->bmap = bitmap_new(pages);
1921 bitmap_set(block->bmap, 0, pages);
1922 if (migrate_postcopy_ram()) {
1923 block->unsentmap = bitmap_new(pages);
1924 bitmap_set(block->unsentmap, 0, pages);
1930 * Count the total number of pages used by ram blocks not including any
1931 * gaps due to alignment or unplugs.
1933 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1935 memory_global_dirty_log_start();
1936 migration_bitmap_sync(rs);
1937 qemu_mutex_unlock_ramlist();
1938 qemu_mutex_unlock_iothread();
1939 rcu_read_unlock();
1941 return 0;
1945 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1946 * a long-running RCU critical section. When rcu-reclaims in the code
1947 * start to become numerous it will be necessary to reduce the
1948 * granularity of these critical sections.
1952 * ram_save_setup: Setup RAM for migration
1954 * Returns zero to indicate success and negative for error
1956 * @f: QEMUFile where to send the data
1957 * @opaque: RAMState pointer
1959 static int ram_save_setup(QEMUFile *f, void *opaque)
1961 RAMState *rs = opaque;
1962 RAMBlock *block;
1964 /* migration has already set up the bitmap, reuse it. */
1965 if (!migration_in_colo_state()) {
1966 if (ram_state_init(rs) < 0) {
1967 return -1;
1970 rs->f = f;
1972 rcu_read_lock();
1974 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1976 RAMBLOCK_FOREACH(block) {
1977 qemu_put_byte(f, strlen(block->idstr));
1978 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1979 qemu_put_be64(f, block->used_length);
1980 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1981 qemu_put_be64(f, block->page_size);
1985 rcu_read_unlock();
1987 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1988 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1990 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1992 return 0;
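/*
* The setup section emitted above is, in order: a be64 of ram_bytes_total()
* with RAM_SAVE_FLAG_MEM_SIZE set, then for every RAMBlock a one-byte idstr
* length, the idstr and a be64 used_length (plus a be64 page size when
* postcopy is in use and the block's page size differs from the host page
* size), terminated by RAM_SAVE_FLAG_EOS.
*/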
1996 * ram_save_iterate: iterative stage for migration
1998 * Returns zero to indicate success and negative for error
2000 * @f: QEMUFile where to send the data
2001 * @opaque: RAMState pointer
2003 static int ram_save_iterate(QEMUFile *f, void *opaque)
2005 RAMState *rs = opaque;
2006 int ret;
2007 int i;
2008 int64_t t0;
2009 int done = 0;
2011 rcu_read_lock();
2012 if (ram_list.version != rs->last_version) {
2013 ram_state_reset(rs);
2016 /* Read version before ram_list.blocks */
2017 smp_rmb();
2019 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2021 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2022 i = 0;
2023 while ((ret = qemu_file_rate_limit(f)) == 0) {
2024 int pages;
2026 pages = ram_find_and_save_block(rs, false);
2027 /* no more pages to send */
2028 if (pages == 0) {
2029 done = 1;
2030 break;
2032 rs->iterations++;
2034 /* we want to check in the 1st loop, just in case it was the 1st time
2035 and we had to sync the dirty bitmap.
2036 qemu_get_clock_ns() is a bit expensive, so we only check once every
2037 few iterations
2038 */
2039 if ((i & 63) == 0) {
2040 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2041 if (t1 > MAX_WAIT) {
2042 trace_ram_save_iterate_big_wait(t1, i);
2043 break;
2046 i++;
2048 flush_compressed_data(rs);
2049 rcu_read_unlock();
2052 * Must occur before EOS (or any QEMUFile operation)
2053 * because of RDMA protocol.
2055 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2057 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2058 rs->bytes_transferred += 8;
2060 ret = qemu_file_get_error(f);
2061 if (ret < 0) {
2062 return ret;
2065 return done;
2069 * ram_save_complete: function called to send the remaining amount of RAM
2071 * Returns zero to indicate success
2073 * Called with the iothread lock held
2075 * @f: QEMUFile where to send the data
2076 * @opaque: RAMState pointer
2078 static int ram_save_complete(QEMUFile *f, void *opaque)
2080 RAMState *rs = opaque;
2082 rcu_read_lock();
2084 if (!migration_in_postcopy()) {
2085 migration_bitmap_sync(rs);
2088 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2090 /* try transferring iterative blocks of memory */
2092 /* flush all remaining blocks regardless of rate limiting */
2093 while (true) {
2094 int pages;
2096 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2098 /* no more blocks to send */
2098 if (pages == 0) {
2099 break;
2103 flush_compressed_data(rs);
2104 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2106 rcu_read_unlock();
2108 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2110 return 0;
2113 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2114 uint64_t *non_postcopiable_pending,
2115 uint64_t *postcopiable_pending)
2117 RAMState *rs = opaque;
2118 uint64_t remaining_size;
2120 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2122 if (!migration_in_postcopy() &&
2123 remaining_size < max_size) {
2124 qemu_mutex_lock_iothread();
2125 rcu_read_lock();
2126 migration_bitmap_sync(rs);
2127 rcu_read_unlock();
2128 qemu_mutex_unlock_iothread();
2129 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2132 /* We can do postcopy, and all the data is postcopiable */
2133 *postcopiable_pending += remaining_size;
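/*
 * Worked example with illustrative numbers only: with a 4 KiB target page
 * and 100000 dirty pages, remaining_size is roughly 400 MB. Only when the
 * estimate is already below max_size (and we are not in postcopy) is the
 * bitmap re-synced under the iothread lock, so the figure handed back in
 * *postcopiable_pending is fresh for the migration core's completion
 * decision.
 */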
2136 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2138 unsigned int xh_len;
2139 int xh_flags;
2140 uint8_t *loaded_data;
2142 if (!xbzrle_decoded_buf) {
2143 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2145 loaded_data = xbzrle_decoded_buf;
2147 /* extract XBZRLE header */
2148 xh_flags = qemu_get_byte(f);
2149 xh_len = qemu_get_be16(f);
2151 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2152 error_report("Failed to load XBZRLE page - wrong compression!");
2153 return -1;
2156 if (xh_len > TARGET_PAGE_SIZE) {
2157 error_report("Failed to load XBZRLE page - len overflow!");
2158 return -1;
2160 /* load data and decode */
2161 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2163 /* decode XBZRLE */
2164 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2165 TARGET_PAGE_SIZE) == -1) {
2166 error_report("Failed to load XBZRLE page - decode error!");
2167 return -1;
2170 return 0;
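/*
 * For reference, a sketch of what load_xbzrle() consumes from the stream
 * for one page (the qemu_get_* calls above are authoritative):
 *
 *   u8    xh_flags   -- must equal ENCODING_FLAG_XBZRLE
 *   be16  xh_len     -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE data, decoded against the current
 *         contents of 'host'
 */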
2174 * ram_block_from_stream: read a RAMBlock id from the migration stream
2176 * Must be called from within a rcu critical section.
2178 * Returns a pointer from within the RCU-protected ram_list.
2180 * @f: QEMUFile where to read the data from
2181 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2183 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2185 static RAMBlock *block = NULL;
2186 char id[256];
2187 uint8_t len;
2189 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2190 if (!block) {
2191 error_report("Ack, bad migration stream!");
2192 return NULL;
2194 return block;
2197 len = qemu_get_byte(f);
2198 qemu_get_buffer(f, (uint8_t *)id, len);
2199 id[len] = 0;
2201 block = qemu_ram_block_by_name(id);
2202 if (!block) {
2203 error_report("Can't find block %s", id);
2204 return NULL;
2207 return block;
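/*
 * Descriptive note: RAM_SAVE_FLAG_CONTINUE means "same RAMBlock as the
 * previous page", so the (length, idstr) pair is only present in the
 * stream when that flag is absent; the cached 'block' pointer carries
 * over between calls, which is why a CONTINUE flag on the very first
 * page is reported as a bad migration stream.
 */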
2210 static inline void *host_from_ram_block_offset(RAMBlock *block,
2211 ram_addr_t offset)
2213 if (!offset_in_ramblock(block, offset)) {
2214 return NULL;
2217 return block->host + offset;
2221 * ram_handle_compressed: handle the zero page case
2223 * If a page (or a whole RDMA chunk) has been
2224 * determined to be zero, then zap it.
2226 * @host: host address for the zero page
2227 * @ch: what the page is filled from. We only support zero
2228 * @size: size of the zero page
2230 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2232 if (ch != 0 || !is_zero_range(host, size)) {
2233 memset(host, ch, size);
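/*
 * Descriptive note: the is_zero_range() test means a destination page
 * that is already zero is left untouched, so the memset() does not
 * needlessly dirty (and potentially allocate) memory on the receiving
 * side.
 */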
2237 static void *do_data_decompress(void *opaque)
2239 DecompressParam *param = opaque;
2240 unsigned long pagesize;
2241 uint8_t *des;
2242 int len;
2244 qemu_mutex_lock(&param->mutex);
2245 while (!param->quit) {
2246 if (param->des) {
2247 des = param->des;
2248 len = param->len;
2249 param->des = 0;
2250 qemu_mutex_unlock(&param->mutex);
2252 pagesize = TARGET_PAGE_SIZE;
2253 /* uncompress() can fail in some cases, especially
2254 * when the page is dirtied while it is being compressed. That is
2255 * not a problem, because the dirty page will be retransferred
2256 * and uncompress() won't break the data in other pages.
2258 uncompress((Bytef *)des, &pagesize,
2259 (const Bytef *)param->compbuf, len);
2261 qemu_mutex_lock(&decomp_done_lock);
2262 param->done = true;
2263 qemu_cond_signal(&decomp_done_cond);
2264 qemu_mutex_unlock(&decomp_done_lock);
2266 qemu_mutex_lock(&param->mutex);
2267 } else {
2268 qemu_cond_wait(&param->cond, &param->mutex);
2271 qemu_mutex_unlock(&param->mutex);
2273 return NULL;
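/*
 * Rough picture of the handshake above (a sketch, not a specification):
 * the feeder thread fills param->des/param->len under param->mutex and
 * signals param->cond; this worker clears param->des, runs uncompress()
 * outside the lock, then sets param->done under decomp_done_lock and
 * signals decomp_done_cond so another page can be queued to it.
 */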
2276 static void wait_for_decompress_done(void)
2278 int idx, thread_count;
2280 if (!migrate_use_compression()) {
2281 return;
2284 thread_count = migrate_decompress_threads();
2285 qemu_mutex_lock(&decomp_done_lock);
2286 for (idx = 0; idx < thread_count; idx++) {
2287 while (!decomp_param[idx].done) {
2288 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2291 qemu_mutex_unlock(&decomp_done_lock);
2294 void migrate_decompress_threads_create(void)
2296 int i, thread_count;
2298 thread_count = migrate_decompress_threads();
2299 decompress_threads = g_new0(QemuThread, thread_count);
2300 decomp_param = g_new0(DecompressParam, thread_count);
2301 qemu_mutex_init(&decomp_done_lock);
2302 qemu_cond_init(&decomp_done_cond);
2303 for (i = 0; i < thread_count; i++) {
2304 qemu_mutex_init(&decomp_param[i].mutex);
2305 qemu_cond_init(&decomp_param[i].cond);
2306 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2307 decomp_param[i].done = true;
2308 decomp_param[i].quit = false;
2309 qemu_thread_create(decompress_threads + i, "decompress",
2310 do_data_decompress, decomp_param + i,
2311 QEMU_THREAD_JOINABLE);
2315 void migrate_decompress_threads_join(void)
2317 int i, thread_count;
2319 thread_count = migrate_decompress_threads();
2320 for (i = 0; i < thread_count; i++) {
2321 qemu_mutex_lock(&decomp_param[i].mutex);
2322 decomp_param[i].quit = true;
2323 qemu_cond_signal(&decomp_param[i].cond);
2324 qemu_mutex_unlock(&decomp_param[i].mutex);
2326 for (i = 0; i < thread_count; i++) {
2327 qemu_thread_join(decompress_threads + i);
2328 qemu_mutex_destroy(&decomp_param[i].mutex);
2329 qemu_cond_destroy(&decomp_param[i].cond);
2330 g_free(decomp_param[i].compbuf);
2332 g_free(decompress_threads);
2333 g_free(decomp_param);
2334 decompress_threads = NULL;
2335 decomp_param = NULL;
2338 static void decompress_data_with_multi_threads(QEMUFile *f,
2339 void *host, int len)
2341 int idx, thread_count;
2343 thread_count = migrate_decompress_threads();
2344 qemu_mutex_lock(&decomp_done_lock);
2345 while (true) {
2346 for (idx = 0; idx < thread_count; idx++) {
2347 if (decomp_param[idx].done) {
2348 decomp_param[idx].done = false;
2349 qemu_mutex_lock(&decomp_param[idx].mutex);
2350 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2351 decomp_param[idx].des = host;
2352 decomp_param[idx].len = len;
2353 qemu_cond_signal(&decomp_param[idx].cond);
2354 qemu_mutex_unlock(&decomp_param[idx].mutex);
2355 break;
2358 if (idx < thread_count) {
2359 break;
2360 } else {
2361 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2364 qemu_mutex_unlock(&decomp_done_lock);
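/*
 * Descriptive summary: the dispatcher above looks for an idle worker
 * (done == true), hands it the compressed buffer and destination address,
 * and if all workers are busy it waits on decomp_done_cond until one of
 * them finishes; the page data itself is written by the worker thread,
 * not here.
 */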
2368 * ram_postcopy_incoming_init: allocate postcopy data structures
2370 * Returns 0 for success and negative on error
2372 * @mis: current migration incoming state
2374 * Allocate the data structures etc needed by incoming migration with
2375 * postcopy-ram. postcopy-ram's similarly named
2376 * postcopy_ram_incoming_init does the work.
2378 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2380 unsigned long ram_pages = last_ram_page();
2382 return postcopy_ram_incoming_init(mis, ram_pages);
2386 * ram_load_postcopy: load a page in postcopy case
2388 * Returns 0 for success or -errno in case of error
2390 * Called in postcopy mode by ram_load().
2391 * rcu_read_lock is taken prior to this being called.
2393 * @f: QEMUFile to read the data from
2395 static int ram_load_postcopy(QEMUFile *f)
2397 int flags = 0, ret = 0;
2398 bool place_needed = false;
2399 bool matching_page_sizes = false;
2400 MigrationIncomingState *mis = migration_incoming_get_current();
2401 /* Temporary page that is later 'placed' */
2402 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2403 void *last_host = NULL;
2404 bool all_zero = false;
2406 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2407 ram_addr_t addr;
2408 void *host = NULL;
2409 void *page_buffer = NULL;
2410 void *place_source = NULL;
2411 RAMBlock *block = NULL;
2412 uint8_t ch;
2414 addr = qemu_get_be64(f);
2415 flags = addr & ~TARGET_PAGE_MASK;
2416 addr &= TARGET_PAGE_MASK;
2418 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2419 place_needed = false;
2420 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2421 block = ram_block_from_stream(f, flags);
2423 host = host_from_ram_block_offset(block, addr);
2424 if (!host) {
2425 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2426 ret = -EINVAL;
2427 break;
2429 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2431 * Postcopy requires that we place whole host pages atomically;
2432 * these may be huge pages for RAMBlocks that are backed by
2433 * hugetlbfs.
2434 * To make it atomic, the data is read into a temporary page
2435 * that's moved into place later.
2436 * The migration protocol uses, possibly smaller, target pages;
2437 * however, the source ensures it always sends all the components
2438 * of a host page in order.
2440 page_buffer = postcopy_host_page +
2441 ((uintptr_t)host & (block->page_size - 1));
2442 /* If all target pages are zero then we can optimise the placement */
2443 if (!((uintptr_t)host & (block->page_size - 1))) {
2444 all_zero = true;
2445 } else {
2446 /* not the 1st target page within the host page */
2447 if (host != (last_host + TARGET_PAGE_SIZE)) {
2448 error_report("Non-sequential target page %p/%p",
2449 host, last_host);
2450 ret = -EINVAL;
2451 break;
2457 * If it's the last part of a host page then we place the host
2458 * page
2460 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2461 (block->page_size - 1)) == 0;
2462 place_source = postcopy_host_page;
2464 last_host = host;
2466 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2467 case RAM_SAVE_FLAG_ZERO:
2468 ch = qemu_get_byte(f);
2469 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2470 if (ch) {
2471 all_zero = false;
2473 break;
2475 case RAM_SAVE_FLAG_PAGE:
2476 all_zero = false;
2477 if (!place_needed || !matching_page_sizes) {
2478 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2479 } else {
2480 /* Avoids an extra copy out of the qemu_file buffer: postcopy is
2481 * going to copy the page into place later anyway, so we can
2482 * read it in place when the read is done in one go (matching page sizes)
2484 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2485 TARGET_PAGE_SIZE);
2487 break;
2488 case RAM_SAVE_FLAG_EOS:
2489 /* normal exit */
2490 break;
2491 default:
2492 error_report("Unknown combination of migration flags: %#x"
2493 " (postcopy mode)", flags);
2494 ret = -EINVAL;
2497 if (place_needed) {
2498 /* This gets called at the last target page in the host page */
2499 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2501 if (all_zero) {
2502 ret = postcopy_place_page_zero(mis, place_dest,
2503 block->page_size);
2504 } else {
2505 ret = postcopy_place_page(mis, place_dest,
2506 place_source, block->page_size);
2509 if (!ret) {
2510 ret = qemu_file_get_error(f);
2514 return ret;
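/*
 * Worked example with illustrative sizes: with 4 KiB target pages on a
 * RAMBlock backed by 2 MiB huge pages, 512 consecutive target pages are
 * accumulated in postcopy_host_page; only when the last of them has been
 * read is place_needed set and the whole huge page placed atomically,
 * either via the zero-page path or from the temporary buffer.
 */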
2517 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2519 int flags = 0, ret = 0;
2520 static uint64_t seq_iter;
2521 int len = 0;
2523 * If the system is running in postcopy mode, page inserts into host memory
2524 * must be atomic
2526 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2527 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2528 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2530 seq_iter++;
2532 if (version_id != 4) {
2533 ret = -EINVAL;
2536 /* This RCU critical section can be very long running.
2537 * When RCU reclaims in the code start to become numerous,
2538 * it will be necessary to reduce the granularity of this
2539 * critical section.
2541 rcu_read_lock();
2543 if (postcopy_running) {
2544 ret = ram_load_postcopy(f);
2547 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2548 ram_addr_t addr, total_ram_bytes;
2549 void *host = NULL;
2550 uint8_t ch;
2552 addr = qemu_get_be64(f);
2553 flags = addr & ~TARGET_PAGE_MASK;
2554 addr &= TARGET_PAGE_MASK;
2556 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2557 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2558 RAMBlock *block = ram_block_from_stream(f, flags);
2560 host = host_from_ram_block_offset(block, addr);
2561 if (!host) {
2562 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2563 ret = -EINVAL;
2564 break;
2566 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2569 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2570 case RAM_SAVE_FLAG_MEM_SIZE:
2571 /* Synchronize RAM block list */
2572 total_ram_bytes = addr;
2573 while (!ret && total_ram_bytes) {
2574 RAMBlock *block;
2575 char id[256];
2576 ram_addr_t length;
2578 len = qemu_get_byte(f);
2579 qemu_get_buffer(f, (uint8_t *)id, len);
2580 id[len] = 0;
2581 length = qemu_get_be64(f);
2583 block = qemu_ram_block_by_name(id);
2584 if (block) {
2585 if (length != block->used_length) {
2586 Error *local_err = NULL;
2588 ret = qemu_ram_resize(block, length,
2589 &local_err);
2590 if (local_err) {
2591 error_report_err(local_err);
2594 /* For postcopy we need to check hugepage sizes match */
2595 if (postcopy_advised &&
2596 block->page_size != qemu_host_page_size) {
2597 uint64_t remote_page_size = qemu_get_be64(f);
2598 if (remote_page_size != block->page_size) {
2599 error_report("Mismatched RAM page size %s "
2600 "(local) %zd != %" PRId64,
2601 id, block->page_size,
2602 remote_page_size);
2603 ret = -EINVAL;
2606 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2607 block->idstr);
2608 } else {
2609 error_report("Unknown ramblock \"%s\", cannot "
2610 "accept migration", id);
2611 ret = -EINVAL;
2614 total_ram_bytes -= length;
2616 break;
2618 case RAM_SAVE_FLAG_ZERO:
2619 ch = qemu_get_byte(f);
2620 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2621 break;
2623 case RAM_SAVE_FLAG_PAGE:
2624 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2625 break;
2627 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2628 len = qemu_get_be32(f);
2629 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2630 error_report("Invalid compressed data length: %d", len);
2631 ret = -EINVAL;
2632 break;
2634 decompress_data_with_multi_threads(f, host, len);
2635 break;
2637 case RAM_SAVE_FLAG_XBZRLE:
2638 if (load_xbzrle(f, addr, host) < 0) {
2639 error_report("Failed to decompress XBZRLE page at "
2640 RAM_ADDR_FMT, addr);
2641 ret = -EINVAL;
2642 break;
2644 break;
2645 case RAM_SAVE_FLAG_EOS:
2646 /* normal exit */
2647 break;
2648 default:
2649 if (flags & RAM_SAVE_FLAG_HOOK) {
2650 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2651 } else {
2652 error_report("Unknown combination of migration flags: %#x",
2653 flags);
2654 ret = -EINVAL;
2657 if (!ret) {
2658 ret = qemu_file_get_error(f);
2662 wait_for_decompress_done();
2663 rcu_read_unlock();
2664 trace_ram_load_complete(ret, seq_iter);
2665 return ret;
2668 static SaveVMHandlers savevm_ram_handlers = {
2669 .save_live_setup = ram_save_setup,
2670 .save_live_iterate = ram_save_iterate,
2671 .save_live_complete_postcopy = ram_save_complete,
2672 .save_live_complete_precopy = ram_save_complete,
2673 .save_live_pending = ram_save_pending,
2674 .load_state = ram_load,
2675 .cleanup = ram_migration_cleanup,
2678 void ram_mig_init(void)
2680 qemu_mutex_init(&XBZRLE.lock);
2681 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
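/*
 * Note: the version number registered here (4) is the same value that
 * ram_load() checks above; changing one without the other would make
 * incoming migration fail with -EINVAL.
 */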