migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46 #include "migration/colo.h"
  47
  48 /***********************************************************/
  49 /* ram save/restore */
  50
  51 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  52  * worked for pages that where filled with the same char.  We switched
  53  * it to only search for the zero value.  And to avoid confusion with
  54  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  55  */
  56
  57 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  58 #define RAM_SAVE_FLAG_ZERO     0x02
  59 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  60 #define RAM_SAVE_FLAG_PAGE     0x08
  61 #define RAM_SAVE_FLAG_EOS      0x10
  62 #define RAM_SAVE_FLAG_CONTINUE 0x20
  63 #define RAM_SAVE_FLAG_XBZRLE   0x40
  64 /* 0x80 is reserved in migration.h start with 0x100 next */
  65 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  66
  67 static uint8_t *ZERO_TARGET_PAGE;
  68
  69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  70 {
  71     return buffer_is_zero(p, size);
  72 }
  73
  74 /* struct contains XBZRLE cache and a static page
  75    used by the compression */
  76 static struct {
  77     /* buffer used for XBZRLE encoding */
  78     uint8_t *encoded_buf;
  79     /* buffer for storing page content */
  80     uint8_t *current_buf;
  81     /* Cache for XBZRLE, Protected by lock. */
  82     PageCache *cache;
  83     QemuMutex lock;
  84 } XBZRLE;
  85
  86 /* buffer used for XBZRLE decoding */
  87 static uint8_t *xbzrle_decoded_buf;
  88
  89 static void XBZRLE_cache_lock(void)
  90 {
  91     if (migrate_use_xbzrle())
  92         qemu_mutex_lock(&XBZRLE.lock);
  93 }
  94
  95 static void XBZRLE_cache_unlock(void)
  96 {
  97     if (migrate_use_xbzrle())
  98         qemu_mutex_unlock(&XBZRLE.lock);
  99 }
 100
 101 /**
 102  * xbzrle_cache_resize: resize the xbzrle cache
 103  *
 104  * This function is called from qmp_migrate_set_cache_size in main
 105  * thread, possibly while a migration is in progress.  A running
 106  * migration may be using the cache and might finish during this call,
 107  * hence changes to the cache are protected by XBZRLE.lock().
 108  *
 109  * Returns the new_size or negative in case of error.
 110  *
 111  * @new_size: new cache size
 112  */
 113 int64_t xbzrle_cache_resize(int64_t new_size)
 114 {
 115     PageCache *new_cache;
 116     int64_t ret;
 117
 118     if (new_size < TARGET_PAGE_SIZE) {
 119         return -1;
 120     }
 121
 122     XBZRLE_cache_lock();
 123
 124     if (XBZRLE.cache != NULL) {
 125         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 126             goto out_new_size;
 127         }
 128         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 129                                         TARGET_PAGE_SIZE);
 130         if (!new_cache) {
 131             error_report("Error creating cache");
 132             ret = -1;
 133             goto out;
 134         }
 135
 136         cache_fini(XBZRLE.cache);
 137         XBZRLE.cache = new_cache;
 138     }
 139
 140 out_new_size:
 141     ret = pow2floor(new_size);
 142 out:
 143     XBZRLE_cache_unlock();
 144     return ret;
 145 }
 146
 147 /*
 148  * An outstanding page request, on the source, having been received
 149  * and queued
 150  */
 151 struct RAMSrcPageRequest {
 152     RAMBlock *rb;
 153     hwaddr    offset;
 154     hwaddr    len;
 155
 156     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 157 };
 158
 159 /* State of RAM for migration */
 160 struct RAMState {
 161     /* QEMUFile used for this migration */
 162     QEMUFile *f;
 163     /* Last block that we have visited searching for dirty pages */
 164     RAMBlock *last_seen_block;
 165     /* Last block from where we have sent data */
 166     RAMBlock *last_sent_block;
 167     /* Last dirty target page we have sent */
 168     ram_addr_t last_page;
 169     /* last ram version we have seen */
 170     uint32_t last_version;
 171     /* We are in the first round */
 172     bool ram_bulk_stage;
 173     /* How many times we have dirty too many pages */
 174     int dirty_rate_high_cnt;
 175     /* How many times we have synchronized the bitmap */
 176     uint64_t bitmap_sync_count;
 177     /* these variables are used for bitmap sync */
 178     /* last time we did a full bitmap_sync */
 179     int64_t time_last_bitmap_sync;
 180     /* bytes transferred at start_time */
 181     uint64_t bytes_xfer_prev;
 182     /* number of dirty pages since start_time */
 183     uint64_t num_dirty_pages_period;
 184     /* xbzrle misses since the beginning of the period */
 185     uint64_t xbzrle_cache_miss_prev;
 186     /* number of iterations at the beginning of period */
 187     uint64_t iterations_prev;
 188     /* Accounting fields */
 189     /* number of zero pages.  It used to be pages filled by the same char. */
 190     uint64_t zero_pages;
 191     /* number of normal transferred pages */
 192     uint64_t norm_pages;
 193     /* Iterations since start */
 194     uint64_t iterations;
 195     /* xbzrle transmitted bytes.  Notice that this is with
 196      * compression, they can't be calculated from the pages */
 197     uint64_t xbzrle_bytes;
 198     /* xbzrle transmmited pages */
 199     uint64_t xbzrle_pages;
 200     /* xbzrle number of cache miss */
 201     uint64_t xbzrle_cache_miss;
 202     /* xbzrle miss rate */
 203     double xbzrle_cache_miss_rate;
 204     /* xbzrle number of overflows */
 205     uint64_t xbzrle_overflows;
 206     /* number of dirty bits in the bitmap */
 207     uint64_t migration_dirty_pages;
 208     /* total number of bytes transferred */
 209     uint64_t bytes_transferred;
 210     /* number of dirtied pages in the last second */
 211     uint64_t dirty_pages_rate;
 212     /* Count of requests incoming from destination */
 213     uint64_t postcopy_requests;
 214     /* protects modification of the bitmap */
 215     QemuMutex bitmap_mutex;
 216     /* The RAMBlock used in the last src_page_requests */
 217     RAMBlock *last_req_rb;
 218     /* Queue of outstanding page requests from the destination */
 219     QemuMutex src_page_req_mutex;
 220     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 221 };
 222 typedef struct RAMState RAMState;
 223
 224 static RAMState ram_state;
 225
 226 uint64_t dup_mig_pages_transferred(void)
 227 {
 228     return ram_state.zero_pages;
 229 }
 230
 231 uint64_t norm_mig_pages_transferred(void)
 232 {
 233     return ram_state.norm_pages;
 234 }
 235
 236 uint64_t xbzrle_mig_bytes_transferred(void)
 237 {
 238     return ram_state.xbzrle_bytes;
 239 }
 240
 241 uint64_t xbzrle_mig_pages_transferred(void)
 242 {
 243     return ram_state.xbzrle_pages;
 244 }
 245
 246 uint64_t xbzrle_mig_pages_cache_miss(void)
 247 {
 248     return ram_state.xbzrle_cache_miss;
 249 }
 250
 251 double xbzrle_mig_cache_miss_rate(void)
 252 {
 253     return ram_state.xbzrle_cache_miss_rate;
 254 }
 255
 256 uint64_t xbzrle_mig_pages_overflow(void)
 257 {
 258     return ram_state.xbzrle_overflows;
 259 }
 260
 261 uint64_t ram_bytes_transferred(void)
 262 {
 263     return ram_state.bytes_transferred;
 264 }
 265
 266 uint64_t ram_bytes_remaining(void)
 267 {
 268     return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
 269 }
 270
 271 uint64_t ram_dirty_sync_count(void)
 272 {
 273     return ram_state.bitmap_sync_count;
 274 }
 275
 276 uint64_t ram_dirty_pages_rate(void)
 277 {
 278     return ram_state.dirty_pages_rate;
 279 }
 280
 281 uint64_t ram_postcopy_requests(void)
 282 {
 283     return ram_state.postcopy_requests;
 284 }
 285
 286 /* used by the search for pages to send */
 287 struct PageSearchStatus {
 288     /* Current block being searched */
 289     RAMBlock    *block;
 290     /* Current page to search from */
 291     unsigned long page;
 292     /* Set once we wrap around */
 293     bool         complete_round;
 294 };
 295 typedef struct PageSearchStatus PageSearchStatus;
 296
 297 struct CompressParam {
 298     bool done;
 299     bool quit;
 300     QEMUFile *file;
 301     QemuMutex mutex;
 302     QemuCond cond;
 303     RAMBlock *block;
 304     ram_addr_t offset;
 305 };
 306 typedef struct CompressParam CompressParam;
 307
 308 struct DecompressParam {
 309     bool done;
 310     bool quit;
 311     QemuMutex mutex;
 312     QemuCond cond;
 313     void *des;
 314     uint8_t *compbuf;
 315     int len;
 316 };
 317 typedef struct DecompressParam DecompressParam;
 318
 319 static CompressParam *comp_param;
 320 static QemuThread *compress_threads;
 321 /* comp_done_cond is used to wake up the migration thread when
 322  * one of the compression threads has finished the compression.
 323  * comp_done_lock is used to co-work with comp_done_cond.
 324  */
 325 static QemuMutex comp_done_lock;
 326 static QemuCond comp_done_cond;
 327 /* The empty QEMUFileOps will be used by file in CompressParam */
 328 static const QEMUFileOps empty_ops = { };
 329
 330 static DecompressParam *decomp_param;
 331 static QemuThread *decompress_threads;
 332 static QemuMutex decomp_done_lock;
 333 static QemuCond decomp_done_cond;
 334
 335 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 336                                 ram_addr_t offset);
 337
 338 static void *do_data_compress(void *opaque)
 339 {
 340     CompressParam *param = opaque;
 341     RAMBlock *block;
 342     ram_addr_t offset;
 343
 344     qemu_mutex_lock(&param->mutex);
 345     while (!param->quit) {
 346         if (param->block) {
 347             block = param->block;
 348             offset = param->offset;
 349             param->block = NULL;
 350             qemu_mutex_unlock(&param->mutex);
 351
 352             do_compress_ram_page(param->file, block, offset);
 353
 354             qemu_mutex_lock(&comp_done_lock);
 355             param->done = true;
 356             qemu_cond_signal(&comp_done_cond);
 357             qemu_mutex_unlock(&comp_done_lock);
 358
 359             qemu_mutex_lock(&param->mutex);
 360         } else {
 361             qemu_cond_wait(&param->cond, &param->mutex);
 362         }
 363     }
 364     qemu_mutex_unlock(&param->mutex);
 365
 366     return NULL;
 367 }
 368
 369 static inline void terminate_compression_threads(void)
 370 {
 371     int idx, thread_count;
 372
 373     thread_count = migrate_compress_threads();
 374
 375     for (idx = 0; idx < thread_count; idx++) {
 376         qemu_mutex_lock(&comp_param[idx].mutex);
 377         comp_param[idx].quit = true;
 378         qemu_cond_signal(&comp_param[idx].cond);
 379         qemu_mutex_unlock(&comp_param[idx].mutex);
 380     }
 381 }
 382
 383 void migrate_compress_threads_join(void)
 384 {
 385     int i, thread_count;
 386
 387     if (!migrate_use_compression()) {
 388         return;
 389     }
 390     terminate_compression_threads();
 391     thread_count = migrate_compress_threads();
 392     for (i = 0; i < thread_count; i++) {
 393         qemu_thread_join(compress_threads + i);
 394         qemu_fclose(comp_param[i].file);
 395         qemu_mutex_destroy(&comp_param[i].mutex);
 396         qemu_cond_destroy(&comp_param[i].cond);
 397     }
 398     qemu_mutex_destroy(&comp_done_lock);
 399     qemu_cond_destroy(&comp_done_cond);
 400     g_free(compress_threads);
 401     g_free(comp_param);
 402     compress_threads = NULL;
 403     comp_param = NULL;
 404 }
 405
 406 void migrate_compress_threads_create(void)
 407 {
 408     int i, thread_count;
 409
 410     if (!migrate_use_compression()) {
 411         return;
 412     }
 413     thread_count = migrate_compress_threads();
 414     compress_threads = g_new0(QemuThread, thread_count);
 415     comp_param = g_new0(CompressParam, thread_count);
 416     qemu_cond_init(&comp_done_cond);
 417     qemu_mutex_init(&comp_done_lock);
 418     for (i = 0; i < thread_count; i++) {
 419         /* comp_param[i].file is just used as a dummy buffer to save data,
 420          * set its ops to empty.
 421          */
 422         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 423         comp_param[i].done = true;
 424         comp_param[i].quit = false;
 425         qemu_mutex_init(&comp_param[i].mutex);
 426         qemu_cond_init(&comp_param[i].cond);
 427         qemu_thread_create(compress_threads + i, "compress",
 428                            do_data_compress, comp_param + i,
 429                            QEMU_THREAD_JOINABLE);
 430     }
 431 }
 432
 433 /**
 434  * save_page_header: write page header to wire
 435  *
 436  * If this is the 1st block, it also writes the block identification
 437  *
 438  * Returns the number of bytes written
 439  *
 440  * @f: QEMUFile where to send the data
 441  * @block: block that contains the page we want to send
 442  * @offset: offset inside the block for the page
 443  *          in the lower bits, it contains flags
 444  */
 445 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 446                                ram_addr_t offset)
 447 {
 448     size_t size, len;
 449
 450     if (block == rs->last_sent_block) {
 451         offset |= RAM_SAVE_FLAG_CONTINUE;
 452     }
 453     qemu_put_be64(f, offset);
 454     size = 8;
 455
 456     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 457         len = strlen(block->idstr);
 458         qemu_put_byte(f, len);
 459         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 460         size += 1 + len;
 461         rs->last_sent_block = block;
 462     }
 463     return size;
 464 }
 465
 466 /**
 467  * mig_throttle_guest_down: throotle down the guest
 468  *
 469  * Reduce amount of guest cpu execution to hopefully slow down memory
 470  * writes. If guest dirty memory rate is reduced below the rate at
 471  * which we can transfer pages to the destination then we should be
 472  * able to complete migration. Some workloads dirty memory way too
 473  * fast and will not effectively converge, even with auto-converge.
 474  */
 475 static void mig_throttle_guest_down(void)
 476 {
 477     MigrationState *s = migrate_get_current();
 478     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 479     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 480
 481     /* We have not started throttling yet. Let's start it. */
 482     if (!cpu_throttle_active()) {
 483         cpu_throttle_set(pct_initial);
 484     } else {
 485         /* Throttling already on, just increase the rate */
 486         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 487     }
 488 }
 489
 490 /**
 491  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 492  *
 493  * @rs: current RAM state
 494  * @current_addr: address for the zero page
 495  *
 496  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 497  * The important thing is that a stale (not-yet-0'd) page be replaced
 498  * by the new data.
 499  * As a bonus, if the page wasn't in the cache it gets added so that
 500  * when a small write is made into the 0'd page it gets XBZRLE sent.
 501  */
 502 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 503 {
 504     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 505         return;
 506     }
 507
 508     /* We don't care if this fails to allocate a new cache page
 509      * as long as it updated an old one */
 510     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 511                  rs->bitmap_sync_count);
 512 }
 513
 514 #define ENCODING_FLAG_XBZRLE 0x1
 515
 516 /**
 517  * save_xbzrle_page: compress and send current page
 518  *
 519  * Returns: 1 means that we wrote the page
 520  *          0 means that page is identical to the one already sent
 521  *          -1 means that xbzrle would be longer than normal
 522  *
 523  * @rs: current RAM state
 524  * @current_data: pointer to the address of the page contents
 525  * @current_addr: addr of the page
 526  * @block: block that contains the page we want to send
 527  * @offset: offset inside the block for the page
 528  * @last_stage: if we are at the completion stage
 529  */
 530 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 531                             ram_addr_t current_addr, RAMBlock *block,
 532                             ram_addr_t offset, bool last_stage)
 533 {
 534     int encoded_len = 0, bytes_xbzrle;
 535     uint8_t *prev_cached_page;
 536
 537     if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
 538         rs->xbzrle_cache_miss++;
 539         if (!last_stage) {
 540             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 541                              rs->bitmap_sync_count) == -1) {
 542                 return -1;
 543             } else {
 544                 /* update *current_data when the page has been
 545                    inserted into cache */
 546                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 547             }
 548         }
 549         return -1;
 550     }
 551
 552     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 553
 554     /* save current buffer into memory */
 555     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 556
 557     /* XBZRLE encoding (if there is no overflow) */
 558     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 559                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 560                                        TARGET_PAGE_SIZE);
 561     if (encoded_len == 0) {
 562         trace_save_xbzrle_page_skipping();
 563         return 0;
 564     } else if (encoded_len == -1) {
 565         trace_save_xbzrle_page_overflow();
 566         rs->xbzrle_overflows++;
 567         /* update data in the cache */
 568         if (!last_stage) {
 569             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 570             *current_data = prev_cached_page;
 571         }
 572         return -1;
 573     }
 574
 575     /* we need to update the data in the cache, in order to get the same data */
 576     if (!last_stage) {
 577         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 578     }
 579
 580     /* Send XBZRLE based compressed page */
 581     bytes_xbzrle = save_page_header(rs, rs->f, block,
 582                                     offset | RAM_SAVE_FLAG_XBZRLE);
 583     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 584     qemu_put_be16(rs->f, encoded_len);
 585     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 586     bytes_xbzrle += encoded_len + 1 + 2;
 587     rs->xbzrle_pages++;
 588     rs->xbzrle_bytes += bytes_xbzrle;
 589     rs->bytes_transferred += bytes_xbzrle;
 590
 591     return 1;
 592 }
 593
 594 /**
 595  * migration_bitmap_find_dirty: find the next dirty page from start
 596  *
 597  * Called with rcu_read_lock() to protect migration_bitmap
 598  *
 599  * Returns the byte offset within memory region of the start of a dirty page
 600  *
 601  * @rs: current RAM state
 602  * @rb: RAMBlock where to search for dirty pages
 603  * @start: page where we start the search
 604  */
 605 static inline
 606 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 607                                           unsigned long start)
 608 {
 609     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 610     unsigned long *bitmap = rb->bmap;
 611     unsigned long next;
 612
 613     if (rs->ram_bulk_stage && start > 0) {
 614         next = start + 1;
 615     } else {
 616         next = find_next_bit(bitmap, size, start);
 617     }
 618
 619     return next;
 620 }
 621
 622 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 623                                                 RAMBlock *rb,
 624                                                 unsigned long page)
 625 {
 626     bool ret;
 627
 628     ret = test_and_clear_bit(page, rb->bmap);
 629
 630     if (ret) {
 631         rs->migration_dirty_pages--;
 632     }
 633     return ret;
 634 }
 635
 636 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 637                                         ram_addr_t start, ram_addr_t length)
 638 {
 639     rs->migration_dirty_pages +=
 640         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 641                                               &rs->num_dirty_pages_period);
 642 }
 643
 644 /**
 645  * ram_pagesize_summary: calculate all the pagesizes of a VM
 646  *
 647  * Returns a summary bitmap of the page sizes of all RAMBlocks
 648  *
 649  * For VMs with just normal pages this is equivalent to the host page
 650  * size. If it's got some huge pages then it's the OR of all the
 651  * different page sizes.
 652  */
 653 uint64_t ram_pagesize_summary(void)
 654 {
 655     RAMBlock *block;
 656     uint64_t summary = 0;
 657
 658     RAMBLOCK_FOREACH(block) {
 659         summary |= block->page_size;
 660     }
 661
 662     return summary;
 663 }
 664
 665 static void migration_bitmap_sync(RAMState *rs)
 666 {
 667     RAMBlock *block;
 668     int64_t end_time;
 669     uint64_t bytes_xfer_now;
 670
 671     rs->bitmap_sync_count++;
 672
 673     if (!rs->bytes_xfer_prev) {
 674         rs->bytes_xfer_prev = ram_bytes_transferred();
 675     }
 676
 677     if (!rs->time_last_bitmap_sync) {
 678         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 679     }
 680
 681     trace_migration_bitmap_sync_start();
 682     memory_global_dirty_log_sync();
 683
 684     qemu_mutex_lock(&rs->bitmap_mutex);
 685     rcu_read_lock();
 686     RAMBLOCK_FOREACH(block) {
 687         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 688     }
 689     rcu_read_unlock();
 690     qemu_mutex_unlock(&rs->bitmap_mutex);
 691
 692     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 693
 694     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 695
 696     /* more than 1 second = 1000 millisecons */
 697     if (end_time > rs->time_last_bitmap_sync + 1000) {
 698         if (migrate_auto_converge()) {
 699             /* The following detection logic can be refined later. For now:
 700                Check to see if the dirtied bytes is 50% more than the approx.
 701                amount of bytes that just got transferred since the last time we
 702                were in this routine. If that happens twice, start or increase
 703                throttling */
 704             bytes_xfer_now = ram_bytes_transferred();
 705
 706             if (rs->dirty_pages_rate &&
 707                (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 708                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 709                (rs->dirty_rate_high_cnt++ >= 2)) {
 710                     trace_migration_throttle();
 711                     rs->dirty_rate_high_cnt = 0;
 712                     mig_throttle_guest_down();
 713              }
 714              rs->bytes_xfer_prev = bytes_xfer_now;
 715         }
 716
 717         if (migrate_use_xbzrle()) {
 718             if (rs->iterations_prev != rs->iterations) {
 719                 rs->xbzrle_cache_miss_rate =
 720                    (double)(rs->xbzrle_cache_miss -
 721                             rs->xbzrle_cache_miss_prev) /
 722                    (rs->iterations - rs->iterations_prev);
 723             }
 724             rs->iterations_prev = rs->iterations;
 725             rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
 726         }
 727         rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
 728             / (end_time - rs->time_last_bitmap_sync);
 729         rs->time_last_bitmap_sync = end_time;
 730         rs->num_dirty_pages_period = 0;
 731     }
 732     if (migrate_use_events()) {
 733         qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
 734     }
 735 }
 736
 737 /**
 738  * save_zero_page: send the zero page to the stream
 739  *
 740  * Returns the number of pages written.
 741  *
 742  * @rs: current RAM state
 743  * @block: block that contains the page we want to send
 744  * @offset: offset inside the block for the page
 745  * @p: pointer to the page
 746  */
 747 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 748                           uint8_t *p)
 749 {
 750     int pages = -1;
 751
 752     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 753         rs->zero_pages++;
 754         rs->bytes_transferred +=
 755             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 756         qemu_put_byte(rs->f, 0);
 757         rs->bytes_transferred += 1;
 758         pages = 1;
 759     }
 760
 761     return pages;
 762 }
 763
 764 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 765 {
 766     if (!migrate_release_ram() || !migration_in_postcopy()) {
 767         return;
 768     }
 769
 770     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 771 }
 772
 773 /**
 774  * ram_save_page: send the given page to the stream
 775  *
 776  * Returns the number of pages written.
 777  *          < 0 - error
 778  *          >=0 - Number of pages written - this might legally be 0
 779  *                if xbzrle noticed the page was the same.
 780  *
 781  * @rs: current RAM state
 782  * @block: block that contains the page we want to send
 783  * @offset: offset inside the block for the page
 784  * @last_stage: if we are at the completion stage
 785  */
 786 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 787 {
 788     int pages = -1;
 789     uint64_t bytes_xmit;
 790     ram_addr_t current_addr;
 791     uint8_t *p;
 792     int ret;
 793     bool send_async = true;
 794     RAMBlock *block = pss->block;
 795     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 796
 797     p = block->host + offset;
 798     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 799
 800     /* In doubt sent page as normal */
 801     bytes_xmit = 0;
 802     ret = ram_control_save_page(rs->f, block->offset,
 803                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 804     if (bytes_xmit) {
 805         rs->bytes_transferred += bytes_xmit;
 806         pages = 1;
 807     }
 808
 809     XBZRLE_cache_lock();
 810
 811     current_addr = block->offset + offset;
 812
 813     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 814         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 815             if (bytes_xmit > 0) {
 816                 rs->norm_pages++;
 817             } else if (bytes_xmit == 0) {
 818                 rs->zero_pages++;
 819             }
 820         }
 821     } else {
 822         pages = save_zero_page(rs, block, offset, p);
 823         if (pages > 0) {
 824             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 825              * page would be stale
 826              */
 827             xbzrle_cache_zero_page(rs, current_addr);
 828             ram_release_pages(block->idstr, offset, pages);
 829         } else if (!rs->ram_bulk_stage &&
 830                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 831             pages = save_xbzrle_page(rs, &p, current_addr, block,
 832                                      offset, last_stage);
 833             if (!last_stage) {
 834                 /* Can't send this cached data async, since the cache page
 835                  * might get updated before it gets to the wire
 836                  */
 837                 send_async = false;
 838             }
 839         }
 840     }
 841
 842     /* XBZRLE overflow or normal page */
 843     if (pages == -1) {
 844         rs->bytes_transferred += save_page_header(rs, rs->f, block,
 845                                                   offset | RAM_SAVE_FLAG_PAGE);
 846         if (send_async) {
 847             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 848                                   migrate_release_ram() &
 849                                   migration_in_postcopy());
 850         } else {
 851             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 852         }
 853         rs->bytes_transferred += TARGET_PAGE_SIZE;
 854         pages = 1;
 855         rs->norm_pages++;
 856     }
 857
 858     XBZRLE_cache_unlock();
 859
 860     return pages;
 861 }
 862
 863 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 864                                 ram_addr_t offset)
 865 {
 866     RAMState *rs = &ram_state;
 867     int bytes_sent, blen;
 868     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 869
 870     bytes_sent = save_page_header(rs, f, block, offset |
 871                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 872     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 873                                      migrate_compress_level());
 874     if (blen < 0) {
 875         bytes_sent = 0;
 876         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 877         error_report("compressed data failed!");
 878     } else {
 879         bytes_sent += blen;
 880         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 881     }
 882
 883     return bytes_sent;
 884 }
 885
 886 static void flush_compressed_data(RAMState *rs)
 887 {
 888     int idx, len, thread_count;
 889
 890     if (!migrate_use_compression()) {
 891         return;
 892     }
 893     thread_count = migrate_compress_threads();
 894
 895     qemu_mutex_lock(&comp_done_lock);
 896     for (idx = 0; idx < thread_count; idx++) {
 897         while (!comp_param[idx].done) {
 898             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 899         }
 900     }
 901     qemu_mutex_unlock(&comp_done_lock);
 902
 903     for (idx = 0; idx < thread_count; idx++) {
 904         qemu_mutex_lock(&comp_param[idx].mutex);
 905         if (!comp_param[idx].quit) {
 906             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 907             rs->bytes_transferred += len;
 908         }
 909         qemu_mutex_unlock(&comp_param[idx].mutex);
 910     }
 911 }
 912
 913 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 914                                        ram_addr_t offset)
 915 {
 916     param->block = block;
 917     param->offset = offset;
 918 }
 919
 920 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 921                                            ram_addr_t offset)
 922 {
 923     int idx, thread_count, bytes_xmit = -1, pages = -1;
 924
 925     thread_count = migrate_compress_threads();
 926     qemu_mutex_lock(&comp_done_lock);
 927     while (true) {
 928         for (idx = 0; idx < thread_count; idx++) {
 929             if (comp_param[idx].done) {
 930                 comp_param[idx].done = false;
 931                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 932                 qemu_mutex_lock(&comp_param[idx].mutex);
 933                 set_compress_params(&comp_param[idx], block, offset);
 934                 qemu_cond_signal(&comp_param[idx].cond);
 935                 qemu_mutex_unlock(&comp_param[idx].mutex);
 936                 pages = 1;
 937                 rs->norm_pages++;
 938                 rs->bytes_transferred += bytes_xmit;
 939                 break;
 940             }
 941         }
 942         if (pages > 0) {
 943             break;
 944         } else {
 945             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 946         }
 947     }
 948     qemu_mutex_unlock(&comp_done_lock);
 949
 950     return pages;
 951 }
 952
 953 /**
 954  * ram_save_compressed_page: compress the given page and send it to the stream
 955  *
 956  * Returns the number of pages written.
 957  *
 958  * @rs: current RAM state
 959  * @block: block that contains the page we want to send
 960  * @offset: offset inside the block for the page
 961  * @last_stage: if we are at the completion stage
 962  */
 963 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 964                                     bool last_stage)
 965 {
 966     int pages = -1;
 967     uint64_t bytes_xmit = 0;
 968     uint8_t *p;
 969     int ret, blen;
 970     RAMBlock *block = pss->block;
 971     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 972
 973     p = block->host + offset;
 974
 975     ret = ram_control_save_page(rs->f, block->offset,
 976                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 977     if (bytes_xmit) {
 978         rs->bytes_transferred += bytes_xmit;
 979         pages = 1;
 980     }
 981     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 982         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 983             if (bytes_xmit > 0) {
 984                 rs->norm_pages++;
 985             } else if (bytes_xmit == 0) {
 986                 rs->zero_pages++;
 987             }
 988         }
 989     } else {
 990         /* When starting the process of a new block, the first page of
 991          * the block should be sent out before other pages in the same
 992          * block, and all the pages in last block should have been sent
 993          * out, keeping this order is important, because the 'cont' flag
 994          * is used to avoid resending the block name.
 995          */
 996         if (block != rs->last_sent_block) {
 997             flush_compressed_data(rs);
 998             pages = save_zero_page(rs, block, offset, p);
 999             if (pages == -1) {
1000                 /* Make sure the first page is sent out before other pages */
1001                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1002                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
1003                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1004                                                  migrate_compress_level());
1005                 if (blen > 0) {
1006                     rs->bytes_transferred += bytes_xmit + blen;
1007                     rs->norm_pages++;
1008                     pages = 1;
1009                 } else {
1010                     qemu_file_set_error(rs->f, blen);
1011                     error_report("compressed data failed!");
1012                 }
1013             }
1014             if (pages > 0) {
1015                 ram_release_pages(block->idstr, offset, pages);
1016             }
1017         } else {
1018             pages = save_zero_page(rs, block, offset, p);
1019             if (pages == -1) {
1020                 pages = compress_page_with_multi_thread(rs, block, offset);
1021             } else {
1022                 ram_release_pages(block->idstr, offset, pages);
1023             }
1024         }
1025     }
1026
1027     return pages;
1028 }
1029
1030 /**
1031  * find_dirty_block: find the next dirty page and update any state
1032  * associated with the search process.
1033  *
1034  * Returns if a page is found
1035  *
1036  * @rs: current RAM state
1037  * @pss: data about the state of the current dirty page scan
1038  * @again: set to false if the search has scanned the whole of RAM
1039  */
1040 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1041 {
1042     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1043     if (pss->complete_round && pss->block == rs->last_seen_block &&
1044         pss->page >= rs->last_page) {
1045         /*
1046          * We've been once around the RAM and haven't found anything.
1047          * Give up.
1048          */
1049         *again = false;
1050         return false;
1051     }
1052     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1053         /* Didn't find anything in this RAM Block */
1054         pss->page = 0;
1055         pss->block = QLIST_NEXT_RCU(pss->block, next);
1056         if (!pss->block) {
1057             /* Hit the end of the list */
1058             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1059             /* Flag that we've looped */
1060             pss->complete_round = true;
1061             rs->ram_bulk_stage = false;
1062             if (migrate_use_xbzrle()) {
1063                 /* If xbzrle is on, stop using the data compression at this
1064                  * point. In theory, xbzrle can do better than compression.
1065                  */
1066                 flush_compressed_data(rs);
1067             }
1068         }
1069         /* Didn't find anything this time, but try again on the new block */
1070         *again = true;
1071         return false;
1072     } else {
1073         /* Can go around again, but... */
1074         *again = true;
1075         /* We've found something so probably don't need to */
1076         return true;
1077     }
1078 }
1079
1080 /**
1081  * unqueue_page: gets a page of the queue
1082  *
1083  * Helper for 'get_queued_page' - gets a page off the queue
1084  *
1085  * Returns the block of the page (or NULL if none available)
1086  *
1087  * @rs: current RAM state
1088  * @offset: used to return the offset within the RAMBlock
1089  */
1090 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1091 {
1092     RAMBlock *block = NULL;
1093
1094     qemu_mutex_lock(&rs->src_page_req_mutex);
1095     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1096         struct RAMSrcPageRequest *entry =
1097                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1098         block = entry->rb;
1099         *offset = entry->offset;
1100
1101         if (entry->len > TARGET_PAGE_SIZE) {
1102             entry->len -= TARGET_PAGE_SIZE;
1103             entry->offset += TARGET_PAGE_SIZE;
1104         } else {
1105             memory_region_unref(block->mr);
1106             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1107             g_free(entry);
1108         }
1109     }
1110     qemu_mutex_unlock(&rs->src_page_req_mutex);
1111
1112     return block;
1113 }
1114
1115 /**
1116  * get_queued_page: unqueue a page from the postocpy requests
1117  *
1118  * Skips pages that are already sent (!dirty)
1119  *
1120  * Returns if a queued page is found
1121  *
1122  * @rs: current RAM state
1123  * @pss: data about the state of the current dirty page scan
1124  */
1125 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1126 {
1127     RAMBlock  *block;
1128     ram_addr_t offset;
1129     bool dirty;
1130
1131     do {
1132         block = unqueue_page(rs, &offset);
1133         /*
1134          * We're sending this page, and since it's postcopy nothing else
1135          * will dirty it, and we must make sure it doesn't get sent again
1136          * even if this queue request was received after the background
1137          * search already sent it.
1138          */
1139         if (block) {
1140             unsigned long page;
1141
1142             page = offset >> TARGET_PAGE_BITS;
1143             dirty = test_bit(page, block->bmap);
1144             if (!dirty) {
1145                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1146                        page, test_bit(page, block->unsentmap));
1147             } else {
1148                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1149             }
1150         }
1151
1152     } while (block && !dirty);
1153
1154     if (block) {
1155         /*
1156          * As soon as we start servicing pages out of order, then we have
1157          * to kill the bulk stage, since the bulk stage assumes
1158          * in (migration_bitmap_find_and_reset_dirty) that every page is
1159          * dirty, that's no longer true.
1160          */
1161         rs->ram_bulk_stage = false;
1162
1163         /*
1164          * We want the background search to continue from the queued page
1165          * since the guest is likely to want other pages near to the page
1166          * it just requested.
1167          */
1168         pss->block = block;
1169         pss->page = offset >> TARGET_PAGE_BITS;
1170     }
1171
1172     return !!block;
1173 }
1174
1175 /**
1176  * migration_page_queue_free: drop any remaining pages in the ram
1177  * request queue
1178  *
1179  * It should be empty at the end anyway, but in error cases there may
1180  * be some left.  in case that there is any page left, we drop it.
1181  *
1182  */
1183 void migration_page_queue_free(void)
1184 {
1185     struct RAMSrcPageRequest *mspr, *next_mspr;
1186     RAMState *rs = &ram_state;
1187     /* This queue generally should be empty - but in the case of a failed
1188      * migration might have some droppings in.
1189      */
1190     rcu_read_lock();
1191     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1192         memory_region_unref(mspr->rb->mr);
1193         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1194         g_free(mspr);
1195     }
1196     rcu_read_unlock();
1197 }
1198
1199 /**
1200  * ram_save_queue_pages: queue the page for transmission
1201  *
1202  * A request from postcopy destination for example.
1203  *
1204  * Returns zero on success or negative on error
1205  *
1206  * @rbname: Name of the RAMBLock of the request. NULL means the
1207  *          same that last one.
1208  * @start: starting address from the start of the RAMBlock
1209  * @len: length (in bytes) to send
1210  */
1211 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1212 {
1213     RAMBlock *ramblock;
1214     RAMState *rs = &ram_state;
1215
1216     rs->postcopy_requests++;
1217     rcu_read_lock();
1218     if (!rbname) {
1219         /* Reuse last RAMBlock */
1220         ramblock = rs->last_req_rb;
1221
1222         if (!ramblock) {
1223             /*
1224              * Shouldn't happen, we can't reuse the last RAMBlock if
1225              * it's the 1st request.
1226              */
1227             error_report("ram_save_queue_pages no previous block");
1228             goto err;
1229         }
1230     } else {
1231         ramblock = qemu_ram_block_by_name(rbname);
1232
1233         if (!ramblock) {
1234             /* We shouldn't be asked for a non-existent RAMBlock */
1235             error_report("ram_save_queue_pages no block '%s'", rbname);
1236             goto err;
1237         }
1238         rs->last_req_rb = ramblock;
1239     }
1240     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1241     if (start+len > ramblock->used_length) {
1242         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1243                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1244                      __func__, start, len, ramblock->used_length);
1245         goto err;
1246     }
1247
1248     struct RAMSrcPageRequest *new_entry =
1249         g_malloc0(sizeof(struct RAMSrcPageRequest));
1250     new_entry->rb = ramblock;
1251     new_entry->offset = start;
1252     new_entry->len = len;
1253
1254     memory_region_ref(ramblock->mr);
1255     qemu_mutex_lock(&rs->src_page_req_mutex);
1256     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1257     qemu_mutex_unlock(&rs->src_page_req_mutex);
1258     rcu_read_unlock();
1259
1260     return 0;
1261
1262 err:
1263     rcu_read_unlock();
1264     return -1;
1265 }
1266
1267 /**
1268  * ram_save_target_page: save one target page
1269  *
1270  * Returns the number of pages written
1271  *
1272  * @rs: current RAM state
1273  * @ms: current migration state
1274  * @pss: data about the page we want to send
1275  * @last_stage: if we are at the completion stage
1276  */
1277 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1278                                 bool last_stage)
1279 {
1280     int res = 0;
1281
1282     /* Check the pages is dirty and if it is send it */
1283     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1284         /*
1285          * If xbzrle is on, stop using the data compression after first
1286          * round of migration even if compression is enabled. In theory,
1287          * xbzrle can do better than compression.
1288          */
1289         if (migrate_use_compression() &&
1290             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1291             res = ram_save_compressed_page(rs, pss, last_stage);
1292         } else {
1293             res = ram_save_page(rs, pss, last_stage);
1294         }
1295
1296         if (res < 0) {
1297             return res;
1298         }
1299         if (pss->block->unsentmap) {
1300             clear_bit(pss->page, pss->block->unsentmap);
1301         }
1302     }
1303
1304     return res;
1305 }
1306
1307 /**
1308  * ram_save_host_page: save a whole host page
1309  *
1310  * Starting at *offset send pages up to the end of the current host
1311  * page. It's valid for the initial offset to point into the middle of
1312  * a host page in which case the remainder of the hostpage is sent.
1313  * Only dirty target pages are sent. Note that the host page size may
1314  * be a huge page for this block.
1315  *
1316  * Returns the number of pages written or negative on error
1317  *
1318  * @rs: current RAM state
1319  * @ms: current migration state
1320  * @pss: data about the page we want to send
1321  * @last_stage: if we are at the completion stage
1322  */
1323 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1324                               bool last_stage)
1325 {
1326     int tmppages, pages = 0;
1327     size_t pagesize_bits =
1328         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1329
1330     do {
1331         tmppages = ram_save_target_page(rs, pss, last_stage);
1332         if (tmppages < 0) {
1333             return tmppages;
1334         }
1335
1336         pages += tmppages;
1337         pss->page++;
1338     } while (pss->page & (pagesize_bits - 1));
1339
1340     /* The offset we leave with is the last one we looked at */
1341     pss->page--;
1342     return pages;
1343 }
1344
1345 /**
1346  * ram_find_and_save_block: finds a dirty page and sends it to f
1347  *
1348  * Called within an RCU critical section.
1349  *
1350  * Returns the number of pages written where zero means no dirty pages
1351  *
1352  * @rs: current RAM state
1353  * @last_stage: if we are at the completion stage
1354  *
1355  * On systems where host-page-size > target-page-size it will send all the
1356  * pages in a host page that are dirty.
1357  */
1358
1359 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1360 {
1361     PageSearchStatus pss;
1362     int pages = 0;
1363     bool again, found;
1364
1365     /* No dirty page as there is zero RAM */
1366     if (!ram_bytes_total()) {
1367         return pages;
1368     }
1369
1370     pss.block = rs->last_seen_block;
1371     pss.page = rs->last_page;
1372     pss.complete_round = false;
1373
1374     if (!pss.block) {
1375         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1376     }
1377
1378     do {
1379         again = true;
1380         found = get_queued_page(rs, &pss);
1381
1382         if (!found) {
1383             /* priority queue empty, so just search for something dirty */
1384             found = find_dirty_block(rs, &pss, &again);
1385         }
1386
1387         if (found) {
1388             pages = ram_save_host_page(rs, &pss, last_stage);
1389         }
1390     } while (!pages && again);
1391
1392     rs->last_seen_block = pss.block;
1393     rs->last_page = pss.page;
1394
1395     return pages;
1396 }
1397
1398 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1399 {
1400     uint64_t pages = size / TARGET_PAGE_SIZE;
1401     RAMState *rs = &ram_state;
1402
1403     if (zero) {
1404         rs->zero_pages += pages;
1405     } else {
1406         rs->norm_pages += pages;
1407         rs->bytes_transferred += size;
1408         qemu_update_position(f, size);
1409     }
1410 }
1411
1412 uint64_t ram_bytes_total(void)
1413 {
1414     RAMBlock *block;
1415     uint64_t total = 0;
1416
1417     rcu_read_lock();
1418     RAMBLOCK_FOREACH(block) {
1419         total += block->used_length;
1420     }
1421     rcu_read_unlock();
1422     return total;
1423 }
1424
1425 void free_xbzrle_decoded_buf(void)
1426 {
1427     g_free(xbzrle_decoded_buf);
1428     xbzrle_decoded_buf = NULL;
1429 }
1430
1431 static void ram_migration_cleanup(void *opaque)
1432 {
1433     RAMBlock *block;
1434
1435     /* caller have hold iothread lock or is in a bh, so there is
1436      * no writing race against this migration_bitmap
1437      */
1438     memory_global_dirty_log_stop();
1439
1440     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1441         g_free(block->bmap);
1442         block->bmap = NULL;
1443         g_free(block->unsentmap);
1444         block->unsentmap = NULL;
1445     }
1446
1447     XBZRLE_cache_lock();
1448     if (XBZRLE.cache) {
1449         cache_fini(XBZRLE.cache);
1450         g_free(XBZRLE.encoded_buf);
1451         g_free(XBZRLE.current_buf);
1452         g_free(ZERO_TARGET_PAGE);
1453         XBZRLE.cache = NULL;
1454         XBZRLE.encoded_buf = NULL;
1455         XBZRLE.current_buf = NULL;
1456     }
1457     XBZRLE_cache_unlock();
1458 }
1459
1460 static void ram_state_reset(RAMState *rs)
1461 {
1462     rs->last_seen_block = NULL;
1463     rs->last_sent_block = NULL;
1464     rs->last_page = 0;
1465     rs->last_version = ram_list.version;
1466     rs->ram_bulk_stage = true;
1467 }
1468
1469 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1470
1471 /*
1472  * 'expected' is the value you expect the bitmap mostly to be full
1473  * of; it won't bother printing lines that are all this value.
1474  * If 'todump' is null the migration bitmap is dumped.
1475  */
1476 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1477                            unsigned long pages)
1478 {
1479     int64_t cur;
1480     int64_t linelen = 128;
1481     char linebuf[129];
1482
1483     for (cur = 0; cur < pages; cur += linelen) {
1484         int64_t curb;
1485         bool found = false;
1486         /*
1487          * Last line; catch the case where the line length
1488          * is longer than remaining ram
1489          */
1490         if (cur + linelen > pages) {
1491             linelen = pages - cur;
1492         }
1493         for (curb = 0; curb < linelen; curb++) {
1494             bool thisbit = test_bit(cur + curb, todump);
1495             linebuf[curb] = thisbit ? '1' : '.';
1496             found = found || (thisbit != expected);
1497         }
1498         if (found) {
1499             linebuf[curb] = '\0';
1500             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1501         }
1502     }
1503 }
1504
1505 /* **** functions for postcopy ***** */
1506
1507 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1508 {
1509     struct RAMBlock *block;
1510
1511     RAMBLOCK_FOREACH(block) {
1512         unsigned long *bitmap = block->bmap;
1513         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1514         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1515
1516         while (run_start < range) {
1517             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1518             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1519                               (run_end - run_start) << TARGET_PAGE_BITS);
1520             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1521         }
1522     }
1523 }
1524
1525 /**
1526  * postcopy_send_discard_bm_ram: discard a RAMBlock
1527  *
1528  * Returns zero on success
1529  *
1530  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1531  * Note: At this point the 'unsentmap' is the processed bitmap combined
1532  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1533  *
1534  * @ms: current migration state
1535  * @pds: state for postcopy
1536  * @start: RAMBlock starting page
1537  * @length: RAMBlock size
1538  */
1539 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1540                                         PostcopyDiscardState *pds,
1541                                         RAMBlock *block)
1542 {
1543     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1544     unsigned long current;
1545     unsigned long *unsentmap = block->unsentmap;
1546
1547     for (current = 0; current < end; ) {
1548         unsigned long one = find_next_bit(unsentmap, end, current);
1549
1550         if (one <= end) {
1551             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1552             unsigned long discard_length;
1553
1554             if (zero >= end) {
1555                 discard_length = end - one;
1556             } else {
1557                 discard_length = zero - one;
1558             }
1559             if (discard_length) {
1560                 postcopy_discard_send_range(ms, pds, one, discard_length);
1561             }
1562             current = one + discard_length;
1563         } else {
1564             current = one;
1565         }
1566     }
1567
1568     return 0;
1569 }
1570
1571 /**
1572  * postcopy_each_ram_send_discard: discard all RAMBlocks
1573  *
1574  * Returns 0 for success or negative for error
1575  *
1576  * Utility for the outgoing postcopy code.
1577  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1578  *   passing it bitmap indexes and name.
1579  * (qemu_ram_foreach_block ends up passing unscaled lengths
1580  *  which would mean postcopy code would have to deal with target page)
1581  *
1582  * @ms: current migration state
1583  */
1584 static int postcopy_each_ram_send_discard(MigrationState *ms)
1585 {
1586     struct RAMBlock *block;
1587     int ret;
1588
1589     RAMBLOCK_FOREACH(block) {
1590         PostcopyDiscardState *pds =
1591             postcopy_discard_send_init(ms, block->idstr);
1592
1593         /*
1594          * Postcopy sends chunks of bitmap over the wire, but it
1595          * just needs indexes at this point, avoids it having
1596          * target page specific code.
1597          */
1598         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1599         postcopy_discard_send_finish(ms, pds);
1600         if (ret) {
1601             return ret;
1602         }
1603     }
1604
1605     return 0;
1606 }
1607
1608 /**
1609  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1610  *
1611  * Helper for postcopy_chunk_hostpages; it's called twice to
1612  * canonicalize the two bitmaps, that are similar, but one is
1613  * inverted.
1614  *
1615  * Postcopy requires that all target pages in a hostpage are dirty or
1616  * clean, not a mix.  This function canonicalizes the bitmaps.
1617  *
1618  * @ms: current migration state
1619  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1620  *               otherwise we need to canonicalize partially dirty host pages
1621  * @block: block that contains the page we want to canonicalize
1622  * @pds: state for postcopy
1623  */
1624 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1625                                           RAMBlock *block,
1626                                           PostcopyDiscardState *pds)
1627 {
1628     RAMState *rs = &ram_state;
1629     unsigned long *bitmap = block->bmap;
1630     unsigned long *unsentmap = block->unsentmap;
1631     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1632     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1633     unsigned long run_start;
1634
1635     if (block->page_size == TARGET_PAGE_SIZE) {
1636         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1637         return;
1638     }
1639
1640     if (unsent_pass) {
1641         /* Find a sent page */
1642         run_start = find_next_zero_bit(unsentmap, pages, 0);
1643     } else {
1644         /* Find a dirty page */
1645         run_start = find_next_bit(bitmap, pages, 0);
1646     }
1647
1648     while (run_start < pages) {
1649         bool do_fixup = false;
1650         unsigned long fixup_start_addr;
1651         unsigned long host_offset;
1652
1653         /*
1654          * If the start of this run of pages is in the middle of a host
1655          * page, then we need to fixup this host page.
1656          */
1657         host_offset = run_start % host_ratio;
1658         if (host_offset) {
1659             do_fixup = true;
1660             run_start -= host_offset;
1661             fixup_start_addr = run_start;
1662             /* For the next pass */
1663             run_start = run_start + host_ratio;
1664         } else {
1665             /* Find the end of this run */
1666             unsigned long run_end;
1667             if (unsent_pass) {
1668                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1669             } else {
1670                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1671             }
1672             /*
1673              * If the end isn't at the start of a host page, then the
1674              * run doesn't finish at the end of a host page
1675              * and we need to discard.
1676              */
1677             host_offset = run_end % host_ratio;
1678             if (host_offset) {
1679                 do_fixup = true;
1680                 fixup_start_addr = run_end - host_offset;
1681                 /*
1682                  * This host page has gone, the next loop iteration starts
1683                  * from after the fixup
1684                  */
1685                 run_start = fixup_start_addr + host_ratio;
1686             } else {
1687                 /*
1688                  * No discards on this iteration, next loop starts from
1689                  * next sent/dirty page
1690                  */
1691                 run_start = run_end + 1;
1692             }
1693         }
1694
1695         if (do_fixup) {
1696             unsigned long page;
1697
1698             /* Tell the destination to discard this page */
1699             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1700                 /* For the unsent_pass we:
1701                  *     discard partially sent pages
1702                  * For the !unsent_pass (dirty) we:
1703                  *     discard partially dirty pages that were sent
1704                  *     (any partially sent pages were already discarded
1705                  *     by the previous unsent_pass)
1706                  */
1707                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1708                                             host_ratio);
1709             }
1710
1711             /* Clean up the bitmap */
1712             for (page = fixup_start_addr;
1713                  page < fixup_start_addr + host_ratio; page++) {
1714                 /* All pages in this host page are now not sent */
1715                 set_bit(page, unsentmap);
1716
1717                 /*
1718                  * Remark them as dirty, updating the count for any pages
1719                  * that weren't previously dirty.
1720                  */
1721                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1722             }
1723         }
1724
1725         if (unsent_pass) {
1726             /* Find the next sent page for the next iteration */
1727             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1728         } else {
1729             /* Find the next dirty page for the next iteration */
1730             run_start = find_next_bit(bitmap, pages, run_start);
1731         }
1732     }
1733 }
1734
1735 /**
1736  * postcopy_chuck_hostpages: discrad any partially sent host page
1737  *
1738  * Utility for the outgoing postcopy code.
1739  *
1740  * Discard any partially sent host-page size chunks, mark any partially
1741  * dirty host-page size chunks as all dirty.  In this case the host-page
1742  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1743  *
1744  * Returns zero on success
1745  *
1746  * @ms: current migration state
1747  * @block: block we want to work with
1748  */
1749 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1750 {
1751     PostcopyDiscardState *pds =
1752         postcopy_discard_send_init(ms, block->idstr);
1753
1754     /* First pass: Discard all partially sent host pages */
1755     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1756     /*
1757      * Second pass: Ensure that all partially dirty host pages are made
1758      * fully dirty.
1759      */
1760     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1761
1762     postcopy_discard_send_finish(ms, pds);
1763     return 0;
1764 }
1765
1766 /**
1767  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1768  *
1769  * Returns zero on success
1770  *
1771  * Transmit the set of pages to be discarded after precopy to the target
1772  * these are pages that:
1773  *     a) Have been previously transmitted but are now dirty again
1774  *     b) Pages that have never been transmitted, this ensures that
1775  *        any pages on the destination that have been mapped by background
1776  *        tasks get discarded (transparent huge pages is the specific concern)
1777  * Hopefully this is pretty sparse
1778  *
1779  * @ms: current migration state
1780  */
1781 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1782 {
1783     RAMState *rs = &ram_state;
1784     RAMBlock *block;
1785     int ret;
1786
1787     rcu_read_lock();
1788
1789     /* This should be our last sync, the src is now paused */
1790     migration_bitmap_sync(rs);
1791
1792     /* Easiest way to make sure we don't resume in the middle of a host-page */
1793     rs->last_seen_block = NULL;
1794     rs->last_sent_block = NULL;
1795     rs->last_page = 0;
1796
1797     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1798         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1799         unsigned long *bitmap = block->bmap;
1800         unsigned long *unsentmap = block->unsentmap;
1801
1802         if (!unsentmap) {
1803             /* We don't have a safe way to resize the sentmap, so
1804              * if the bitmap was resized it will be NULL at this
1805              * point.
1806              */
1807             error_report("migration ram resized during precopy phase");
1808             rcu_read_unlock();
1809             return -EINVAL;
1810         }
1811         /* Deal with TPS != HPS and huge pages */
1812         ret = postcopy_chunk_hostpages(ms, block);
1813         if (ret) {
1814             rcu_read_unlock();
1815             return ret;
1816         }
1817
1818         /*
1819          * Update the unsentmap to be unsentmap = unsentmap | dirty
1820          */
1821         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1822 #ifdef DEBUG_POSTCOPY
1823         ram_debug_dump_bitmap(unsentmap, true, pages);
1824 #endif
1825     }
1826     trace_ram_postcopy_send_discard_bitmap();
1827
1828     ret = postcopy_each_ram_send_discard(ms);
1829     rcu_read_unlock();
1830
1831     return ret;
1832 }
1833
1834 /**
1835  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1836  *
1837  * Returns zero on success
1838  *
1839  * @rbname: name of the RAMBlock of the request. NULL means the
1840  *          same that last one.
1841  * @start: RAMBlock starting page
1842  * @length: RAMBlock size
1843  */
1844 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1845 {
1846     int ret = -1;
1847
1848     trace_ram_discard_range(rbname, start, length);
1849
1850     rcu_read_lock();
1851     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1852
1853     if (!rb) {
1854         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1855         goto err;
1856     }
1857
1858     ret = ram_block_discard_range(rb, start, length);
1859
1860 err:
1861     rcu_read_unlock();
1862
1863     return ret;
1864 }
1865
1866 static int ram_state_init(RAMState *rs)
1867 {
1868     memset(rs, 0, sizeof(*rs));
1869     qemu_mutex_init(&rs->bitmap_mutex);
1870     qemu_mutex_init(&rs->src_page_req_mutex);
1871     QSIMPLEQ_INIT(&rs->src_page_requests);
1872
1873     if (migrate_use_xbzrle()) {
1874         XBZRLE_cache_lock();
1875         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1876         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1877                                   TARGET_PAGE_SIZE,
1878                                   TARGET_PAGE_SIZE);
1879         if (!XBZRLE.cache) {
1880             XBZRLE_cache_unlock();
1881             error_report("Error creating cache");
1882             return -1;
1883         }
1884         XBZRLE_cache_unlock();
1885
1886         /* We prefer not to abort if there is no memory */
1887         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1888         if (!XBZRLE.encoded_buf) {
1889             error_report("Error allocating encoded_buf");
1890             return -1;
1891         }
1892
1893         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1894         if (!XBZRLE.current_buf) {
1895             error_report("Error allocating current_buf");
1896             g_free(XBZRLE.encoded_buf);
1897             XBZRLE.encoded_buf = NULL;
1898             return -1;
1899         }
1900     }
1901
1902     /* For memory_global_dirty_log_start below.  */
1903     qemu_mutex_lock_iothread();
1904
1905     qemu_mutex_lock_ramlist();
1906     rcu_read_lock();
1907     ram_state_reset(rs);
1908
1909     /* Skip setting bitmap if there is no RAM */
1910     if (ram_bytes_total()) {
1911         RAMBlock *block;
1912
1913         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1914             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1915
1916             block->bmap = bitmap_new(pages);
1917             bitmap_set(block->bmap, 0, pages);
1918             if (migrate_postcopy_ram()) {
1919                 block->unsentmap = bitmap_new(pages);
1920                 bitmap_set(block->unsentmap, 0, pages);
1921             }
1922         }
1923     }
1924
1925     /*
1926      * Count the total number of pages used by ram blocks not including any
1927      * gaps due to alignment or unplugs.
1928      */
1929     rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1930
1931     memory_global_dirty_log_start();
1932     migration_bitmap_sync(rs);
1933     qemu_mutex_unlock_ramlist();
1934     qemu_mutex_unlock_iothread();
1935     rcu_read_unlock();
1936
1937     return 0;
1938 }
1939
1940 /*
1941  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1942  * long-running RCU critical section.  When rcu-reclaims in the code
1943  * start to become numerous it will be necessary to reduce the
1944  * granularity of these critical sections.
1945  */
1946
1947 /**
1948  * ram_save_setup: Setup RAM for migration
1949  *
1950  * Returns zero to indicate success and negative for error
1951  *
1952  * @f: QEMUFile where to send the data
1953  * @opaque: RAMState pointer
1954  */
1955 static int ram_save_setup(QEMUFile *f, void *opaque)
1956 {
1957     RAMState *rs = opaque;
1958     RAMBlock *block;
1959
1960     /* migration has already setup the bitmap, reuse it. */
1961     if (!migration_in_colo_state()) {
1962         if (ram_state_init(rs) < 0) {
1963             return -1;
1964          }
1965     }
1966     rs->f = f;
1967
1968     rcu_read_lock();
1969
1970     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1971
1972     RAMBLOCK_FOREACH(block) {
1973         qemu_put_byte(f, strlen(block->idstr));
1974         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1975         qemu_put_be64(f, block->used_length);
1976         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1977             qemu_put_be64(f, block->page_size);
1978         }
1979     }
1980
1981     rcu_read_unlock();
1982
1983     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1984     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1985
1986     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1987
1988     return 0;
1989 }
1990
1991 /**
1992  * ram_save_iterate: iterative stage for migration
1993  *
1994  * Returns zero to indicate success and negative for error
1995  *
1996  * @f: QEMUFile where to send the data
1997  * @opaque: RAMState pointer
1998  */
1999 static int ram_save_iterate(QEMUFile *f, void *opaque)
2000 {
2001     RAMState *rs = opaque;
2002     int ret;
2003     int i;
2004     int64_t t0;
2005     int done = 0;
2006
2007     rcu_read_lock();
2008     if (ram_list.version != rs->last_version) {
2009         ram_state_reset(rs);
2010     }
2011
2012     /* Read version before ram_list.blocks */
2013     smp_rmb();
2014
2015     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2016
2017     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2018     i = 0;
2019     while ((ret = qemu_file_rate_limit(f)) == 0) {
2020         int pages;
2021
2022         pages = ram_find_and_save_block(rs, false);
2023         /* no more pages to sent */
2024         if (pages == 0) {
2025             done = 1;
2026             break;
2027         }
2028         rs->iterations++;
2029
2030         /* we want to check in the 1st loop, just in case it was the 1st time
2031            and we had to sync the dirty bitmap.
2032            qemu_get_clock_ns() is a bit expensive, so we only check each some
2033            iterations
2034         */
2035         if ((i & 63) == 0) {
2036             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2037             if (t1 > MAX_WAIT) {
2038                 trace_ram_save_iterate_big_wait(t1, i);
2039                 break;
2040             }
2041         }
2042         i++;
2043     }
2044     flush_compressed_data(rs);
2045     rcu_read_unlock();
2046
2047     /*
2048      * Must occur before EOS (or any QEMUFile operation)
2049      * because of RDMA protocol.
2050      */
2051     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2052
2053     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2054     rs->bytes_transferred += 8;
2055
2056     ret = qemu_file_get_error(f);
2057     if (ret < 0) {
2058         return ret;
2059     }
2060
2061     return done;
2062 }
2063
2064 /**
2065  * ram_save_complete: function called to send the remaining amount of ram
2066  *
2067  * Returns zero to indicate success
2068  *
2069  * Called with iothread lock
2070  *
2071  * @f: QEMUFile where to send the data
2072  * @opaque: RAMState pointer
2073  */
2074 static int ram_save_complete(QEMUFile *f, void *opaque)
2075 {
2076     RAMState *rs = opaque;
2077
2078     rcu_read_lock();
2079
2080     if (!migration_in_postcopy()) {
2081         migration_bitmap_sync(rs);
2082     }
2083
2084     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2085
2086     /* try transferring iterative blocks of memory */
2087
2088     /* flush all remaining blocks regardless of rate limiting */
2089     while (true) {
2090         int pages;
2091
2092         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2093         /* no more blocks to sent */
2094         if (pages == 0) {
2095             break;
2096         }
2097     }
2098
2099     flush_compressed_data(rs);
2100     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2101
2102     rcu_read_unlock();
2103
2104     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2105
2106     return 0;
2107 }
2108
2109 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2110                              uint64_t *non_postcopiable_pending,
2111                              uint64_t *postcopiable_pending)
2112 {
2113     RAMState *rs = opaque;
2114     uint64_t remaining_size;
2115
2116     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2117
2118     if (!migration_in_postcopy() &&
2119         remaining_size < max_size) {
2120         qemu_mutex_lock_iothread();
2121         rcu_read_lock();
2122         migration_bitmap_sync(rs);
2123         rcu_read_unlock();
2124         qemu_mutex_unlock_iothread();
2125         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2126     }
2127
2128     /* We can do postcopy, and all the data is postcopiable */
2129     *postcopiable_pending += remaining_size;
2130 }
2131
2132 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2133 {
2134     unsigned int xh_len;
2135     int xh_flags;
2136     uint8_t *loaded_data;
2137
2138     if (!xbzrle_decoded_buf) {
2139         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2140     }
2141     loaded_data = xbzrle_decoded_buf;
2142
2143     /* extract RLE header */
2144     xh_flags = qemu_get_byte(f);
2145     xh_len = qemu_get_be16(f);
2146
2147     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2148         error_report("Failed to load XBZRLE page - wrong compression!");
2149         return -1;
2150     }
2151
2152     if (xh_len > TARGET_PAGE_SIZE) {
2153         error_report("Failed to load XBZRLE page - len overflow!");
2154         return -1;
2155     }
2156     /* load data and decode */
2157     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2158
2159     /* decode RLE */
2160     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2161                              TARGET_PAGE_SIZE) == -1) {
2162         error_report("Failed to load XBZRLE page - decode error!");
2163         return -1;
2164     }
2165
2166     return 0;
2167 }
2168
2169 /**
2170  * ram_block_from_stream: read a RAMBlock id from the migration stream
2171  *
2172  * Must be called from within a rcu critical section.
2173  *
2174  * Returns a pointer from within the RCU-protected ram_list.
2175  *
2176  * @f: QEMUFile where to read the data from
2177  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2178  */
2179 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2180 {
2181     static RAMBlock *block = NULL;
2182     char id[256];
2183     uint8_t len;
2184
2185     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2186         if (!block) {
2187             error_report("Ack, bad migration stream!");
2188             return NULL;
2189         }
2190         return block;
2191     }
2192
2193     len = qemu_get_byte(f);
2194     qemu_get_buffer(f, (uint8_t *)id, len);
2195     id[len] = 0;
2196
2197     block = qemu_ram_block_by_name(id);
2198     if (!block) {
2199         error_report("Can't find block %s", id);
2200         return NULL;
2201     }
2202
2203     return block;
2204 }
2205
2206 static inline void *host_from_ram_block_offset(RAMBlock *block,
2207                                                ram_addr_t offset)
2208 {
2209     if (!offset_in_ramblock(block, offset)) {
2210         return NULL;
2211     }
2212
2213     return block->host + offset;
2214 }
2215
2216 /**
2217  * ram_handle_compressed: handle the zero page case
2218  *
2219  * If a page (or a whole RDMA chunk) has been
2220  * determined to be zero, then zap it.
2221  *
2222  * @host: host address for the zero page
2223  * @ch: what the page is filled from.  We only support zero
2224  * @size: size of the zero page
2225  */
2226 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2227 {
2228     if (ch != 0 || !is_zero_range(host, size)) {
2229         memset(host, ch, size);
2230     }
2231 }
2232
2233 static void *do_data_decompress(void *opaque)
2234 {
2235     DecompressParam *param = opaque;
2236     unsigned long pagesize;
2237     uint8_t *des;
2238     int len;
2239
2240     qemu_mutex_lock(&param->mutex);
2241     while (!param->quit) {
2242         if (param->des) {
2243             des = param->des;
2244             len = param->len;
2245             param->des = 0;
2246             qemu_mutex_unlock(&param->mutex);
2247
2248             pagesize = TARGET_PAGE_SIZE;
2249             /* uncompress() will return failed in some case, especially
2250              * when the page is dirted when doing the compression, it's
2251              * not a problem because the dirty page will be retransferred
2252              * and uncompress() won't break the data in other pages.
2253              */
2254             uncompress((Bytef *)des, &pagesize,
2255                        (const Bytef *)param->compbuf, len);
2256
2257             qemu_mutex_lock(&decomp_done_lock);
2258             param->done = true;
2259             qemu_cond_signal(&decomp_done_cond);
2260             qemu_mutex_unlock(&decomp_done_lock);
2261
2262             qemu_mutex_lock(&param->mutex);
2263         } else {
2264             qemu_cond_wait(&param->cond, &param->mutex);
2265         }
2266     }
2267     qemu_mutex_unlock(&param->mutex);
2268
2269     return NULL;
2270 }
2271
2272 static void wait_for_decompress_done(void)
2273 {
2274     int idx, thread_count;
2275
2276     if (!migrate_use_compression()) {
2277         return;
2278     }
2279
2280     thread_count = migrate_decompress_threads();
2281     qemu_mutex_lock(&decomp_done_lock);
2282     for (idx = 0; idx < thread_count; idx++) {
2283         while (!decomp_param[idx].done) {
2284             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2285         }
2286     }
2287     qemu_mutex_unlock(&decomp_done_lock);
2288 }
2289
2290 void migrate_decompress_threads_create(void)
2291 {
2292     int i, thread_count;
2293
2294     thread_count = migrate_decompress_threads();
2295     decompress_threads = g_new0(QemuThread, thread_count);
2296     decomp_param = g_new0(DecompressParam, thread_count);
2297     qemu_mutex_init(&decomp_done_lock);
2298     qemu_cond_init(&decomp_done_cond);
2299     for (i = 0; i < thread_count; i++) {
2300         qemu_mutex_init(&decomp_param[i].mutex);
2301         qemu_cond_init(&decomp_param[i].cond);
2302         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2303         decomp_param[i].done = true;
2304         decomp_param[i].quit = false;
2305         qemu_thread_create(decompress_threads + i, "decompress",
2306                            do_data_decompress, decomp_param + i,
2307                            QEMU_THREAD_JOINABLE);
2308     }
2309 }
2310
2311 void migrate_decompress_threads_join(void)
2312 {
2313     int i, thread_count;
2314
2315     thread_count = migrate_decompress_threads();
2316     for (i = 0; i < thread_count; i++) {
2317         qemu_mutex_lock(&decomp_param[i].mutex);
2318         decomp_param[i].quit = true;
2319         qemu_cond_signal(&decomp_param[i].cond);
2320         qemu_mutex_unlock(&decomp_param[i].mutex);
2321     }
2322     for (i = 0; i < thread_count; i++) {
2323         qemu_thread_join(decompress_threads + i);
2324         qemu_mutex_destroy(&decomp_param[i].mutex);
2325         qemu_cond_destroy(&decomp_param[i].cond);
2326         g_free(decomp_param[i].compbuf);
2327     }
2328     g_free(decompress_threads);
2329     g_free(decomp_param);
2330     decompress_threads = NULL;
2331     decomp_param = NULL;
2332 }
2333
2334 static void decompress_data_with_multi_threads(QEMUFile *f,
2335                                                void *host, int len)
2336 {
2337     int idx, thread_count;
2338
2339     thread_count = migrate_decompress_threads();
2340     qemu_mutex_lock(&decomp_done_lock);
2341     while (true) {
2342         for (idx = 0; idx < thread_count; idx++) {
2343             if (decomp_param[idx].done) {
2344                 decomp_param[idx].done = false;
2345                 qemu_mutex_lock(&decomp_param[idx].mutex);
2346                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2347                 decomp_param[idx].des = host;
2348                 decomp_param[idx].len = len;
2349                 qemu_cond_signal(&decomp_param[idx].cond);
2350                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2351                 break;
2352             }
2353         }
2354         if (idx < thread_count) {
2355             break;
2356         } else {
2357             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2358         }
2359     }
2360     qemu_mutex_unlock(&decomp_done_lock);
2361 }
2362
2363 /**
2364  * ram_postcopy_incoming_init: allocate postcopy data structures
2365  *
2366  * Returns 0 for success and negative if there was one error
2367  *
2368  * @mis: current migration incoming state
2369  *
2370  * Allocate data structures etc needed by incoming migration with
2371  * postcopy-ram. postcopy-ram's similarly names
2372  * postcopy_ram_incoming_init does the work.
2373  */
2374 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2375 {
2376     unsigned long ram_pages = last_ram_page();
2377
2378     return postcopy_ram_incoming_init(mis, ram_pages);
2379 }
2380
2381 /**
2382  * ram_load_postcopy: load a page in postcopy case
2383  *
2384  * Returns 0 for success or -errno in case of error
2385  *
2386  * Called in postcopy mode by ram_load().
2387  * rcu_read_lock is taken prior to this being called.
2388  *
2389  * @f: QEMUFile where to send the data
2390  */
2391 static int ram_load_postcopy(QEMUFile *f)
2392 {
2393     int flags = 0, ret = 0;
2394     bool place_needed = false;
2395     bool matching_page_sizes = false;
2396     MigrationIncomingState *mis = migration_incoming_get_current();
2397     /* Temporary page that is later 'placed' */
2398     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2399     void *last_host = NULL;
2400     bool all_zero = false;
2401
2402     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2403         ram_addr_t addr;
2404         void *host = NULL;
2405         void *page_buffer = NULL;
2406         void *place_source = NULL;
2407         RAMBlock *block = NULL;
2408         uint8_t ch;
2409
2410         addr = qemu_get_be64(f);
2411         flags = addr & ~TARGET_PAGE_MASK;
2412         addr &= TARGET_PAGE_MASK;
2413
2414         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2415         place_needed = false;
2416         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2417             block = ram_block_from_stream(f, flags);
2418
2419             host = host_from_ram_block_offset(block, addr);
2420             if (!host) {
2421                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2422                 ret = -EINVAL;
2423                 break;
2424             }
2425             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2426             /*
2427              * Postcopy requires that we place whole host pages atomically;
2428              * these may be huge pages for RAMBlocks that are backed by
2429              * hugetlbfs.
2430              * To make it atomic, the data is read into a temporary page
2431              * that's moved into place later.
2432              * The migration protocol uses,  possibly smaller, target-pages
2433              * however the source ensures it always sends all the components
2434              * of a host page in order.
2435              */
2436             page_buffer = postcopy_host_page +
2437                           ((uintptr_t)host & (block->page_size - 1));
2438             /* If all TP are zero then we can optimise the place */
2439             if (!((uintptr_t)host & (block->page_size - 1))) {
2440                 all_zero = true;
2441             } else {
2442                 /* not the 1st TP within the HP */
2443                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2444                     error_report("Non-sequential target page %p/%p",
2445                                   host, last_host);
2446                     ret = -EINVAL;
2447                     break;
2448                 }
2449             }
2450
2451
2452             /*
2453              * If it's the last part of a host page then we place the host
2454              * page
2455              */
2456             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2457                                      (block->page_size - 1)) == 0;
2458             place_source = postcopy_host_page;
2459         }
2460         last_host = host;
2461
2462         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2463         case RAM_SAVE_FLAG_ZERO:
2464             ch = qemu_get_byte(f);
2465             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2466             if (ch) {
2467                 all_zero = false;
2468             }
2469             break;
2470
2471         case RAM_SAVE_FLAG_PAGE:
2472             all_zero = false;
2473             if (!place_needed || !matching_page_sizes) {
2474                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2475             } else {
2476                 /* Avoids the qemu_file copy during postcopy, which is
2477                  * going to do a copy later; can only do it when we
2478                  * do this read in one go (matching page sizes)
2479                  */
2480                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2481                                          TARGET_PAGE_SIZE);
2482             }
2483             break;
2484         case RAM_SAVE_FLAG_EOS:
2485             /* normal exit */
2486             break;
2487         default:
2488             error_report("Unknown combination of migration flags: %#x"
2489                          " (postcopy mode)", flags);
2490             ret = -EINVAL;
2491         }
2492
2493         if (place_needed) {
2494             /* This gets called at the last target page in the host page */
2495             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2496
2497             if (all_zero) {
2498                 ret = postcopy_place_page_zero(mis, place_dest,
2499                                                block->page_size);
2500             } else {
2501                 ret = postcopy_place_page(mis, place_dest,
2502                                           place_source, block->page_size);
2503             }
2504         }
2505         if (!ret) {
2506             ret = qemu_file_get_error(f);
2507         }
2508     }
2509
2510     return ret;
2511 }
2512
2513 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2514 {
2515     int flags = 0, ret = 0;
2516     static uint64_t seq_iter;
2517     int len = 0;
2518     /*
2519      * If system is running in postcopy mode, page inserts to host memory must
2520      * be atomic
2521      */
2522     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2523     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2524     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2525
2526     seq_iter++;
2527
2528     if (version_id != 4) {
2529         ret = -EINVAL;
2530     }
2531
2532     /* This RCU critical section can be very long running.
2533      * When RCU reclaims in the code start to become numerous,
2534      * it will be necessary to reduce the granularity of this
2535      * critical section.
2536      */
2537     rcu_read_lock();
2538
2539     if (postcopy_running) {
2540         ret = ram_load_postcopy(f);
2541     }
2542
2543     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2544         ram_addr_t addr, total_ram_bytes;
2545         void *host = NULL;
2546         uint8_t ch;
2547
2548         addr = qemu_get_be64(f);
2549         flags = addr & ~TARGET_PAGE_MASK;
2550         addr &= TARGET_PAGE_MASK;
2551
2552         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2553                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2554             RAMBlock *block = ram_block_from_stream(f, flags);
2555
2556             host = host_from_ram_block_offset(block, addr);
2557             if (!host) {
2558                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2559                 ret = -EINVAL;
2560                 break;
2561             }
2562             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2563         }
2564
2565         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2566         case RAM_SAVE_FLAG_MEM_SIZE:
2567             /* Synchronize RAM block list */
2568             total_ram_bytes = addr;
2569             while (!ret && total_ram_bytes) {
2570                 RAMBlock *block;
2571                 char id[256];
2572                 ram_addr_t length;
2573
2574                 len = qemu_get_byte(f);
2575                 qemu_get_buffer(f, (uint8_t *)id, len);
2576                 id[len] = 0;
2577                 length = qemu_get_be64(f);
2578
2579                 block = qemu_ram_block_by_name(id);
2580                 if (block) {
2581                     if (length != block->used_length) {
2582                         Error *local_err = NULL;
2583
2584                         ret = qemu_ram_resize(block, length,
2585                                               &local_err);
2586                         if (local_err) {
2587                             error_report_err(local_err);
2588                         }
2589                     }
2590                     /* For postcopy we need to check hugepage sizes match */
2591                     if (postcopy_advised &&
2592                         block->page_size != qemu_host_page_size) {
2593                         uint64_t remote_page_size = qemu_get_be64(f);
2594                         if (remote_page_size != block->page_size) {
2595                             error_report("Mismatched RAM page size %s "
2596                                          "(local) %zd != %" PRId64,
2597                                          id, block->page_size,
2598                                          remote_page_size);
2599                             ret = -EINVAL;
2600                         }
2601                     }
2602                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2603                                           block->idstr);
2604                 } else {
2605                     error_report("Unknown ramblock \"%s\", cannot "
2606                                  "accept migration", id);
2607                     ret = -EINVAL;
2608                 }
2609
2610                 total_ram_bytes -= length;
2611             }
2612             break;
2613
2614         case RAM_SAVE_FLAG_ZERO:
2615             ch = qemu_get_byte(f);
2616             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2617             break;
2618
2619         case RAM_SAVE_FLAG_PAGE:
2620             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2621             break;
2622
2623         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2624             len = qemu_get_be32(f);
2625             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2626                 error_report("Invalid compressed data length: %d", len);
2627                 ret = -EINVAL;
2628                 break;
2629             }
2630             decompress_data_with_multi_threads(f, host, len);
2631             break;
2632
2633         case RAM_SAVE_FLAG_XBZRLE:
2634             if (load_xbzrle(f, addr, host) < 0) {
2635                 error_report("Failed to decompress XBZRLE page at "
2636                              RAM_ADDR_FMT, addr);
2637                 ret = -EINVAL;
2638                 break;
2639             }
2640             break;
2641         case RAM_SAVE_FLAG_EOS:
2642             /* normal exit */
2643             break;
2644         default:
2645             if (flags & RAM_SAVE_FLAG_HOOK) {
2646                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2647             } else {
2648                 error_report("Unknown combination of migration flags: %#x",
2649                              flags);
2650                 ret = -EINVAL;
2651             }
2652         }
2653         if (!ret) {
2654             ret = qemu_file_get_error(f);
2655         }
2656     }
2657
2658     wait_for_decompress_done();
2659     rcu_read_unlock();
2660     trace_ram_load_complete(ret, seq_iter);
2661     return ret;
2662 }
2663
2664 static SaveVMHandlers savevm_ram_handlers = {
2665     .save_live_setup = ram_save_setup,
2666     .save_live_iterate = ram_save_iterate,
2667     .save_live_complete_postcopy = ram_save_complete,
2668     .save_live_complete_precopy = ram_save_complete,
2669     .save_live_pending = ram_save_pending,
2670     .load_state = ram_load,
2671     .cleanup = ram_migration_cleanup,
2672 };
2673
2674 void ram_mig_init(void)
2675 {
2676     qemu_mutex_init(&XBZRLE.lock);
2677     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2678 }