migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "cpu.h"
  30 #include <zlib.h>
  31 #include "qapi-event.h"
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "migration/page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "trace.h"
  46 #include "exec/ram_addr.h"
  47 #include "qemu/rcu_queue.h"
  48 #include "migration/colo.h"
  49
  50 /***********************************************************/
  51 /* ram save/restore */
  52
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  58
/*
 * Flags OR'ed into the be64 offset word that save_page_header() puts on
 * the wire in front of each page record.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
/* Page is all zeroes; the header is followed by a single 0 byte */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
/* Header is followed by TARGET_PAGE_SIZE bytes of raw page data */
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same RAMBlock as the previous record: the block idstr is omitted */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Header is followed by an XBZRLE-encoded delta (see save_xbzrle_page) */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
/* Header is followed by compressed page data (see do_compress_ram_page) */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  68
  69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  70 {
  71     return buffer_is_zero(p, size);
  72 }
  73
/* XBZRLE statistics (cache misses, overflows, pages and bytes sent) */
XBZRLECacheStats xbzrle_counters;
  75
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer) */
    uint8_t *encoded_buf;
    /* buffer for storing page content: the page is copied here before
       it is encoded against the cached copy */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
  91
  92 static void XBZRLE_cache_lock(void)
  93 {
  94     if (migrate_use_xbzrle())
  95         qemu_mutex_lock(&XBZRLE.lock);
  96 }
  97
  98 static void XBZRLE_cache_unlock(void)
  99 {
 100     if (migrate_use_xbzrle())
 101         qemu_mutex_unlock(&XBZRLE.lock);
 102 }
 103
 104 /**
 105  * xbzrle_cache_resize: resize the xbzrle cache
 106  *
 107  * This function is called from qmp_migrate_set_cache_size in main
 108  * thread, possibly while a migration is in progress.  A running
 109  * migration may be using the cache and might finish during this call,
 110  * hence changes to the cache are protected by XBZRLE.lock().
 111  *
 112  * Returns the new_size or negative in case of error.
 113  *
 114  * @new_size: new cache size
 115  */
 116 int64_t xbzrle_cache_resize(int64_t new_size)
 117 {
 118     PageCache *new_cache;
 119     int64_t ret;
 120
 121     if (new_size < TARGET_PAGE_SIZE) {
 122         return -1;
 123     }
 124
 125     XBZRLE_cache_lock();
 126
 127     if (XBZRLE.cache != NULL) {
 128         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 129             goto out_new_size;
 130         }
 131         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 132                                         TARGET_PAGE_SIZE);
 133         if (!new_cache) {
 134             error_report("Error creating cache");
 135             ret = -1;
 136             goto out;
 137         }
 138
 139         cache_fini(XBZRLE.cache);
 140         XBZRLE.cache = new_cache;
 141     }
 142
 143 out_new_size:
 144     ret = pow2floor(new_size);
 145 out:
 146     XBZRLE_cache_unlock();
 147     return ret;
 148 }
 149
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* Start and length of the requested range
     * NOTE(review): presumably byte offsets within @rb - confirm at the
     * enqueue site */
    hwaddr    offset;
    hwaddr    len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 161
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses it
     * to decide whether the block name must be (re)sent */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times the guest has dirtied too many pages in a period;
     * reset when throttling is started or increased */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync (QEMU_CLOCK_REALTIME, in ms) */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 202
/* RAM migration state shared by the functions in this file */
static RAMState *ram_state;
 204
 205 uint64_t ram_bytes_remaining(void)
 206 {
 207     return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
 208 }
 209
/* RAM migration statistics (bytes transferred, page counts, sync count) */
MigrationStats ram_counters;
 211
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from, as a target-page index within @block
     * (it is shifted by TARGET_PAGE_BITS to obtain the byte offset) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 222
/* Per-thread state of one compression worker (see do_data_compress) */
struct CompressParam {
    /* Worker has finished its page; read and written under comp_done_lock */
    bool done;
    /* Set under @mutex to ask the worker to exit */
    bool quit;
    /* Buffer-only QEMUFile receiving the worker's output */
    QEMUFile *file;
    /* Protects @quit, @block and @offset */
    QemuMutex mutex;
    /* Signalled when a new request is posted or @quit is set */
    QemuCond cond;
    /* Pending request: block and offset to compress; @block is NULL when
     * there is no request */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 233
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using these fields is outside this part of the
 * file; the field notes below are inferred from the names and from the
 * parallel CompressParam - confirm against the decompression code.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 244
/* Arrays with one entry per compression thread; allocated in
 * compress_threads_save_setup() and freed in
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts of the variables above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined further down; needed by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 263
/*
 * do_data_compress: body of one compression worker thread
 *
 * @opaque is the CompressParam of this worker.  The thread sleeps on
 * param->cond until a request is posted in param->block, compresses that
 * page into param->file, then sets param->done and signals
 * comp_done_cond.  It leaves the loop once param->quit is set.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the request and clear the slot while holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* Do not hold the per-thread mutex while compressing */
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            /* 'done' is published under comp_done_lock, not param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* No request pending: wait for a new one or for quit */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 294
 295 static inline void terminate_compression_threads(void)
 296 {
 297     int idx, thread_count;
 298
 299     thread_count = migrate_compress_threads();
 300
 301     for (idx = 0; idx < thread_count; idx++) {
 302         qemu_mutex_lock(&comp_param[idx].mutex);
 303         comp_param[idx].quit = true;
 304         qemu_cond_signal(&comp_param[idx].cond);
 305         qemu_mutex_unlock(&comp_param[idx].mutex);
 306     }
 307 }
 308
 309 static void compress_threads_save_cleanup(void)
 310 {
 311     int i, thread_count;
 312
 313     if (!migrate_use_compression()) {
 314         return;
 315     }
 316     terminate_compression_threads();
 317     thread_count = migrate_compress_threads();
 318     for (i = 0; i < thread_count; i++) {
 319         qemu_thread_join(compress_threads + i);
 320         qemu_fclose(comp_param[i].file);
 321         qemu_mutex_destroy(&comp_param[i].mutex);
 322         qemu_cond_destroy(&comp_param[i].cond);
 323     }
 324     qemu_mutex_destroy(&comp_done_lock);
 325     qemu_cond_destroy(&comp_done_cond);
 326     g_free(compress_threads);
 327     g_free(comp_param);
 328     compress_threads = NULL;
 329     comp_param = NULL;
 330 }
 331
 332 static void compress_threads_save_setup(void)
 333 {
 334     int i, thread_count;
 335
 336     if (!migrate_use_compression()) {
 337         return;
 338     }
 339     thread_count = migrate_compress_threads();
 340     compress_threads = g_new0(QemuThread, thread_count);
 341     comp_param = g_new0(CompressParam, thread_count);
 342     qemu_cond_init(&comp_done_cond);
 343     qemu_mutex_init(&comp_done_lock);
 344     for (i = 0; i < thread_count; i++) {
 345         /* comp_param[i].file is just used as a dummy buffer to save data,
 346          * set its ops to empty.
 347          */
 348         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 349         comp_param[i].done = true;
 350         comp_param[i].quit = false;
 351         qemu_mutex_init(&comp_param[i].mutex);
 352         qemu_cond_init(&comp_param[i].cond);
 353         qemu_thread_create(compress_threads + i, "compress",
 354                            do_data_compress, comp_param + i,
 355                            QEMU_THREAD_JOINABLE);
 356     }
 357 }
 358
 359 /**
 360  * save_page_header: write page header to wire
 361  *
 362  * If this is the 1st block, it also writes the block identification
 363  *
 364  * Returns the number of bytes written
 365  *
 366  * @f: QEMUFile where to send the data
 367  * @block: block that contains the page we want to send
 368  * @offset: offset inside the block for the page
 369  *          in the lower bits, it contains flags
 370  */
 371 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 372                                ram_addr_t offset)
 373 {
 374     size_t size, len;
 375
 376     if (block == rs->last_sent_block) {
 377         offset |= RAM_SAVE_FLAG_CONTINUE;
 378     }
 379     qemu_put_be64(f, offset);
 380     size = 8;
 381
 382     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 383         len = strlen(block->idstr);
 384         qemu_put_byte(f, len);
 385         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 386         size += 1 + len;
 387         rs->last_sent_block = block;
 388     }
 389     return size;
 390 }
 391
 392 /**
 393  * mig_throttle_guest_down: throotle down the guest
 394  *
 395  * Reduce amount of guest cpu execution to hopefully slow down memory
 396  * writes. If guest dirty memory rate is reduced below the rate at
 397  * which we can transfer pages to the destination then we should be
 398  * able to complete migration. Some workloads dirty memory way too
 399  * fast and will not effectively converge, even with auto-converge.
 400  */
 401 static void mig_throttle_guest_down(void)
 402 {
 403     MigrationState *s = migrate_get_current();
 404     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 405     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 406
 407     /* We have not started throttling yet. Let's start it. */
 408     if (!cpu_throttle_active()) {
 409         cpu_throttle_set(pct_initial);
 410     } else {
 411         /* Throttling already on, just increase the rate */
 412         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 413     }
 414 }
 415
 416 /**
 417  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 418  *
 419  * @rs: current RAM state
 420  * @current_addr: address for the zero page
 421  *
 422  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 423  * The important thing is that a stale (not-yet-0'd) page be replaced
 424  * by the new data.
 425  * As a bonus, if the page wasn't in the cache it gets added so that
 426  * when a small write is made into the 0'd page it gets XBZRLE sent.
 427  */
 428 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 429 {
 430     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 431         return;
 432     }
 433
 434     /* We don't care if this fails to allocate a new cache page
 435      * as long as it updated an old one */
 436     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 437                  ram_counters.dirty_sync_count);
 438 }
 439
/* Encoding byte written right after the header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1
 441
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that the page was not in the cache, or that xbzrle
 *             would be longer than normal; nothing was written and the
 *             caller has to send the page some other way
 *
 * On a -1 return, unless @last_stage is set, *@current_data may have been
 * redirected to the cached copy of the page.
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to diff against, so the page can't be encoded */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        /* The cache is left untouched during the last stage */
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        /* No difference from the cached copy: nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            /* Have the caller send the cached copy */
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* header + 1 byte encoding flag + 2 bytes length + payload */
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
 520
 521 /**
 522  * migration_bitmap_find_dirty: find the next dirty page from start
 523  *
 524  * Called with rcu_read_lock() to protect migration_bitmap
 525  *
 526  * Returns the byte offset within memory region of the start of a dirty page
 527  *
 528  * @rs: current RAM state
 529  * @rb: RAMBlock where to search for dirty pages
 530  * @start: page where we start the search
 531  */
 532 static inline
 533 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 534                                           unsigned long start)
 535 {
 536     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 537     unsigned long *bitmap = rb->bmap;
 538     unsigned long next;
 539
 540     if (rs->ram_bulk_stage && start > 0) {
 541         next = start + 1;
 542     } else {
 543         next = find_next_bit(bitmap, size, start);
 544     }
 545
 546     return next;
 547 }
 548
 549 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 550                                                 RAMBlock *rb,
 551                                                 unsigned long page)
 552 {
 553     bool ret;
 554
 555     ret = test_and_clear_bit(page, rb->bmap);
 556
 557     if (ret) {
 558         rs->migration_dirty_pages--;
 559     }
 560     return ret;
 561 }
 562
 563 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 564                                         ram_addr_t start, ram_addr_t length)
 565 {
 566     rs->migration_dirty_pages +=
 567         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 568                                               &rs->num_dirty_pages_period);
 569 }
 570
 571 /**
 572  * ram_pagesize_summary: calculate all the pagesizes of a VM
 573  *
 574  * Returns a summary bitmap of the page sizes of all RAMBlocks
 575  *
 576  * For VMs with just normal pages this is equivalent to the host page
 577  * size. If it's got some huge pages then it's the OR of all the
 578  * different page sizes.
 579  */
 580 uint64_t ram_pagesize_summary(void)
 581 {
 582     RAMBlock *block;
 583     uint64_t summary = 0;
 584
 585     RAMBLOCK_FOREACH(block) {
 586         summary |= block->page_size;
 587     }
 588
 589     return summary;
 590 }
 591
/*
 * migration_bitmap_sync: pull the dirty log into the migration bitmaps
 *
 * Bumps the sync counter, syncs the dirty bitmap of every RAMBlock and,
 * when more than a second has passed since the period started, refreshes
 * the period statistics (dirty page rate, XBZRLE cache miss rate) and, with
 * auto-converge enabled, decides whether to throttle the guest.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* First sync: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes exceed 50% of the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            /* Skip the rate update when no iteration happened in the period
               (this would be a division by zero) */
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
 662
 663 /**
 664  * save_zero_page: send the zero page to the stream
 665  *
 666  * Returns the number of pages written.
 667  *
 668  * @rs: current RAM state
 669  * @block: block that contains the page we want to send
 670  * @offset: offset inside the block for the page
 671  * @p: pointer to the page
 672  */
 673 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 674                           uint8_t *p)
 675 {
 676     int pages = -1;
 677
 678     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 679         ram_counters.duplicate++;
 680         ram_counters.transferred +=
 681             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 682         qemu_put_byte(rs->f, 0);
 683         ram_counters.transferred += 1;
 684         pages = 1;
 685     }
 686
 687     return pages;
 688 }
 689
 690 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 691 {
 692     if (!migrate_release_ram() || !migration_in_postcopy()) {
 693         return;
 694     }
 695
 696     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 697 }
 698
/**
 * ram_save_page: send the given page to the stream
 *
 * The page is first offered to ram_control_save_page().  If that hook
 * does not support it, the page is sent as a zero page, as an XBZRLE
 * delta, or as a normal full page, in that order of preference.
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: search status; pss->block and pss->page identify the page to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    /* In doubt sent page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(rs->f, block->offset,
                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* The hook dealt with the page; only the statistics remain */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        pages = save_zero_page(rs, block, offset, p);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(block->idstr, offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy() && migrate_use_xbzrle()) {
            /* May redirect p to the cached copy of the page */
            pages = save_xbzrle_page(rs, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
        }
        ram_counters.transferred += TARGET_PAGE_SIZE;
        pages = 1;
        ram_counters.normal++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
 788
 789 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 790                                 ram_addr_t offset)
 791 {
 792     RAMState *rs = ram_state;
 793     int bytes_sent, blen;
 794     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 795
 796     bytes_sent = save_page_header(rs, f, block, offset |
 797                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 798     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 799                                      migrate_compress_level());
 800     if (blen < 0) {
 801         bytes_sent = 0;
 802         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 803         error_report("compressed data failed!");
 804     } else {
 805         bytes_sent += blen;
 806         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 807     }
 808
 809     return bytes_sent;
 810 }
 811
 812 static void flush_compressed_data(RAMState *rs)
 813 {
 814     int idx, len, thread_count;
 815
 816     if (!migrate_use_compression()) {
 817         return;
 818     }
 819     thread_count = migrate_compress_threads();
 820
 821     qemu_mutex_lock(&comp_done_lock);
 822     for (idx = 0; idx < thread_count; idx++) {
 823         while (!comp_param[idx].done) {
 824             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 825         }
 826     }
 827     qemu_mutex_unlock(&comp_done_lock);
 828
 829     for (idx = 0; idx < thread_count; idx++) {
 830         qemu_mutex_lock(&comp_param[idx].mutex);
 831         if (!comp_param[idx].quit) {
 832             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 833             ram_counters.transferred += len;
 834         }
 835         qemu_mutex_unlock(&comp_param[idx].mutex);
 836     }
 837 }
 838
 839 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 840                                        ram_addr_t offset)
 841 {
 842     param->block = block;
 843     param->offset = offset;
 844 }
 845
 846 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 847                                            ram_addr_t offset)
 848 {
 849     int idx, thread_count, bytes_xmit = -1, pages = -1;
 850
 851     thread_count = migrate_compress_threads();
 852     qemu_mutex_lock(&comp_done_lock);
 853     while (true) {
 854         for (idx = 0; idx < thread_count; idx++) {
 855             if (comp_param[idx].done) {
 856                 comp_param[idx].done = false;
 857                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 858                 qemu_mutex_lock(&comp_param[idx].mutex);
 859                 set_compress_params(&comp_param[idx], block, offset);
 860                 qemu_cond_signal(&comp_param[idx].cond);
 861                 qemu_mutex_unlock(&comp_param[idx].mutex);
 862                 pages = 1;
 863                 ram_counters.normal++;
 864                 ram_counters.transferred += bytes_xmit;
 865                 break;
 866             }
 867         }
 868         if (pages > 0) {
 869             break;
 870         } else {
 871             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 872         }
 873     }
 874     qemu_mutex_unlock(&comp_done_lock);
 875
 876     return pages;
 877 }
 878
 879 /**
 880  * ram_save_compressed_page: compress the given page and send it to the stream
 881  *
 882  * Returns the number of pages written.
 883  *
 884  * @rs: current RAM state
 885  * @block: block that contains the page we want to send
 886  * @offset: offset inside the block for the page
 887  * @last_stage: if we are at the completion stage
 888  */
 889 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 890                                     bool last_stage)
 891 {
 892     int pages = -1;
 893     uint64_t bytes_xmit = 0;
 894     uint8_t *p;
 895     int ret, blen;
 896     RAMBlock *block = pss->block;
 897     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 898
 899     p = block->host + offset;
 900
 901     ret = ram_control_save_page(rs->f, block->offset,
 902                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 903     if (bytes_xmit) {
 904         ram_counters.transferred += bytes_xmit;
 905         pages = 1;
 906     }
 907     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 908         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 909             if (bytes_xmit > 0) {
 910                 ram_counters.normal++;
 911             } else if (bytes_xmit == 0) {
 912                 ram_counters.duplicate++;
 913             }
 914         }
 915     } else {
 916         /* When starting the process of a new block, the first page of
 917          * the block should be sent out before other pages in the same
 918          * block, and all the pages in last block should have been sent
 919          * out, keeping this order is important, because the 'cont' flag
 920          * is used to avoid resending the block name.
 921          */
 922         if (block != rs->last_sent_block) {
 923             flush_compressed_data(rs);
 924             pages = save_zero_page(rs, block, offset, p);
 925             if (pages == -1) {
 926                 /* Make sure the first page is sent out before other pages */
 927                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
 928                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
 929                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
 930                                                  migrate_compress_level());
 931                 if (blen > 0) {
 932                     ram_counters.transferred += bytes_xmit + blen;
 933                     ram_counters.normal++;
 934                     pages = 1;
 935                 } else {
 936                     qemu_file_set_error(rs->f, blen);
 937                     error_report("compressed data failed!");
 938                 }
 939             }
 940             if (pages > 0) {
 941                 ram_release_pages(block->idstr, offset, pages);
 942             }
 943         } else {
 944             pages = save_zero_page(rs, block, offset, p);
 945             if (pages == -1) {
 946                 pages = compress_page_with_multi_thread(rs, block, offset);
 947             } else {
 948                 ram_release_pages(block->idstr, offset, pages);
 949             }
 950         }
 951     }
 952
 953     return pages;
 954 }
 955
 956 /**
 957  * find_dirty_block: find the next dirty page and update any state
 958  * associated with the search process.
 959  *
 960  * Returns if a page is found
 961  *
 962  * @rs: current RAM state
 963  * @pss: data about the state of the current dirty page scan
 964  * @again: set to false if the search has scanned the whole of RAM
 965  */
 966 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
 967 {
 968     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
 969     if (pss->complete_round && pss->block == rs->last_seen_block &&
 970         pss->page >= rs->last_page) {
 971         /*
 972          * We've been once around the RAM and haven't found anything.
 973          * Give up.
 974          */
 975         *again = false;
 976         return false;
 977     }
 978     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
 979         /* Didn't find anything in this RAM Block */
 980         pss->page = 0;
 981         pss->block = QLIST_NEXT_RCU(pss->block, next);
 982         if (!pss->block) {
 983             /* Hit the end of the list */
 984             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
 985             /* Flag that we've looped */
 986             pss->complete_round = true;
 987             rs->ram_bulk_stage = false;
 988             if (migrate_use_xbzrle()) {
 989                 /* If xbzrle is on, stop using the data compression at this
 990                  * point. In theory, xbzrle can do better than compression.
 991                  */
 992                 flush_compressed_data(rs);
 993             }
 994         }
 995         /* Didn't find anything this time, but try again on the new block */
 996         *again = true;
 997         return false;
 998     } else {
 999         /* Can go around again, but... */
1000         *again = true;
1001         /* We've found something so probably don't need to */
1002         return true;
1003     }
1004 }
1005
1006 /**
1007  * unqueue_page: gets a page of the queue
1008  *
1009  * Helper for 'get_queued_page' - gets a page off the queue
1010  *
1011  * Returns the block of the page (or NULL if none available)
1012  *
1013  * @rs: current RAM state
1014  * @offset: used to return the offset within the RAMBlock
1015  */
1016 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1017 {
1018     RAMBlock *block = NULL;
1019
1020     qemu_mutex_lock(&rs->src_page_req_mutex);
1021     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1022         struct RAMSrcPageRequest *entry =
1023                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1024         block = entry->rb;
1025         *offset = entry->offset;
1026
1027         if (entry->len > TARGET_PAGE_SIZE) {
1028             entry->len -= TARGET_PAGE_SIZE;
1029             entry->offset += TARGET_PAGE_SIZE;
1030         } else {
1031             memory_region_unref(block->mr);
1032             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1033             g_free(entry);
1034         }
1035     }
1036     qemu_mutex_unlock(&rs->src_page_req_mutex);
1037
1038     return block;
1039 }
1040
1041 /**
1042  * get_queued_page: unqueue a page from the postocpy requests
1043  *
1044  * Skips pages that are already sent (!dirty)
1045  *
1046  * Returns if a queued page is found
1047  *
1048  * @rs: current RAM state
1049  * @pss: data about the state of the current dirty page scan
1050  */
1051 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1052 {
1053     RAMBlock  *block;
1054     ram_addr_t offset;
1055     bool dirty;
1056
1057     do {
1058         block = unqueue_page(rs, &offset);
1059         /*
1060          * We're sending this page, and since it's postcopy nothing else
1061          * will dirty it, and we must make sure it doesn't get sent again
1062          * even if this queue request was received after the background
1063          * search already sent it.
1064          */
1065         if (block) {
1066             unsigned long page;
1067
1068             page = offset >> TARGET_PAGE_BITS;
1069             dirty = test_bit(page, block->bmap);
1070             if (!dirty) {
1071                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1072                        page, test_bit(page, block->unsentmap));
1073             } else {
1074                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1075             }
1076         }
1077
1078     } while (block && !dirty);
1079
1080     if (block) {
1081         /*
1082          * As soon as we start servicing pages out of order, then we have
1083          * to kill the bulk stage, since the bulk stage assumes
1084          * in (migration_bitmap_find_and_reset_dirty) that every page is
1085          * dirty, that's no longer true.
1086          */
1087         rs->ram_bulk_stage = false;
1088
1089         /*
1090          * We want the background search to continue from the queued page
1091          * since the guest is likely to want other pages near to the page
1092          * it just requested.
1093          */
1094         pss->block = block;
1095         pss->page = offset >> TARGET_PAGE_BITS;
1096     }
1097
1098     return !!block;
1099 }
1100
1101 /**
1102  * migration_page_queue_free: drop any remaining pages in the ram
1103  * request queue
1104  *
1105  * It should be empty at the end anyway, but in error cases there may
1106  * be some left.  in case that there is any page left, we drop it.
1107  *
1108  */
1109 static void migration_page_queue_free(RAMState *rs)
1110 {
1111     struct RAMSrcPageRequest *mspr, *next_mspr;
1112     /* This queue generally should be empty - but in the case of a failed
1113      * migration might have some droppings in.
1114      */
1115     rcu_read_lock();
1116     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1117         memory_region_unref(mspr->rb->mr);
1118         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1119         g_free(mspr);
1120     }
1121     rcu_read_unlock();
1122 }
1123
1124 /**
1125  * ram_save_queue_pages: queue the page for transmission
1126  *
1127  * A request from postcopy destination for example.
1128  *
1129  * Returns zero on success or negative on error
1130  *
1131  * @rbname: Name of the RAMBLock of the request. NULL means the
1132  *          same that last one.
1133  * @start: starting address from the start of the RAMBlock
1134  * @len: length (in bytes) to send
1135  */
1136 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1137 {
1138     RAMBlock *ramblock;
1139     RAMState *rs = ram_state;
1140
1141     ram_counters.postcopy_requests++;
1142     rcu_read_lock();
1143     if (!rbname) {
1144         /* Reuse last RAMBlock */
1145         ramblock = rs->last_req_rb;
1146
1147         if (!ramblock) {
1148             /*
1149              * Shouldn't happen, we can't reuse the last RAMBlock if
1150              * it's the 1st request.
1151              */
1152             error_report("ram_save_queue_pages no previous block");
1153             goto err;
1154         }
1155     } else {
1156         ramblock = qemu_ram_block_by_name(rbname);
1157
1158         if (!ramblock) {
1159             /* We shouldn't be asked for a non-existent RAMBlock */
1160             error_report("ram_save_queue_pages no block '%s'", rbname);
1161             goto err;
1162         }
1163         rs->last_req_rb = ramblock;
1164     }
1165     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1166     if (start+len > ramblock->used_length) {
1167         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1168                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1169                      __func__, start, len, ramblock->used_length);
1170         goto err;
1171     }
1172
1173     struct RAMSrcPageRequest *new_entry =
1174         g_malloc0(sizeof(struct RAMSrcPageRequest));
1175     new_entry->rb = ramblock;
1176     new_entry->offset = start;
1177     new_entry->len = len;
1178
1179     memory_region_ref(ramblock->mr);
1180     qemu_mutex_lock(&rs->src_page_req_mutex);
1181     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1182     qemu_mutex_unlock(&rs->src_page_req_mutex);
1183     rcu_read_unlock();
1184
1185     return 0;
1186
1187 err:
1188     rcu_read_unlock();
1189     return -1;
1190 }
1191
1192 /**
1193  * ram_save_target_page: save one target page
1194  *
1195  * Returns the number of pages written
1196  *
1197  * @rs: current RAM state
1198  * @ms: current migration state
1199  * @pss: data about the page we want to send
1200  * @last_stage: if we are at the completion stage
1201  */
1202 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1203                                 bool last_stage)
1204 {
1205     int res = 0;
1206
1207     /* Check the pages is dirty and if it is send it */
1208     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1209         /*
1210          * If xbzrle is on, stop using the data compression after first
1211          * round of migration even if compression is enabled. In theory,
1212          * xbzrle can do better than compression.
1213          */
1214         if (migrate_use_compression() &&
1215             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1216             res = ram_save_compressed_page(rs, pss, last_stage);
1217         } else {
1218             res = ram_save_page(rs, pss, last_stage);
1219         }
1220
1221         if (res < 0) {
1222             return res;
1223         }
1224         if (pss->block->unsentmap) {
1225             clear_bit(pss->page, pss->block->unsentmap);
1226         }
1227     }
1228
1229     return res;
1230 }
1231
1232 /**
1233  * ram_save_host_page: save a whole host page
1234  *
1235  * Starting at *offset send pages up to the end of the current host
1236  * page. It's valid for the initial offset to point into the middle of
1237  * a host page in which case the remainder of the hostpage is sent.
1238  * Only dirty target pages are sent. Note that the host page size may
1239  * be a huge page for this block.
1240  * The saving stops at the boundary of the used_length of the block
1241  * if the RAMBlock isn't a multiple of the host page size.
1242  *
1243  * Returns the number of pages written or negative on error
1244  *
1245  * @rs: current RAM state
1246  * @ms: current migration state
1247  * @pss: data about the page we want to send
1248  * @last_stage: if we are at the completion stage
1249  */
1250 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1251                               bool last_stage)
1252 {
1253     int tmppages, pages = 0;
1254     size_t pagesize_bits =
1255         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1256
1257     do {
1258         tmppages = ram_save_target_page(rs, pss, last_stage);
1259         if (tmppages < 0) {
1260             return tmppages;
1261         }
1262
1263         pages += tmppages;
1264         pss->page++;
1265     } while ((pss->page & (pagesize_bits - 1)) &&
1266              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1267
1268     /* The offset we leave with is the last one we looked at */
1269     pss->page--;
1270     return pages;
1271 }
1272
1273 /**
1274  * ram_find_and_save_block: finds a dirty page and sends it to f
1275  *
1276  * Called within an RCU critical section.
1277  *
1278  * Returns the number of pages written where zero means no dirty pages
1279  *
1280  * @rs: current RAM state
1281  * @last_stage: if we are at the completion stage
1282  *
1283  * On systems where host-page-size > target-page-size it will send all the
1284  * pages in a host page that are dirty.
1285  */
1286
1287 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1288 {
1289     PageSearchStatus pss;
1290     int pages = 0;
1291     bool again, found;
1292
1293     /* No dirty page as there is zero RAM */
1294     if (!ram_bytes_total()) {
1295         return pages;
1296     }
1297
1298     pss.block = rs->last_seen_block;
1299     pss.page = rs->last_page;
1300     pss.complete_round = false;
1301
1302     if (!pss.block) {
1303         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1304     }
1305
1306     do {
1307         again = true;
1308         found = get_queued_page(rs, &pss);
1309
1310         if (!found) {
1311             /* priority queue empty, so just search for something dirty */
1312             found = find_dirty_block(rs, &pss, &again);
1313         }
1314
1315         if (found) {
1316             pages = ram_save_host_page(rs, &pss, last_stage);
1317         }
1318     } while (!pages && again);
1319
1320     rs->last_seen_block = pss.block;
1321     rs->last_page = pss.page;
1322
1323     return pages;
1324 }
1325
1326 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1327 {
1328     uint64_t pages = size / TARGET_PAGE_SIZE;
1329
1330     if (zero) {
1331         ram_counters.duplicate += pages;
1332     } else {
1333         ram_counters.normal += pages;
1334         ram_counters.transferred += size;
1335         qemu_update_position(f, size);
1336     }
1337 }
1338
1339 uint64_t ram_bytes_total(void)
1340 {
1341     RAMBlock *block;
1342     uint64_t total = 0;
1343
1344     rcu_read_lock();
1345     RAMBLOCK_FOREACH(block) {
1346         total += block->used_length;
1347     }
1348     rcu_read_unlock();
1349     return total;
1350 }
1351
1352 static void xbzrle_load_setup(void)
1353 {
1354     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1355 }
1356
1357 static void xbzrle_load_cleanup(void)
1358 {
1359     g_free(XBZRLE.decoded_buf);
1360     XBZRLE.decoded_buf = NULL;
1361 }
1362
1363 static void ram_save_cleanup(void *opaque)
1364 {
1365     RAMState **rsp = opaque;
1366     RAMBlock *block;
1367
1368     /* caller have hold iothread lock or is in a bh, so there is
1369      * no writing race against this migration_bitmap
1370      */
1371     memory_global_dirty_log_stop();
1372
1373     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1374         g_free(block->bmap);
1375         block->bmap = NULL;
1376         g_free(block->unsentmap);
1377         block->unsentmap = NULL;
1378     }
1379
1380     XBZRLE_cache_lock();
1381     if (XBZRLE.cache) {
1382         cache_fini(XBZRLE.cache);
1383         g_free(XBZRLE.encoded_buf);
1384         g_free(XBZRLE.current_buf);
1385         g_free(XBZRLE.zero_target_page);
1386         XBZRLE.cache = NULL;
1387         XBZRLE.encoded_buf = NULL;
1388         XBZRLE.current_buf = NULL;
1389         XBZRLE.zero_target_page = NULL;
1390     }
1391     XBZRLE_cache_unlock();
1392     migration_page_queue_free(*rsp);
1393     compress_threads_save_cleanup();
1394     g_free(*rsp);
1395     *rsp = NULL;
1396 }
1397
1398 static void ram_state_reset(RAMState *rs)
1399 {
1400     rs->last_seen_block = NULL;
1401     rs->last_sent_block = NULL;
1402     rs->last_page = 0;
1403     rs->last_version = ram_list.version;
1404     rs->ram_bulk_stage = true;
1405 }
1406
1407 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1408
/*
 * Dump a bitmap to stderr, up to 128 bits per line, printing '1' for a
 * set bit and '.' for a clear bit.  Each printed line is prefixed with
 * the index of its first bit in hex.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'todump' is the bitmap to dump and 'pages' the number of bits in it.
 * A NULL 'todump' is ignored: there is no implicit bitmap to fall back
 * to, so nothing is printed.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; avoid dereferencing a NULL bitmap below */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* Only lines containing an unexpected bit are printed */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1442
1443 /* **** functions for postcopy ***** */
1444
1445 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1446 {
1447     struct RAMBlock *block;
1448
1449     RAMBLOCK_FOREACH(block) {
1450         unsigned long *bitmap = block->bmap;
1451         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1452         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1453
1454         while (run_start < range) {
1455             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1456             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1457                               (run_end - run_start) << TARGET_PAGE_BITS);
1458             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1459         }
1460     }
1461 }
1462
1463 /**
1464  * postcopy_send_discard_bm_ram: discard a RAMBlock
1465  *
1466  * Returns zero on success
1467  *
1468  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1469  * Note: At this point the 'unsentmap' is the processed bitmap combined
1470  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1471  *
1472  * @ms: current migration state
1473  * @pds: state for postcopy
1474  * @start: RAMBlock starting page
1475  * @length: RAMBlock size
1476  */
1477 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1478                                         PostcopyDiscardState *pds,
1479                                         RAMBlock *block)
1480 {
1481     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1482     unsigned long current;
1483     unsigned long *unsentmap = block->unsentmap;
1484
1485     for (current = 0; current < end; ) {
1486         unsigned long one = find_next_bit(unsentmap, end, current);
1487
1488         if (one <= end) {
1489             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1490             unsigned long discard_length;
1491
1492             if (zero >= end) {
1493                 discard_length = end - one;
1494             } else {
1495                 discard_length = zero - one;
1496             }
1497             if (discard_length) {
1498                 postcopy_discard_send_range(ms, pds, one, discard_length);
1499             }
1500             current = one + discard_length;
1501         } else {
1502             current = one;
1503         }
1504     }
1505
1506     return 0;
1507 }
1508
1509 /**
1510  * postcopy_each_ram_send_discard: discard all RAMBlocks
1511  *
1512  * Returns 0 for success or negative for error
1513  *
1514  * Utility for the outgoing postcopy code.
1515  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1516  *   passing it bitmap indexes and name.
1517  * (qemu_ram_foreach_block ends up passing unscaled lengths
1518  *  which would mean postcopy code would have to deal with target page)
1519  *
1520  * @ms: current migration state
1521  */
1522 static int postcopy_each_ram_send_discard(MigrationState *ms)
1523 {
1524     struct RAMBlock *block;
1525     int ret;
1526
1527     RAMBLOCK_FOREACH(block) {
1528         PostcopyDiscardState *pds =
1529             postcopy_discard_send_init(ms, block->idstr);
1530
1531         /*
1532          * Postcopy sends chunks of bitmap over the wire, but it
1533          * just needs indexes at this point, avoids it having
1534          * target page specific code.
1535          */
1536         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1537         postcopy_discard_send_finish(ms, pds);
1538         if (ret) {
1539             return ret;
1540         }
1541     }
1542
1543     return 0;
1544 }
1545
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * The bitmap of the current pass is scanned for runs (of sent pages in
 * the unsent pass, of dirty pages otherwise).  Whenever a run starts or
 * ends in the middle of a host page, that whole host page is "fixed up":
 * every target page in it is marked unsent and dirty, and the global
 * dirty page count is increased for each page that was not dirty before.
 * Blocks whose page size equals the target page size need no work.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page (a clear bit in the unsentmap) */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        /* Index of the first target page of the host page to fix up */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap: cover every target page of the host page */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
1672
1673 /**
1674  * postcopy_chuck_hostpages: discrad any partially sent host page
1675  *
1676  * Utility for the outgoing postcopy code.
1677  *
1678  * Discard any partially sent host-page size chunks, mark any partially
1679  * dirty host-page size chunks as all dirty.  In this case the host-page
1680  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1681  *
1682  * Returns zero on success
1683  *
1684  * @ms: current migration state
1685  * @block: block we want to work with
1686  */
1687 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1688 {
1689     PostcopyDiscardState *pds =
1690         postcopy_discard_send_init(ms, block->idstr);
1691
1692     /* First pass: Discard all partially sent host pages */
1693     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1694     /*
1695      * Second pass: Ensure that all partially dirty host pages are made
1696      * fully dirty.
1697      */
1698     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1699
1700     postcopy_discard_send_finish(ms, pds);
1701     return 0;
1702 }
1703
1704 /**
1705  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1706  *
1707  * Returns zero on success
1708  *
1709  * Transmit the set of pages to be discarded after precopy to the target
1710  * these are pages that:
1711  *     a) Have been previously transmitted but are now dirty again
1712  *     b) Pages that have never been transmitted, this ensures that
1713  *        any pages on the destination that have been mapped by background
1714  *        tasks get discarded (transparent huge pages is the specific concern)
1715  * Hopefully this is pretty sparse
1716  *
1717  * @ms: current migration state
1718  */
1719 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1720 {
1721     RAMState *rs = ram_state;
1722     RAMBlock *block;
1723     int ret;
1724
1725     rcu_read_lock();
1726
1727     /* This should be our last sync, the src is now paused */
1728     migration_bitmap_sync(rs);
1729
1730     /* Easiest way to make sure we don't resume in the middle of a host-page */
1731     rs->last_seen_block = NULL;
1732     rs->last_sent_block = NULL;
1733     rs->last_page = 0;
1734
1735     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1736         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1737         unsigned long *bitmap = block->bmap;
1738         unsigned long *unsentmap = block->unsentmap;
1739
1740         if (!unsentmap) {
1741             /* We don't have a safe way to resize the sentmap, so
1742              * if the bitmap was resized it will be NULL at this
1743              * point.
1744              */
1745             error_report("migration ram resized during precopy phase");
1746             rcu_read_unlock();
1747             return -EINVAL;
1748         }
1749         /* Deal with TPS != HPS and huge pages */
1750         ret = postcopy_chunk_hostpages(ms, block);
1751         if (ret) {
1752             rcu_read_unlock();
1753             return ret;
1754         }
1755
1756         /*
1757          * Update the unsentmap to be unsentmap = unsentmap | dirty
1758          */
1759         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1760 #ifdef DEBUG_POSTCOPY
1761         ram_debug_dump_bitmap(unsentmap, true, pages);
1762 #endif
1763     }
1764     trace_ram_postcopy_send_discard_bitmap();
1765
1766     ret = postcopy_each_ram_send_discard(ms);
1767     rcu_read_unlock();
1768
1769     return ret;
1770 }
1771
1772 /**
1773  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1774  *
1775  * Returns zero on success
1776  *
1777  * @rbname: name of the RAMBlock of the request. NULL means the
1778  *          same that last one.
1779  * @start: RAMBlock starting page
1780  * @length: RAMBlock size
1781  */
1782 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1783 {
1784     int ret = -1;
1785
1786     trace_ram_discard_range(rbname, start, length);
1787
1788     rcu_read_lock();
1789     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1790
1791     if (!rb) {
1792         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1793         goto err;
1794     }
1795
1796     ret = ram_block_discard_range(rb, start, length);
1797
1798 err:
1799     rcu_read_unlock();
1800
1801     return ret;
1802 }
1803
1804 static int ram_state_init(RAMState **rsp)
1805 {
1806     *rsp = g_new0(RAMState, 1);
1807
1808     qemu_mutex_init(&(*rsp)->bitmap_mutex);
1809     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1810     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1811
1812     if (migrate_use_xbzrle()) {
1813         XBZRLE_cache_lock();
1814         XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1815         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1816                                   TARGET_PAGE_SIZE,
1817                                   TARGET_PAGE_SIZE);
1818         if (!XBZRLE.cache) {
1819             XBZRLE_cache_unlock();
1820             error_report("Error creating cache");
1821             g_free(*rsp);
1822             *rsp = NULL;
1823             return -1;
1824         }
1825         XBZRLE_cache_unlock();
1826
1827         /* We prefer not to abort if there is no memory */
1828         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1829         if (!XBZRLE.encoded_buf) {
1830             error_report("Error allocating encoded_buf");
1831             g_free(*rsp);
1832             *rsp = NULL;
1833             return -1;
1834         }
1835
1836         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1837         if (!XBZRLE.current_buf) {
1838             error_report("Error allocating current_buf");
1839             g_free(XBZRLE.encoded_buf);
1840             XBZRLE.encoded_buf = NULL;
1841             g_free(*rsp);
1842             *rsp = NULL;
1843             return -1;
1844         }
1845     }
1846
1847     /* For memory_global_dirty_log_start below.  */
1848     qemu_mutex_lock_iothread();
1849
1850     qemu_mutex_lock_ramlist();
1851     rcu_read_lock();
1852     ram_state_reset(*rsp);
1853
1854     /* Skip setting bitmap if there is no RAM */
1855     if (ram_bytes_total()) {
1856         RAMBlock *block;
1857
1858         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1859             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1860
1861             block->bmap = bitmap_new(pages);
1862             bitmap_set(block->bmap, 0, pages);
1863             if (migrate_postcopy_ram()) {
1864                 block->unsentmap = bitmap_new(pages);
1865                 bitmap_set(block->unsentmap, 0, pages);
1866             }
1867         }
1868     }
1869
1870     /*
1871      * Count the total number of pages used by ram blocks not including any
1872      * gaps due to alignment or unplugs.
1873      */
1874     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1875
1876     memory_global_dirty_log_start();
1877     migration_bitmap_sync(*rsp);
1878     qemu_mutex_unlock_ramlist();
1879     qemu_mutex_unlock_iothread();
1880     rcu_read_unlock();
1881
1882     return 0;
1883 }
1884
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
1891
1892 /**
1893  * ram_save_setup: Setup RAM for migration
1894  *
1895  * Returns zero to indicate success and negative for error
1896  *
1897  * @f: QEMUFile where to send the data
1898  * @opaque: RAMState pointer
1899  */
1900 static int ram_save_setup(QEMUFile *f, void *opaque)
1901 {
1902     RAMState **rsp = opaque;
1903     RAMBlock *block;
1904
1905     /* migration has already setup the bitmap, reuse it. */
1906     if (!migration_in_colo_state()) {
1907         if (ram_state_init(rsp) != 0) {
1908             return -1;
1909         }
1910     }
1911     (*rsp)->f = f;
1912
1913     rcu_read_lock();
1914
1915     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1916
1917     RAMBLOCK_FOREACH(block) {
1918         qemu_put_byte(f, strlen(block->idstr));
1919         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1920         qemu_put_be64(f, block->used_length);
1921         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1922             qemu_put_be64(f, block->page_size);
1923         }
1924     }
1925
1926     rcu_read_unlock();
1927     compress_threads_save_setup();
1928
1929     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1930     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1931
1932     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1933
1934     return 0;
1935 }
1936
1937 /**
1938  * ram_save_iterate: iterative stage for migration
1939  *
1940  * Returns zero to indicate success and negative for error
1941  *
1942  * @f: QEMUFile where to send the data
1943  * @opaque: RAMState pointer
1944  */
1945 static int ram_save_iterate(QEMUFile *f, void *opaque)
1946 {
1947     RAMState **temp = opaque;
1948     RAMState *rs = *temp;
1949     int ret;
1950     int i;
1951     int64_t t0;
1952     int done = 0;
1953
1954     rcu_read_lock();
1955     if (ram_list.version != rs->last_version) {
1956         ram_state_reset(rs);
1957     }
1958
1959     /* Read version before ram_list.blocks */
1960     smp_rmb();
1961
1962     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1963
1964     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1965     i = 0;
1966     while ((ret = qemu_file_rate_limit(f)) == 0) {
1967         int pages;
1968
1969         pages = ram_find_and_save_block(rs, false);
1970         /* no more pages to sent */
1971         if (pages == 0) {
1972             done = 1;
1973             break;
1974         }
1975         rs->iterations++;
1976
1977         /* we want to check in the 1st loop, just in case it was the 1st time
1978            and we had to sync the dirty bitmap.
1979            qemu_get_clock_ns() is a bit expensive, so we only check each some
1980            iterations
1981         */
1982         if ((i & 63) == 0) {
1983             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1984             if (t1 > MAX_WAIT) {
1985                 trace_ram_save_iterate_big_wait(t1, i);
1986                 break;
1987             }
1988         }
1989         i++;
1990     }
1991     flush_compressed_data(rs);
1992     rcu_read_unlock();
1993
1994     /*
1995      * Must occur before EOS (or any QEMUFile operation)
1996      * because of RDMA protocol.
1997      */
1998     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1999
2000     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2001     ram_counters.transferred += 8;
2002
2003     ret = qemu_file_get_error(f);
2004     if (ret < 0) {
2005         return ret;
2006     }
2007
2008     return done;
2009 }
2010
2011 /**
2012  * ram_save_complete: function called to send the remaining amount of ram
2013  *
2014  * Returns zero to indicate success
2015  *
2016  * Called with iothread lock
2017  *
2018  * @f: QEMUFile where to send the data
2019  * @opaque: RAMState pointer
2020  */
2021 static int ram_save_complete(QEMUFile *f, void *opaque)
2022 {
2023     RAMState **temp = opaque;
2024     RAMState *rs = *temp;
2025
2026     rcu_read_lock();
2027
2028     if (!migration_in_postcopy()) {
2029         migration_bitmap_sync(rs);
2030     }
2031
2032     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2033
2034     /* try transferring iterative blocks of memory */
2035
2036     /* flush all remaining blocks regardless of rate limiting */
2037     while (true) {
2038         int pages;
2039
2040         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2041         /* no more blocks to sent */
2042         if (pages == 0) {
2043             break;
2044         }
2045     }
2046
2047     flush_compressed_data(rs);
2048     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2049
2050     rcu_read_unlock();
2051
2052     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2053
2054     return 0;
2055 }
2056
2057 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2058                              uint64_t *non_postcopiable_pending,
2059                              uint64_t *postcopiable_pending)
2060 {
2061     RAMState **temp = opaque;
2062     RAMState *rs = *temp;
2063     uint64_t remaining_size;
2064
2065     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2066
2067     if (!migration_in_postcopy() &&
2068         remaining_size < max_size) {
2069         qemu_mutex_lock_iothread();
2070         rcu_read_lock();
2071         migration_bitmap_sync(rs);
2072         rcu_read_unlock();
2073         qemu_mutex_unlock_iothread();
2074         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2075     }
2076
2077     /* We can do postcopy, and all the data is postcopiable */
2078     *postcopiable_pending += remaining_size;
2079 }
2080
2081 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2082 {
2083     unsigned int xh_len;
2084     int xh_flags;
2085     uint8_t *loaded_data;
2086
2087     /* extract RLE header */
2088     xh_flags = qemu_get_byte(f);
2089     xh_len = qemu_get_be16(f);
2090
2091     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2092         error_report("Failed to load XBZRLE page - wrong compression!");
2093         return -1;
2094     }
2095
2096     if (xh_len > TARGET_PAGE_SIZE) {
2097         error_report("Failed to load XBZRLE page - len overflow!");
2098         return -1;
2099     }
2100     loaded_data = XBZRLE.decoded_buf;
2101     /* load data and decode */
2102     /* it can change loaded_data to point to an internal buffer */
2103     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2104
2105     /* decode RLE */
2106     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2107                              TARGET_PAGE_SIZE) == -1) {
2108         error_report("Failed to load XBZRLE page - decode error!");
2109         return -1;
2110     }
2111
2112     return 0;
2113 }
2114
2115 /**
2116  * ram_block_from_stream: read a RAMBlock id from the migration stream
2117  *
2118  * Must be called from within a rcu critical section.
2119  *
2120  * Returns a pointer from within the RCU-protected ram_list.
2121  *
2122  * @f: QEMUFile where to read the data from
2123  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2124  */
2125 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2126 {
2127     static RAMBlock *block = NULL;
2128     char id[256];
2129     uint8_t len;
2130
2131     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2132         if (!block) {
2133             error_report("Ack, bad migration stream!");
2134             return NULL;
2135         }
2136         return block;
2137     }
2138
2139     len = qemu_get_byte(f);
2140     qemu_get_buffer(f, (uint8_t *)id, len);
2141     id[len] = 0;
2142
2143     block = qemu_ram_block_by_name(id);
2144     if (!block) {
2145         error_report("Can't find block %s", id);
2146         return NULL;
2147     }
2148
2149     return block;
2150 }
2151
2152 static inline void *host_from_ram_block_offset(RAMBlock *block,
2153                                                ram_addr_t offset)
2154 {
2155     if (!offset_in_ramblock(block, offset)) {
2156         return NULL;
2157     }
2158
2159     return block->host + offset;
2160 }
2161
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fills @size bytes at @host with @ch.  The write is skipped when @ch is
 * zero and the range already reads as all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* A range that is already zero needs no write when filling with zero */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2178
2179 static void *do_data_decompress(void *opaque)
2180 {
2181     DecompressParam *param = opaque;
2182     unsigned long pagesize;
2183     uint8_t *des;
2184     int len;
2185
2186     qemu_mutex_lock(&param->mutex);
2187     while (!param->quit) {
2188         if (param->des) {
2189             des = param->des;
2190             len = param->len;
2191             param->des = 0;
2192             qemu_mutex_unlock(&param->mutex);
2193
2194             pagesize = TARGET_PAGE_SIZE;
2195             /* uncompress() will return failed in some case, especially
2196              * when the page is dirted when doing the compression, it's
2197              * not a problem because the dirty page will be retransferred
2198              * and uncompress() won't break the data in other pages.
2199              */
2200             uncompress((Bytef *)des, &pagesize,
2201                        (const Bytef *)param->compbuf, len);
2202
2203             qemu_mutex_lock(&decomp_done_lock);
2204             param->done = true;
2205             qemu_cond_signal(&decomp_done_cond);
2206             qemu_mutex_unlock(&decomp_done_lock);
2207
2208             qemu_mutex_lock(&param->mutex);
2209         } else {
2210             qemu_cond_wait(&param->cond, &param->mutex);
2211         }
2212     }
2213     qemu_mutex_unlock(&param->mutex);
2214
2215     return NULL;
2216 }
2217
2218 static void wait_for_decompress_done(void)
2219 {
2220     int idx, thread_count;
2221
2222     if (!migrate_use_compression()) {
2223         return;
2224     }
2225
2226     thread_count = migrate_decompress_threads();
2227     qemu_mutex_lock(&decomp_done_lock);
2228     for (idx = 0; idx < thread_count; idx++) {
2229         while (!decomp_param[idx].done) {
2230             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2231         }
2232     }
2233     qemu_mutex_unlock(&decomp_done_lock);
2234 }
2235
2236 static void compress_threads_load_setup(void)
2237 {
2238     int i, thread_count;
2239
2240     if (!migrate_use_compression()) {
2241         return;
2242     }
2243     thread_count = migrate_decompress_threads();
2244     decompress_threads = g_new0(QemuThread, thread_count);
2245     decomp_param = g_new0(DecompressParam, thread_count);
2246     qemu_mutex_init(&decomp_done_lock);
2247     qemu_cond_init(&decomp_done_cond);
2248     for (i = 0; i < thread_count; i++) {
2249         qemu_mutex_init(&decomp_param[i].mutex);
2250         qemu_cond_init(&decomp_param[i].cond);
2251         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2252         decomp_param[i].done = true;
2253         decomp_param[i].quit = false;
2254         qemu_thread_create(decompress_threads + i, "decompress",
2255                            do_data_decompress, decomp_param + i,
2256                            QEMU_THREAD_JOINABLE);
2257     }
2258 }
2259
2260 static void compress_threads_load_cleanup(void)
2261 {
2262     int i, thread_count;
2263
2264     if (!migrate_use_compression()) {
2265         return;
2266     }
2267     thread_count = migrate_decompress_threads();
2268     for (i = 0; i < thread_count; i++) {
2269         qemu_mutex_lock(&decomp_param[i].mutex);
2270         decomp_param[i].quit = true;
2271         qemu_cond_signal(&decomp_param[i].cond);
2272         qemu_mutex_unlock(&decomp_param[i].mutex);
2273     }
2274     for (i = 0; i < thread_count; i++) {
2275         qemu_thread_join(decompress_threads + i);
2276         qemu_mutex_destroy(&decomp_param[i].mutex);
2277         qemu_cond_destroy(&decomp_param[i].cond);
2278         g_free(decomp_param[i].compbuf);
2279     }
2280     g_free(decompress_threads);
2281     g_free(decomp_param);
2282     decompress_threads = NULL;
2283     decomp_param = NULL;
2284 }
2285
2286 static void decompress_data_with_multi_threads(QEMUFile *f,
2287                                                void *host, int len)
2288 {
2289     int idx, thread_count;
2290
2291     thread_count = migrate_decompress_threads();
2292     qemu_mutex_lock(&decomp_done_lock);
2293     while (true) {
2294         for (idx = 0; idx < thread_count; idx++) {
2295             if (decomp_param[idx].done) {
2296                 decomp_param[idx].done = false;
2297                 qemu_mutex_lock(&decomp_param[idx].mutex);
2298                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2299                 decomp_param[idx].des = host;
2300                 decomp_param[idx].len = len;
2301                 qemu_cond_signal(&decomp_param[idx].cond);
2302                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2303                 break;
2304             }
2305         }
2306         if (idx < thread_count) {
2307             break;
2308         } else {
2309             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2310         }
2311     }
2312     qemu_mutex_unlock(&decomp_done_lock);
2313 }
2314
2315 /**
2316  * ram_load_setup: Setup RAM for migration incoming side
2317  *
2318  * Returns zero to indicate success and negative for error
2319  *
2320  * @f: QEMUFile where to receive the data
2321  * @opaque: RAMState pointer
2322  */
2323 static int ram_load_setup(QEMUFile *f, void *opaque)
2324 {
2325     xbzrle_load_setup();
2326     compress_threads_load_setup();
2327     return 0;
2328 }
2329
/*
 * Incoming-side teardown: runs the XBZRLE load cleanup and then the
 * decompression thread cleanup.  Always returns zero; @opaque is not used.
 */
static int ram_load_cleanup(void *opaque)
{
    /* Same order as in ram_load_setup() */
    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    return 0;
}
2336
2337 /**
2338  * ram_postcopy_incoming_init: allocate postcopy data structures
2339  *
2340  * Returns 0 for success and negative if there was one error
2341  *
2342  * @mis: current migration incoming state
2343  *
2344  * Allocate data structures etc needed by incoming migration with
2345  * postcopy-ram. postcopy-ram's similarly names
2346  * postcopy_ram_incoming_init does the work.
2347  */
2348 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2349 {
2350     unsigned long ram_pages = last_ram_page();
2351
2352     return postcopy_ram_incoming_init(mis, ram_pages);
2353 }
2354
2355 /**
2356  * ram_load_postcopy: load a page in postcopy case
2357  *
2358  * Returns 0 for success or -errno in case of error
2359  *
2360  * Called in postcopy mode by ram_load().
2361  * rcu_read_lock is taken prior to this being called.
2362  *
2363  * @f: QEMUFile where to send the data
2364  */
2365 static int ram_load_postcopy(QEMUFile *f)
2366 {
2367     int flags = 0, ret = 0;
2368     bool place_needed = false;
2369     bool matching_page_sizes = false;
2370     MigrationIncomingState *mis = migration_incoming_get_current();
2371     /* Temporary page that is later 'placed' */
2372     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2373     void *last_host = NULL;
2374     bool all_zero = false;
2375
2376     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2377         ram_addr_t addr;
2378         void *host = NULL;
2379         void *page_buffer = NULL;
2380         void *place_source = NULL;
2381         RAMBlock *block = NULL;
2382         uint8_t ch;
2383
2384         addr = qemu_get_be64(f);
2385         flags = addr & ~TARGET_PAGE_MASK;
2386         addr &= TARGET_PAGE_MASK;
2387
2388         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2389         place_needed = false;
2390         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2391             block = ram_block_from_stream(f, flags);
2392
2393             host = host_from_ram_block_offset(block, addr);
2394             if (!host) {
2395                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2396                 ret = -EINVAL;
2397                 break;
2398             }
2399             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2400             /*
2401              * Postcopy requires that we place whole host pages atomically;
2402              * these may be huge pages for RAMBlocks that are backed by
2403              * hugetlbfs.
2404              * To make it atomic, the data is read into a temporary page
2405              * that's moved into place later.
2406              * The migration protocol uses,  possibly smaller, target-pages
2407              * however the source ensures it always sends all the components
2408              * of a host page in order.
2409              */
2410             page_buffer = postcopy_host_page +
2411                           ((uintptr_t)host & (block->page_size - 1));
2412             /* If all TP are zero then we can optimise the place */
2413             if (!((uintptr_t)host & (block->page_size - 1))) {
2414                 all_zero = true;
2415             } else {
2416                 /* not the 1st TP within the HP */
2417                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2418                     error_report("Non-sequential target page %p/%p",
2419                                   host, last_host);
2420                     ret = -EINVAL;
2421                     break;
2422                 }
2423             }
2424
2425
2426             /*
2427              * If it's the last part of a host page then we place the host
2428              * page
2429              */
2430             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2431                                      (block->page_size - 1)) == 0;
2432             place_source = postcopy_host_page;
2433         }
2434         last_host = host;
2435
2436         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2437         case RAM_SAVE_FLAG_ZERO:
2438             ch = qemu_get_byte(f);
2439             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2440             if (ch) {
2441                 all_zero = false;
2442             }
2443             break;
2444
2445         case RAM_SAVE_FLAG_PAGE:
2446             all_zero = false;
2447             if (!place_needed || !matching_page_sizes) {
2448                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2449             } else {
2450                 /* Avoids the qemu_file copy during postcopy, which is
2451                  * going to do a copy later; can only do it when we
2452                  * do this read in one go (matching page sizes)
2453                  */
2454                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2455                                          TARGET_PAGE_SIZE);
2456             }
2457             break;
2458         case RAM_SAVE_FLAG_EOS:
2459             /* normal exit */
2460             break;
2461         default:
2462             error_report("Unknown combination of migration flags: %#x"
2463                          " (postcopy mode)", flags);
2464             ret = -EINVAL;
2465         }
2466
2467         if (place_needed) {
2468             /* This gets called at the last target page in the host page */
2469             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2470
2471             if (all_zero) {
2472                 ret = postcopy_place_page_zero(mis, place_dest,
2473                                                block->page_size);
2474             } else {
2475                 ret = postcopy_place_page(mis, place_dest,
2476                                           place_source, block->page_size);
2477             }
2478         }
2479         if (!ret) {
2480             ret = qemu_file_get_error(f);
2481         }
2482     }
2483
2484     return ret;
2485 }
2486
2487 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2488 {
2489     int flags = 0, ret = 0, invalid_flags = 0;
2490     static uint64_t seq_iter;
2491     int len = 0;
2492     /*
2493      * If system is running in postcopy mode, page inserts to host memory must
2494      * be atomic
2495      */
2496     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2497     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2498     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2499
2500     seq_iter++;
2501
2502     if (version_id != 4) {
2503         ret = -EINVAL;
2504     }
2505
2506     if (!migrate_use_compression()) {
2507         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2508     }
2509     /* This RCU critical section can be very long running.
2510      * When RCU reclaims in the code start to become numerous,
2511      * it will be necessary to reduce the granularity of this
2512      * critical section.
2513      */
2514     rcu_read_lock();
2515
2516     if (postcopy_running) {
2517         ret = ram_load_postcopy(f);
2518     }
2519
2520     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2521         ram_addr_t addr, total_ram_bytes;
2522         void *host = NULL;
2523         uint8_t ch;
2524
2525         addr = qemu_get_be64(f);
2526         flags = addr & ~TARGET_PAGE_MASK;
2527         addr &= TARGET_PAGE_MASK;
2528
2529         if (flags & invalid_flags) {
2530             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2531                 error_report("Received an unexpected compressed page");
2532             }
2533
2534             ret = -EINVAL;
2535             break;
2536         }
2537
2538         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2539                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2540             RAMBlock *block = ram_block_from_stream(f, flags);
2541
2542             host = host_from_ram_block_offset(block, addr);
2543             if (!host) {
2544                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2545                 ret = -EINVAL;
2546                 break;
2547             }
2548             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2549         }
2550
2551         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2552         case RAM_SAVE_FLAG_MEM_SIZE:
2553             /* Synchronize RAM block list */
2554             total_ram_bytes = addr;
2555             while (!ret && total_ram_bytes) {
2556                 RAMBlock *block;
2557                 char id[256];
2558                 ram_addr_t length;
2559
2560                 len = qemu_get_byte(f);
2561                 qemu_get_buffer(f, (uint8_t *)id, len);
2562                 id[len] = 0;
2563                 length = qemu_get_be64(f);
2564
2565                 block = qemu_ram_block_by_name(id);
2566                 if (block) {
2567                     if (length != block->used_length) {
2568                         Error *local_err = NULL;
2569
2570                         ret = qemu_ram_resize(block, length,
2571                                               &local_err);
2572                         if (local_err) {
2573                             error_report_err(local_err);
2574                         }
2575                     }
2576                     /* For postcopy we need to check hugepage sizes match */
2577                     if (postcopy_advised &&
2578                         block->page_size != qemu_host_page_size) {
2579                         uint64_t remote_page_size = qemu_get_be64(f);
2580                         if (remote_page_size != block->page_size) {
2581                             error_report("Mismatched RAM page size %s "
2582                                          "(local) %zd != %" PRId64,
2583                                          id, block->page_size,
2584                                          remote_page_size);
2585                             ret = -EINVAL;
2586                         }
2587                     }
2588                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2589                                           block->idstr);
2590                 } else {
2591                     error_report("Unknown ramblock \"%s\", cannot "
2592                                  "accept migration", id);
2593                     ret = -EINVAL;
2594                 }
2595
2596                 total_ram_bytes -= length;
2597             }
2598             break;
2599
2600         case RAM_SAVE_FLAG_ZERO:
2601             ch = qemu_get_byte(f);
2602             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2603             break;
2604
2605         case RAM_SAVE_FLAG_PAGE:
2606             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2607             break;
2608
2609         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2610             len = qemu_get_be32(f);
2611             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2612                 error_report("Invalid compressed data length: %d", len);
2613                 ret = -EINVAL;
2614                 break;
2615             }
2616             decompress_data_with_multi_threads(f, host, len);
2617             break;
2618
2619         case RAM_SAVE_FLAG_XBZRLE:
2620             if (load_xbzrle(f, addr, host) < 0) {
2621                 error_report("Failed to decompress XBZRLE page at "
2622                              RAM_ADDR_FMT, addr);
2623                 ret = -EINVAL;
2624                 break;
2625             }
2626             break;
2627         case RAM_SAVE_FLAG_EOS:
2628             /* normal exit */
2629             break;
2630         default:
2631             if (flags & RAM_SAVE_FLAG_HOOK) {
2632                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2633             } else {
2634                 error_report("Unknown combination of migration flags: %#x",
2635                              flags);
2636                 ret = -EINVAL;
2637             }
2638         }
2639         if (!ret) {
2640             ret = qemu_file_get_error(f);
2641         }
2642     }
2643
2644     wait_for_decompress_done();
2645     rcu_read_unlock();
2646     trace_ram_load_complete(ret, seq_iter);
2647     return ret;
2648 }
2649
2650 static SaveVMHandlers savevm_ram_handlers = {
2651     .save_setup = ram_save_setup,
2652     .save_live_iterate = ram_save_iterate,
2653     .save_live_complete_postcopy = ram_save_complete,
2654     .save_live_complete_precopy = ram_save_complete,
2655     .save_live_pending = ram_save_pending,
2656     .load_state = ram_load,
2657     .save_cleanup = ram_save_cleanup,
2658     .load_setup = ram_load_setup,
2659     .load_cleanup = ram_load_cleanup,
2660 };
2661
2662 void ram_mig_init(void)
2663 {
2664     qemu_mutex_init(&XBZRLE.lock);
2665     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2666 }