migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "xbzrle.h"
  39 #include "ram.h"
  40 #include "migration/migration.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "migration/vmstate.h"
  44 #include "postcopy-ram.h"
  45 #include "exec/address-spaces.h"
  46 #include "migration/page_cache.h"
  47 #include "qemu/error-report.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52
  53 /***********************************************************/
  54 /* ram save/restore */
  55
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 *
 * These flags are OR-ed into the low bits of the page offset that
 * save_page_header() writes to the stream.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02 /* page is all zeroes (save_zero_page) */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* raw page contents follow */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* same block as the previous header */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* XBZRLE-encoded page follows */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* compressed page follows */
  71
  72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  73 {
  74     return buffer_is_zero(p, size);
  75 }
  76
/* XBZRLE statistics: save_xbzrle_page() updates cache_miss, overflow,
 * pages and bytes; migration_bitmap_sync() updates cache_miss_rate.
 */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (snapshot of the page being encoded) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  95
  96 static void XBZRLE_cache_lock(void)
  97 {
  98     if (migrate_use_xbzrle())
  99         qemu_mutex_lock(&XBZRLE.lock);
 100 }
 101
 102 static void XBZRLE_cache_unlock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_unlock(&XBZRLE.lock);
 106 }
 107
 108 /**
 109  * xbzrle_cache_resize: resize the xbzrle cache
 110  *
 111  * This function is called from qmp_migrate_set_cache_size in main
 112  * thread, possibly while a migration is in progress.  A running
 113  * migration may be using the cache and might finish during this call,
 114  * hence changes to the cache are protected by XBZRLE.lock().
 115  *
 116  * Returns the new_size or negative in case of error.
 117  *
 118  * @new_size: new cache size
 119  */
 120 int64_t xbzrle_cache_resize(int64_t new_size)
 121 {
 122     PageCache *new_cache;
 123     int64_t ret;
 124
 125     if (new_size < TARGET_PAGE_SIZE) {
 126         return -1;
 127     }
 128
 129     XBZRLE_cache_lock();
 130
 131     if (XBZRLE.cache != NULL) {
 132         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 133             goto out_new_size;
 134         }
 135         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 136                                         TARGET_PAGE_SIZE);
 137         if (!new_cache) {
 138             error_report("Error creating cache");
 139             ret = -1;
 140             goto out;
 141         }
 142
 143         cache_fini(XBZRLE.cache);
 144         XBZRLE.cache = new_cache;
 145     }
 146
 147 out_new_size:
 148     ret = pow2floor(new_size);
 149 out:
 150     XBZRLE_cache_unlock();
 151     return ret;
 152 }
 153
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* presumably the start of the requested range within rb -- confirm */
    hwaddr    offset;
    /* presumably the length of the requested range -- confirm */
    hwaddr    len;

    /* Linkage in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 165
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The single RAM migration state used by the functions in this file */
static RAMState *ram_state;
 208
 209 uint64_t ram_bytes_remaining(void)
 210 {
 211     return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
 212 }
 213
/* Global RAM migration counters (transferred, normal, duplicate,
 * dirty_sync_count, dirty_pages_rate are updated in this file).
 */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (page index; shifted by
     * TARGET_PAGE_BITS to obtain the byte offset inside block) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 226
/* Per-thread state of one compression worker (see do_data_compress) */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, once a page is compressed;
     * cleared by the migration thread when it hands out new work */
    bool done;
    /* Set under mutex to ask the worker to exit its loop */
    bool quit;
    /* Dummy QEMUFile the worker writes the compressed page into */
    QEMUFile *file;
    /* mutex/cond guard block, offset and quit and wake the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* Pending work item: non-NULL block means a page is waiting */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 237
/* Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are outside this part of the
 * file; the field notes below are inferred from the names -- confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 248
/* Arrays with one entry per compression thread; allocated in
 * migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the objects above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: used by do_data_compress() before its definition */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 267
 268 static void *do_data_compress(void *opaque)
 269 {
 270     CompressParam *param = opaque;
 271     RAMBlock *block;
 272     ram_addr_t offset;
 273
 274     qemu_mutex_lock(&param->mutex);
 275     while (!param->quit) {
 276         if (param->block) {
 277             block = param->block;
 278             offset = param->offset;
 279             param->block = NULL;
 280             qemu_mutex_unlock(&param->mutex);
 281
 282             do_compress_ram_page(param->file, block, offset);
 283
 284             qemu_mutex_lock(&comp_done_lock);
 285             param->done = true;
 286             qemu_cond_signal(&comp_done_cond);
 287             qemu_mutex_unlock(&comp_done_lock);
 288
 289             qemu_mutex_lock(&param->mutex);
 290         } else {
 291             qemu_cond_wait(&param->cond, &param->mutex);
 292         }
 293     }
 294     qemu_mutex_unlock(&param->mutex);
 295
 296     return NULL;
 297 }
 298
 299 static inline void terminate_compression_threads(void)
 300 {
 301     int idx, thread_count;
 302
 303     thread_count = migrate_compress_threads();
 304
 305     for (idx = 0; idx < thread_count; idx++) {
 306         qemu_mutex_lock(&comp_param[idx].mutex);
 307         comp_param[idx].quit = true;
 308         qemu_cond_signal(&comp_param[idx].cond);
 309         qemu_mutex_unlock(&comp_param[idx].mutex);
 310     }
 311 }
 312
 313 void migrate_compress_threads_join(void)
 314 {
 315     int i, thread_count;
 316
 317     if (!migrate_use_compression()) {
 318         return;
 319     }
 320     terminate_compression_threads();
 321     thread_count = migrate_compress_threads();
 322     for (i = 0; i < thread_count; i++) {
 323         qemu_thread_join(compress_threads + i);
 324         qemu_fclose(comp_param[i].file);
 325         qemu_mutex_destroy(&comp_param[i].mutex);
 326         qemu_cond_destroy(&comp_param[i].cond);
 327     }
 328     qemu_mutex_destroy(&comp_done_lock);
 329     qemu_cond_destroy(&comp_done_cond);
 330     g_free(compress_threads);
 331     g_free(comp_param);
 332     compress_threads = NULL;
 333     comp_param = NULL;
 334 }
 335
 336 void migrate_compress_threads_create(void)
 337 {
 338     int i, thread_count;
 339
 340     if (!migrate_use_compression()) {
 341         return;
 342     }
 343     thread_count = migrate_compress_threads();
 344     compress_threads = g_new0(QemuThread, thread_count);
 345     comp_param = g_new0(CompressParam, thread_count);
 346     qemu_cond_init(&comp_done_cond);
 347     qemu_mutex_init(&comp_done_lock);
 348     for (i = 0; i < thread_count; i++) {
 349         /* comp_param[i].file is just used as a dummy buffer to save data,
 350          * set its ops to empty.
 351          */
 352         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 353         comp_param[i].done = true;
 354         comp_param[i].quit = false;
 355         qemu_mutex_init(&comp_param[i].mutex);
 356         qemu_cond_init(&comp_param[i].cond);
 357         qemu_thread_create(compress_threads + i, "compress",
 358                            do_data_compress, comp_param + i,
 359                            QEMU_THREAD_JOINABLE);
 360     }
 361 }
 362
 363 /**
 364  * save_page_header: write page header to wire
 365  *
 366  * If this is the 1st block, it also writes the block identification
 367  *
 368  * Returns the number of bytes written
 369  *
 370  * @f: QEMUFile where to send the data
 371  * @block: block that contains the page we want to send
 372  * @offset: offset inside the block for the page
 373  *          in the lower bits, it contains flags
 374  */
 375 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 376                                ram_addr_t offset)
 377 {
 378     size_t size, len;
 379
 380     if (block == rs->last_sent_block) {
 381         offset |= RAM_SAVE_FLAG_CONTINUE;
 382     }
 383     qemu_put_be64(f, offset);
 384     size = 8;
 385
 386     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 387         len = strlen(block->idstr);
 388         qemu_put_byte(f, len);
 389         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 390         size += 1 + len;
 391         rs->last_sent_block = block;
 392     }
 393     return size;
 394 }
 395
 396 /**
 397  * mig_throttle_guest_down: throotle down the guest
 398  *
 399  * Reduce amount of guest cpu execution to hopefully slow down memory
 400  * writes. If guest dirty memory rate is reduced below the rate at
 401  * which we can transfer pages to the destination then we should be
 402  * able to complete migration. Some workloads dirty memory way too
 403  * fast and will not effectively converge, even with auto-converge.
 404  */
 405 static void mig_throttle_guest_down(void)
 406 {
 407     MigrationState *s = migrate_get_current();
 408     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 409     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 410
 411     /* We have not started throttling yet. Let's start it. */
 412     if (!cpu_throttle_active()) {
 413         cpu_throttle_set(pct_initial);
 414     } else {
 415         /* Throttling already on, just increase the rate */
 416         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 417     }
 418 }
 419
 420 /**
 421  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 422  *
 423  * @rs: current RAM state
 424  * @current_addr: address for the zero page
 425  *
 426  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 427  * The important thing is that a stale (not-yet-0'd) page be replaced
 428  * by the new data.
 429  * As a bonus, if the page wasn't in the cache it gets added so that
 430  * when a small write is made into the 0'd page it gets XBZRLE sent.
 431  */
 432 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 433 {
 434     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 435         return;
 436     }
 437
 438     /* We don't care if this fails to allocate a new cache page
 439      * as long as it updated an old one */
 440     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 441                  ram_counters.dirty_sync_count);
 442 }
 443
 444 #define ENCODING_FLAG_XBZRLE 0x1
 445
 446 /**
 447  * save_xbzrle_page: compress and send current page
 448  *
 449  * Returns: 1 means that we wrote the page
 450  *          0 means that page is identical to the one already sent
 451  *          -1 means that xbzrle would be longer than normal
 452  *
 453  * @rs: current RAM state
 454  * @current_data: pointer to the address of the page contents
 455  * @current_addr: addr of the page
 456  * @block: block that contains the page we want to send
 457  * @offset: offset inside the block for the page
 458  * @last_stage: if we are at the completion stage
 459  */
 460 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 461                             ram_addr_t current_addr, RAMBlock *block,
 462                             ram_addr_t offset, bool last_stage)
 463 {
 464     int encoded_len = 0, bytes_xbzrle;
 465     uint8_t *prev_cached_page;
 466
 467     if (!cache_is_cached(XBZRLE.cache, current_addr,
 468                          ram_counters.dirty_sync_count)) {
 469         xbzrle_counters.cache_miss++;
 470         if (!last_stage) {
 471             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 472                              ram_counters.dirty_sync_count) == -1) {
 473                 return -1;
 474             } else {
 475                 /* update *current_data when the page has been
 476                    inserted into cache */
 477                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 478             }
 479         }
 480         return -1;
 481     }
 482
 483     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 484
 485     /* save current buffer into memory */
 486     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 487
 488     /* XBZRLE encoding (if there is no overflow) */
 489     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 490                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 491                                        TARGET_PAGE_SIZE);
 492     if (encoded_len == 0) {
 493         trace_save_xbzrle_page_skipping();
 494         return 0;
 495     } else if (encoded_len == -1) {
 496         trace_save_xbzrle_page_overflow();
 497         xbzrle_counters.overflow++;
 498         /* update data in the cache */
 499         if (!last_stage) {
 500             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 501             *current_data = prev_cached_page;
 502         }
 503         return -1;
 504     }
 505
 506     /* we need to update the data in the cache, in order to get the same data */
 507     if (!last_stage) {
 508         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 509     }
 510
 511     /* Send XBZRLE based compressed page */
 512     bytes_xbzrle = save_page_header(rs, rs->f, block,
 513                                     offset | RAM_SAVE_FLAG_XBZRLE);
 514     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 515     qemu_put_be16(rs->f, encoded_len);
 516     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 517     bytes_xbzrle += encoded_len + 1 + 2;
 518     xbzrle_counters.pages++;
 519     xbzrle_counters.bytes += bytes_xbzrle;
 520     ram_counters.transferred += bytes_xbzrle;
 521
 522     return 1;
 523 }
 524
 525 /**
 526  * migration_bitmap_find_dirty: find the next dirty page from start
 527  *
 528  * Called with rcu_read_lock() to protect migration_bitmap
 529  *
 530  * Returns the byte offset within memory region of the start of a dirty page
 531  *
 532  * @rs: current RAM state
 533  * @rb: RAMBlock where to search for dirty pages
 534  * @start: page where we start the search
 535  */
 536 static inline
 537 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 538                                           unsigned long start)
 539 {
 540     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 541     unsigned long *bitmap = rb->bmap;
 542     unsigned long next;
 543
 544     if (rs->ram_bulk_stage && start > 0) {
 545         next = start + 1;
 546     } else {
 547         next = find_next_bit(bitmap, size, start);
 548     }
 549
 550     return next;
 551 }
 552
 553 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 554                                                 RAMBlock *rb,
 555                                                 unsigned long page)
 556 {
 557     bool ret;
 558
 559     ret = test_and_clear_bit(page, rb->bmap);
 560
 561     if (ret) {
 562         rs->migration_dirty_pages--;
 563     }
 564     return ret;
 565 }
 566
 567 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 568                                         ram_addr_t start, ram_addr_t length)
 569 {
 570     rs->migration_dirty_pages +=
 571         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 572                                               &rs->num_dirty_pages_period);
 573 }
 574
 575 /**
 576  * ram_pagesize_summary: calculate all the pagesizes of a VM
 577  *
 578  * Returns a summary bitmap of the page sizes of all RAMBlocks
 579  *
 580  * For VMs with just normal pages this is equivalent to the host page
 581  * size. If it's got some huge pages then it's the OR of all the
 582  * different page sizes.
 583  */
 584 uint64_t ram_pagesize_summary(void)
 585 {
 586     RAMBlock *block;
 587     uint64_t summary = 0;
 588
 589     RAMBLOCK_FOREACH(block) {
 590         summary |= block->page_size;
 591     }
 592
 593     return summary;
 594 }
 595
 596 static void migration_bitmap_sync(RAMState *rs)
 597 {
 598     RAMBlock *block;
 599     int64_t end_time;
 600     uint64_t bytes_xfer_now;
 601
 602     ram_counters.dirty_sync_count++;
 603
 604     if (!rs->time_last_bitmap_sync) {
 605         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 606     }
 607
 608     trace_migration_bitmap_sync_start();
 609     memory_global_dirty_log_sync();
 610
 611     qemu_mutex_lock(&rs->bitmap_mutex);
 612     rcu_read_lock();
 613     RAMBLOCK_FOREACH(block) {
 614         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 615     }
 616     rcu_read_unlock();
 617     qemu_mutex_unlock(&rs->bitmap_mutex);
 618
 619     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 620
 621     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 622
 623     /* more than 1 second = 1000 millisecons */
 624     if (end_time > rs->time_last_bitmap_sync + 1000) {
 625         /* calculate period counters */
 626         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 627             / (end_time - rs->time_last_bitmap_sync);
 628         bytes_xfer_now = ram_counters.transferred;
 629
 630         if (migrate_auto_converge()) {
 631             /* The following detection logic can be refined later. For now:
 632                Check to see if the dirtied bytes is 50% more than the approx.
 633                amount of bytes that just got transferred since the last time we
 634                were in this routine. If that happens twice, start or increase
 635                throttling */
 636
 637             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 638                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 639                 (++rs->dirty_rate_high_cnt >= 2)) {
 640                     trace_migration_throttle();
 641                     rs->dirty_rate_high_cnt = 0;
 642                     mig_throttle_guest_down();
 643             }
 644         }
 645
 646         if (migrate_use_xbzrle()) {
 647             if (rs->iterations_prev != rs->iterations) {
 648                 xbzrle_counters.cache_miss_rate =
 649                    (double)(xbzrle_counters.cache_miss -
 650                             rs->xbzrle_cache_miss_prev) /
 651                    (rs->iterations - rs->iterations_prev);
 652             }
 653             rs->iterations_prev = rs->iterations;
 654             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 655         }
 656
 657         /* reset period counters */
 658         rs->time_last_bitmap_sync = end_time;
 659         rs->num_dirty_pages_period = 0;
 660         rs->bytes_xfer_prev = bytes_xfer_now;
 661     }
 662     if (migrate_use_events()) {
 663         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 664     }
 665 }
 666
 667 /**
 668  * save_zero_page: send the zero page to the stream
 669  *
 670  * Returns the number of pages written.
 671  *
 672  * @rs: current RAM state
 673  * @block: block that contains the page we want to send
 674  * @offset: offset inside the block for the page
 675  * @p: pointer to the page
 676  */
 677 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 678                           uint8_t *p)
 679 {
 680     int pages = -1;
 681
 682     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 683         ram_counters.duplicate++;
 684         ram_counters.transferred +=
 685             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 686         qemu_put_byte(rs->f, 0);
 687         ram_counters.transferred += 1;
 688         pages = 1;
 689     }
 690
 691     return pages;
 692 }
 693
 694 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 695 {
 696     if (!migrate_release_ram() || !migration_in_postcopy()) {
 697         return;
 698     }
 699
 700     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 701 }
 702
 703 /**
 704  * ram_save_page: send the given page to the stream
 705  *
 706  * Returns the number of pages written.
 707  *          < 0 - error
 708  *          >=0 - Number of pages written - this might legally be 0
 709  *                if xbzrle noticed the page was the same.
 710  *
 711  * @rs: current RAM state
 712  * @block: block that contains the page we want to send
 713  * @offset: offset inside the block for the page
 714  * @last_stage: if we are at the completion stage
 715  */
 716 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 717 {
 718     int pages = -1;
 719     uint64_t bytes_xmit;
 720     ram_addr_t current_addr;
 721     uint8_t *p;
 722     int ret;
 723     bool send_async = true;
 724     RAMBlock *block = pss->block;
 725     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 726
 727     p = block->host + offset;
 728     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 729
 730     /* In doubt sent page as normal */
 731     bytes_xmit = 0;
 732     ret = ram_control_save_page(rs->f, block->offset,
 733                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 734     if (bytes_xmit) {
 735         ram_counters.transferred += bytes_xmit;
 736         pages = 1;
 737     }
 738
 739     XBZRLE_cache_lock();
 740
 741     current_addr = block->offset + offset;
 742
 743     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 744         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 745             if (bytes_xmit > 0) {
 746                 ram_counters.normal++;
 747             } else if (bytes_xmit == 0) {
 748                 ram_counters.duplicate++;
 749             }
 750         }
 751     } else {
 752         pages = save_zero_page(rs, block, offset, p);
 753         if (pages > 0) {
 754             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 755              * page would be stale
 756              */
 757             xbzrle_cache_zero_page(rs, current_addr);
 758             ram_release_pages(block->idstr, offset, pages);
 759         } else if (!rs->ram_bulk_stage &&
 760                    !migration_in_postcopy() && migrate_use_xbzrle()) {
 761             pages = save_xbzrle_page(rs, &p, current_addr, block,
 762                                      offset, last_stage);
 763             if (!last_stage) {
 764                 /* Can't send this cached data async, since the cache page
 765                  * might get updated before it gets to the wire
 766                  */
 767                 send_async = false;
 768             }
 769         }
 770     }
 771
 772     /* XBZRLE overflow or normal page */
 773     if (pages == -1) {
 774         ram_counters.transferred +=
 775             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
 776         if (send_async) {
 777             qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 778                                   migrate_release_ram() &
 779                                   migration_in_postcopy());
 780         } else {
 781             qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 782         }
 783         ram_counters.transferred += TARGET_PAGE_SIZE;
 784         pages = 1;
 785         ram_counters.normal++;
 786     }
 787
 788     XBZRLE_cache_unlock();
 789
 790     return pages;
 791 }
 792
 793 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 794                                 ram_addr_t offset)
 795 {
 796     RAMState *rs = ram_state;
 797     int bytes_sent, blen;
 798     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 799
 800     bytes_sent = save_page_header(rs, f, block, offset |
 801                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 802     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 803                                      migrate_compress_level());
 804     if (blen < 0) {
 805         bytes_sent = 0;
 806         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 807         error_report("compressed data failed!");
 808     } else {
 809         bytes_sent += blen;
 810         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 811     }
 812
 813     return bytes_sent;
 814 }
 815
 816 static void flush_compressed_data(RAMState *rs)
 817 {
 818     int idx, len, thread_count;
 819
 820     if (!migrate_use_compression()) {
 821         return;
 822     }
 823     thread_count = migrate_compress_threads();
 824
 825     qemu_mutex_lock(&comp_done_lock);
 826     for (idx = 0; idx < thread_count; idx++) {
 827         while (!comp_param[idx].done) {
 828             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 829         }
 830     }
 831     qemu_mutex_unlock(&comp_done_lock);
 832
 833     for (idx = 0; idx < thread_count; idx++) {
 834         qemu_mutex_lock(&comp_param[idx].mutex);
 835         if (!comp_param[idx].quit) {
 836             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 837             ram_counters.transferred += len;
 838         }
 839         qemu_mutex_unlock(&comp_param[idx].mutex);
 840     }
 841 }
 842
 843 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 844                                        ram_addr_t offset)
 845 {
 846     param->block = block;
 847     param->offset = offset;
 848 }
 849
 850 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 851                                            ram_addr_t offset)
 852 {
 853     int idx, thread_count, bytes_xmit = -1, pages = -1;
 854
 855     thread_count = migrate_compress_threads();
 856     qemu_mutex_lock(&comp_done_lock);
 857     while (true) {
 858         for (idx = 0; idx < thread_count; idx++) {
 859             if (comp_param[idx].done) {
 860                 comp_param[idx].done = false;
 861                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 862                 qemu_mutex_lock(&comp_param[idx].mutex);
 863                 set_compress_params(&comp_param[idx], block, offset);
 864                 qemu_cond_signal(&comp_param[idx].cond);
 865                 qemu_mutex_unlock(&comp_param[idx].mutex);
 866                 pages = 1;
 867                 ram_counters.normal++;
 868                 ram_counters.transferred += bytes_xmit;
 869                 break;
 870             }
 871         }
 872         if (pages > 0) {
 873             break;
 874         } else {
 875             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 876         }
 877     }
 878     qemu_mutex_unlock(&comp_done_lock);
 879
 880     return pages;
 881 }
 882
 883 /**
 884  * ram_save_compressed_page: compress the given page and send it to the stream
 885  *
 886  * Returns the number of pages written.
 887  *
 888  * @rs: current RAM state
 889  * @block: block that contains the page we want to send
 890  * @offset: offset inside the block for the page
 891  * @last_stage: if we are at the completion stage
 892  */
 893 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 894                                     bool last_stage)
 895 {
 896     int pages = -1;
 897     uint64_t bytes_xmit = 0;
 898     uint8_t *p;
 899     int ret, blen;
 900     RAMBlock *block = pss->block;
 901     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 902
 903     p = block->host + offset;
 904
 905     ret = ram_control_save_page(rs->f, block->offset,
 906                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 907     if (bytes_xmit) {
 908         ram_counters.transferred += bytes_xmit;
 909         pages = 1;
 910     }
 911     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 912         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 913             if (bytes_xmit > 0) {
 914                 ram_counters.normal++;
 915             } else if (bytes_xmit == 0) {
 916                 ram_counters.duplicate++;
 917             }
 918         }
 919     } else {
 920         /* When starting the process of a new block, the first page of
 921          * the block should be sent out before other pages in the same
 922          * block, and all the pages in last block should have been sent
 923          * out, keeping this order is important, because the 'cont' flag
 924          * is used to avoid resending the block name.
 925          */
 926         if (block != rs->last_sent_block) {
 927             flush_compressed_data(rs);
 928             pages = save_zero_page(rs, block, offset, p);
 929             if (pages == -1) {
 930                 /* Make sure the first page is sent out before other pages */
 931                 bytes_xmit = save_page_header(rs, rs->f, block, offset |
 932                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
 933                 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
 934                                                  migrate_compress_level());
 935                 if (blen > 0) {
 936                     ram_counters.transferred += bytes_xmit + blen;
 937                     ram_counters.normal++;
 938                     pages = 1;
 939                 } else {
 940                     qemu_file_set_error(rs->f, blen);
 941                     error_report("compressed data failed!");
 942                 }
 943             }
 944             if (pages > 0) {
 945                 ram_release_pages(block->idstr, offset, pages);
 946             }
 947         } else {
 948             pages = save_zero_page(rs, block, offset, p);
 949             if (pages == -1) {
 950                 pages = compress_page_with_multi_thread(rs, block, offset);
 951             } else {
 952                 ram_release_pages(block->idstr, offset, pages);
 953             }
 954         }
 955     }
 956
 957     return pages;
 958 }
 959
 960 /**
 961  * find_dirty_block: find the next dirty page and update any state
 962  * associated with the search process.
 963  *
 964  * Returns if a page is found
 965  *
 966  * @rs: current RAM state
 967  * @pss: data about the state of the current dirty page scan
 968  * @again: set to false if the search has scanned the whole of RAM
 969  */
 970 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
 971 {
 972     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
 973     if (pss->complete_round && pss->block == rs->last_seen_block &&
 974         pss->page >= rs->last_page) {
 975         /*
 976          * We've been once around the RAM and haven't found anything.
 977          * Give up.
 978          */
 979         *again = false;
 980         return false;
 981     }
 982     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
 983         /* Didn't find anything in this RAM Block */
 984         pss->page = 0;
 985         pss->block = QLIST_NEXT_RCU(pss->block, next);
 986         if (!pss->block) {
 987             /* Hit the end of the list */
 988             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
 989             /* Flag that we've looped */
 990             pss->complete_round = true;
 991             rs->ram_bulk_stage = false;
 992             if (migrate_use_xbzrle()) {
 993                 /* If xbzrle is on, stop using the data compression at this
 994                  * point. In theory, xbzrle can do better than compression.
 995                  */
 996                 flush_compressed_data(rs);
 997             }
 998         }
 999         /* Didn't find anything this time, but try again on the new block */
1000         *again = true;
1001         return false;
1002     } else {
1003         /* Can go around again, but... */
1004         *again = true;
1005         /* We've found something so probably don't need to */
1006         return true;
1007     }
1008 }
1009
1010 /**
1011  * unqueue_page: gets a page of the queue
1012  *
1013  * Helper for 'get_queued_page' - gets a page off the queue
1014  *
1015  * Returns the block of the page (or NULL if none available)
1016  *
1017  * @rs: current RAM state
1018  * @offset: used to return the offset within the RAMBlock
1019  */
1020 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1021 {
1022     RAMBlock *block = NULL;
1023
1024     qemu_mutex_lock(&rs->src_page_req_mutex);
1025     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1026         struct RAMSrcPageRequest *entry =
1027                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1028         block = entry->rb;
1029         *offset = entry->offset;
1030
1031         if (entry->len > TARGET_PAGE_SIZE) {
1032             entry->len -= TARGET_PAGE_SIZE;
1033             entry->offset += TARGET_PAGE_SIZE;
1034         } else {
1035             memory_region_unref(block->mr);
1036             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1037             g_free(entry);
1038         }
1039     }
1040     qemu_mutex_unlock(&rs->src_page_req_mutex);
1041
1042     return block;
1043 }
1044
1045 /**
1046  * get_queued_page: unqueue a page from the postocpy requests
1047  *
1048  * Skips pages that are already sent (!dirty)
1049  *
1050  * Returns if a queued page is found
1051  *
1052  * @rs: current RAM state
1053  * @pss: data about the state of the current dirty page scan
1054  */
1055 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1056 {
1057     RAMBlock  *block;
1058     ram_addr_t offset;
1059     bool dirty;
1060
1061     do {
1062         block = unqueue_page(rs, &offset);
1063         /*
1064          * We're sending this page, and since it's postcopy nothing else
1065          * will dirty it, and we must make sure it doesn't get sent again
1066          * even if this queue request was received after the background
1067          * search already sent it.
1068          */
1069         if (block) {
1070             unsigned long page;
1071
1072             page = offset >> TARGET_PAGE_BITS;
1073             dirty = test_bit(page, block->bmap);
1074             if (!dirty) {
1075                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1076                        page, test_bit(page, block->unsentmap));
1077             } else {
1078                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1079             }
1080         }
1081
1082     } while (block && !dirty);
1083
1084     if (block) {
1085         /*
1086          * As soon as we start servicing pages out of order, then we have
1087          * to kill the bulk stage, since the bulk stage assumes
1088          * in (migration_bitmap_find_and_reset_dirty) that every page is
1089          * dirty, that's no longer true.
1090          */
1091         rs->ram_bulk_stage = false;
1092
1093         /*
1094          * We want the background search to continue from the queued page
1095          * since the guest is likely to want other pages near to the page
1096          * it just requested.
1097          */
1098         pss->block = block;
1099         pss->page = offset >> TARGET_PAGE_BITS;
1100     }
1101
1102     return !!block;
1103 }
1104
1105 /**
1106  * migration_page_queue_free: drop any remaining pages in the ram
1107  * request queue
1108  *
1109  * It should be empty at the end anyway, but in error cases there may
1110  * be some left.  in case that there is any page left, we drop it.
1111  *
1112  */
1113 static void migration_page_queue_free(RAMState *rs)
1114 {
1115     struct RAMSrcPageRequest *mspr, *next_mspr;
1116     /* This queue generally should be empty - but in the case of a failed
1117      * migration might have some droppings in.
1118      */
1119     rcu_read_lock();
1120     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1121         memory_region_unref(mspr->rb->mr);
1122         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1123         g_free(mspr);
1124     }
1125     rcu_read_unlock();
1126 }
1127
1128 /**
1129  * ram_save_queue_pages: queue the page for transmission
1130  *
1131  * A request from postcopy destination for example.
1132  *
1133  * Returns zero on success or negative on error
1134  *
1135  * @rbname: Name of the RAMBLock of the request. NULL means the
1136  *          same that last one.
1137  * @start: starting address from the start of the RAMBlock
1138  * @len: length (in bytes) to send
1139  */
1140 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1141 {
1142     RAMBlock *ramblock;
1143     RAMState *rs = ram_state;
1144
1145     ram_counters.postcopy_requests++;
1146     rcu_read_lock();
1147     if (!rbname) {
1148         /* Reuse last RAMBlock */
1149         ramblock = rs->last_req_rb;
1150
1151         if (!ramblock) {
1152             /*
1153              * Shouldn't happen, we can't reuse the last RAMBlock if
1154              * it's the 1st request.
1155              */
1156             error_report("ram_save_queue_pages no previous block");
1157             goto err;
1158         }
1159     } else {
1160         ramblock = qemu_ram_block_by_name(rbname);
1161
1162         if (!ramblock) {
1163             /* We shouldn't be asked for a non-existent RAMBlock */
1164             error_report("ram_save_queue_pages no block '%s'", rbname);
1165             goto err;
1166         }
1167         rs->last_req_rb = ramblock;
1168     }
1169     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1170     if (start+len > ramblock->used_length) {
1171         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1172                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1173                      __func__, start, len, ramblock->used_length);
1174         goto err;
1175     }
1176
1177     struct RAMSrcPageRequest *new_entry =
1178         g_malloc0(sizeof(struct RAMSrcPageRequest));
1179     new_entry->rb = ramblock;
1180     new_entry->offset = start;
1181     new_entry->len = len;
1182
1183     memory_region_ref(ramblock->mr);
1184     qemu_mutex_lock(&rs->src_page_req_mutex);
1185     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1186     qemu_mutex_unlock(&rs->src_page_req_mutex);
1187     rcu_read_unlock();
1188
1189     return 0;
1190
1191 err:
1192     rcu_read_unlock();
1193     return -1;
1194 }
1195
1196 /**
1197  * ram_save_target_page: save one target page
1198  *
1199  * Returns the number of pages written
1200  *
1201  * @rs: current RAM state
1202  * @ms: current migration state
1203  * @pss: data about the page we want to send
1204  * @last_stage: if we are at the completion stage
1205  */
1206 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1207                                 bool last_stage)
1208 {
1209     int res = 0;
1210
1211     /* Check the pages is dirty and if it is send it */
1212     if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1213         /*
1214          * If xbzrle is on, stop using the data compression after first
1215          * round of migration even if compression is enabled. In theory,
1216          * xbzrle can do better than compression.
1217          */
1218         if (migrate_use_compression() &&
1219             (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1220             res = ram_save_compressed_page(rs, pss, last_stage);
1221         } else {
1222             res = ram_save_page(rs, pss, last_stage);
1223         }
1224
1225         if (res < 0) {
1226             return res;
1227         }
1228         if (pss->block->unsentmap) {
1229             clear_bit(pss->page, pss->block->unsentmap);
1230         }
1231     }
1232
1233     return res;
1234 }
1235
1236 /**
1237  * ram_save_host_page: save a whole host page
1238  *
1239  * Starting at *offset send pages up to the end of the current host
1240  * page. It's valid for the initial offset to point into the middle of
1241  * a host page in which case the remainder of the hostpage is sent.
1242  * Only dirty target pages are sent. Note that the host page size may
1243  * be a huge page for this block.
1244  * The saving stops at the boundary of the used_length of the block
1245  * if the RAMBlock isn't a multiple of the host page size.
1246  *
1247  * Returns the number of pages written or negative on error
1248  *
1249  * @rs: current RAM state
1250  * @ms: current migration state
1251  * @pss: data about the page we want to send
1252  * @last_stage: if we are at the completion stage
1253  */
1254 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1255                               bool last_stage)
1256 {
1257     int tmppages, pages = 0;
1258     size_t pagesize_bits =
1259         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1260
1261     do {
1262         tmppages = ram_save_target_page(rs, pss, last_stage);
1263         if (tmppages < 0) {
1264             return tmppages;
1265         }
1266
1267         pages += tmppages;
1268         pss->page++;
1269     } while ((pss->page & (pagesize_bits - 1)) &&
1270              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1271
1272     /* The offset we leave with is the last one we looked at */
1273     pss->page--;
1274     return pages;
1275 }
1276
1277 /**
1278  * ram_find_and_save_block: finds a dirty page and sends it to f
1279  *
1280  * Called within an RCU critical section.
1281  *
1282  * Returns the number of pages written where zero means no dirty pages
1283  *
1284  * @rs: current RAM state
1285  * @last_stage: if we are at the completion stage
1286  *
1287  * On systems where host-page-size > target-page-size it will send all the
1288  * pages in a host page that are dirty.
1289  */
1290
1291 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1292 {
1293     PageSearchStatus pss;
1294     int pages = 0;
1295     bool again, found;
1296
1297     /* No dirty page as there is zero RAM */
1298     if (!ram_bytes_total()) {
1299         return pages;
1300     }
1301
1302     pss.block = rs->last_seen_block;
1303     pss.page = rs->last_page;
1304     pss.complete_round = false;
1305
1306     if (!pss.block) {
1307         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1308     }
1309
1310     do {
1311         again = true;
1312         found = get_queued_page(rs, &pss);
1313
1314         if (!found) {
1315             /* priority queue empty, so just search for something dirty */
1316             found = find_dirty_block(rs, &pss, &again);
1317         }
1318
1319         if (found) {
1320             pages = ram_save_host_page(rs, &pss, last_stage);
1321         }
1322     } while (!pages && again);
1323
1324     rs->last_seen_block = pss.block;
1325     rs->last_page = pss.page;
1326
1327     return pages;
1328 }
1329
1330 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1331 {
1332     uint64_t pages = size / TARGET_PAGE_SIZE;
1333
1334     if (zero) {
1335         ram_counters.duplicate += pages;
1336     } else {
1337         ram_counters.normal += pages;
1338         ram_counters.transferred += size;
1339         qemu_update_position(f, size);
1340     }
1341 }
1342
1343 uint64_t ram_bytes_total(void)
1344 {
1345     RAMBlock *block;
1346     uint64_t total = 0;
1347
1348     rcu_read_lock();
1349     RAMBLOCK_FOREACH(block) {
1350         total += block->used_length;
1351     }
1352     rcu_read_unlock();
1353     return total;
1354 }
1355
1356 void free_xbzrle_decoded_buf(void)
1357 {
1358     g_free(xbzrle_decoded_buf);
1359     xbzrle_decoded_buf = NULL;
1360 }
1361
1362 static void ram_migration_cleanup(void *opaque)
1363 {
1364     RAMState **rsp = opaque;
1365     RAMBlock *block;
1366
1367     /* caller have hold iothread lock or is in a bh, so there is
1368      * no writing race against this migration_bitmap
1369      */
1370     memory_global_dirty_log_stop();
1371
1372     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1373         g_free(block->bmap);
1374         block->bmap = NULL;
1375         g_free(block->unsentmap);
1376         block->unsentmap = NULL;
1377     }
1378
1379     XBZRLE_cache_lock();
1380     if (XBZRLE.cache) {
1381         cache_fini(XBZRLE.cache);
1382         g_free(XBZRLE.encoded_buf);
1383         g_free(XBZRLE.current_buf);
1384         g_free(XBZRLE.zero_target_page);
1385         XBZRLE.cache = NULL;
1386         XBZRLE.encoded_buf = NULL;
1387         XBZRLE.current_buf = NULL;
1388         XBZRLE.zero_target_page = NULL;
1389     }
1390     XBZRLE_cache_unlock();
1391     migration_page_queue_free(*rsp);
1392     g_free(*rsp);
1393     *rsp = NULL;
1394 }
1395
1396 static void ram_state_reset(RAMState *rs)
1397 {
1398     rs->last_seen_block = NULL;
1399     rs->last_sent_block = NULL;
1400     rs->last_page = 0;
1401     rs->last_version = ram_list.version;
1402     rs->ram_bulk_stage = true;
1403 }
1404
1405 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1406
/*
 * Dump a bitmap to stderr, 128 bits per line, '1' for a set bit and '.'
 * for a clear one.  Each printed line is prefixed with the index of its
 * first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: bitmap to dump; a NULL pointer is ignored (nothing is printed)
 * @expected: the bit value that makes a line uninteresting
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { LINE_BITS = 128 };
    char linebuf[LINE_BITS + 1];
    unsigned long linelen;
    unsigned long cur;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long remaining = pages - cur;
        unsigned long curb;
        bool found = false;

        /* The last line may be shorter than a full line */
        linelen = remaining < LINE_BITS ? remaining : LINE_BITS;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1440
1441 /* **** functions for postcopy ***** */
1442
1443 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1444 {
1445     struct RAMBlock *block;
1446
1447     RAMBLOCK_FOREACH(block) {
1448         unsigned long *bitmap = block->bmap;
1449         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1450         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1451
1452         while (run_start < range) {
1453             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1454             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1455                               (run_end - run_start) << TARGET_PAGE_BITS);
1456             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1457         }
1458     }
1459 }
1460
1461 /**
1462  * postcopy_send_discard_bm_ram: discard a RAMBlock
1463  *
1464  * Returns zero on success
1465  *
1466  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1467  * Note: At this point the 'unsentmap' is the processed bitmap combined
1468  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1469  *
1470  * @ms: current migration state
1471  * @pds: state for postcopy
1472  * @start: RAMBlock starting page
1473  * @length: RAMBlock size
1474  */
1475 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1476                                         PostcopyDiscardState *pds,
1477                                         RAMBlock *block)
1478 {
1479     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1480     unsigned long current;
1481     unsigned long *unsentmap = block->unsentmap;
1482
1483     for (current = 0; current < end; ) {
1484         unsigned long one = find_next_bit(unsentmap, end, current);
1485
1486         if (one <= end) {
1487             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1488             unsigned long discard_length;
1489
1490             if (zero >= end) {
1491                 discard_length = end - one;
1492             } else {
1493                 discard_length = zero - one;
1494             }
1495             if (discard_length) {
1496                 postcopy_discard_send_range(ms, pds, one, discard_length);
1497             }
1498             current = one + discard_length;
1499         } else {
1500             current = one;
1501         }
1502     }
1503
1504     return 0;
1505 }
1506
1507 /**
1508  * postcopy_each_ram_send_discard: discard all RAMBlocks
1509  *
1510  * Returns 0 for success or negative for error
1511  *
1512  * Utility for the outgoing postcopy code.
1513  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1514  *   passing it bitmap indexes and name.
1515  * (qemu_ram_foreach_block ends up passing unscaled lengths
1516  *  which would mean postcopy code would have to deal with target page)
1517  *
1518  * @ms: current migration state
1519  */
1520 static int postcopy_each_ram_send_discard(MigrationState *ms)
1521 {
1522     struct RAMBlock *block;
1523     int ret;
1524
1525     RAMBLOCK_FOREACH(block) {
1526         PostcopyDiscardState *pds =
1527             postcopy_discard_send_init(ms, block->idstr);
1528
1529         /*
1530          * Postcopy sends chunks of bitmap over the wire, but it
1531          * just needs indexes at this point, avoids it having
1532          * target page specific code.
1533          */
1534         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1535         postcopy_discard_send_finish(ms, pds);
1536         if (ret) {
1537             return ret;
1538         }
1539     }
1540
1541     return 0;
1542 }
1543
1544 /**
1545  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1546  *
1547  * Helper for postcopy_chunk_hostpages; it's called twice to
1548  * canonicalize the two bitmaps, that are similar, but one is
1549  * inverted.
1550  *
1551  * Postcopy requires that all target pages in a hostpage are dirty or
1552  * clean, not a mix.  This function canonicalizes the bitmaps.
1553  *
1554  * @ms: current migration state
1555  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1556  *               otherwise we need to canonicalize partially dirty host pages
1557  * @block: block that contains the page we want to canonicalize
1558  * @pds: state for postcopy
1559  */
1560 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1561                                           RAMBlock *block,
1562                                           PostcopyDiscardState *pds)
1563 {
1564     RAMState *rs = ram_state;
1565     unsigned long *bitmap = block->bmap;
1566     unsigned long *unsentmap = block->unsentmap;
1567     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1568     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1569     unsigned long run_start;
1570
1571     if (block->page_size == TARGET_PAGE_SIZE) {
1572         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1573         return;
1574     }
1575
1576     if (unsent_pass) {
1577         /* Find a sent page */
1578         run_start = find_next_zero_bit(unsentmap, pages, 0);
1579     } else {
1580         /* Find a dirty page */
1581         run_start = find_next_bit(bitmap, pages, 0);
1582     }
1583
1584     while (run_start < pages) {
1585         bool do_fixup = false;
1586         unsigned long fixup_start_addr;
1587         unsigned long host_offset;
1588
1589         /*
1590          * If the start of this run of pages is in the middle of a host
1591          * page, then we need to fixup this host page.
1592          */
1593         host_offset = run_start % host_ratio;
1594         if (host_offset) {
1595             do_fixup = true;
1596             run_start -= host_offset;
1597             fixup_start_addr = run_start;
1598             /* For the next pass */
1599             run_start = run_start + host_ratio;
1600         } else {
1601             /* Find the end of this run */
1602             unsigned long run_end;
1603             if (unsent_pass) {
1604                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1605             } else {
1606                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1607             }
1608             /*
1609              * If the end isn't at the start of a host page, then the
1610              * run doesn't finish at the end of a host page
1611              * and we need to discard.
1612              */
1613             host_offset = run_end % host_ratio;
1614             if (host_offset) {
1615                 do_fixup = true;
1616                 fixup_start_addr = run_end - host_offset;
1617                 /*
1618                  * This host page has gone, the next loop iteration starts
1619                  * from after the fixup
1620                  */
1621                 run_start = fixup_start_addr + host_ratio;
1622             } else {
1623                 /*
1624                  * No discards on this iteration, next loop starts from
1625                  * next sent/dirty page
1626                  */
1627                 run_start = run_end + 1;
1628             }
1629         }
1630
1631         if (do_fixup) {
1632             unsigned long page;
1633
1634             /* Tell the destination to discard this page */
1635             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1636                 /* For the unsent_pass we:
1637                  *     discard partially sent pages
1638                  * For the !unsent_pass (dirty) we:
1639                  *     discard partially dirty pages that were sent
1640                  *     (any partially sent pages were already discarded
1641                  *     by the previous unsent_pass)
1642                  */
1643                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1644                                             host_ratio);
1645             }
1646
1647             /* Clean up the bitmap */
1648             for (page = fixup_start_addr;
1649                  page < fixup_start_addr + host_ratio; page++) {
1650                 /* All pages in this host page are now not sent */
1651                 set_bit(page, unsentmap);
1652
1653                 /*
1654                  * Remark them as dirty, updating the count for any pages
1655                  * that weren't previously dirty.
1656                  */
1657                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1658             }
1659         }
1660
1661         if (unsent_pass) {
1662             /* Find the next sent page for the next iteration */
1663             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1664         } else {
1665             /* Find the next dirty page for the next iteration */
1666             run_start = find_next_bit(bitmap, pages, run_start);
1667         }
1668     }
1669 }
1670
1671 /**
1672  * postcopy_chuck_hostpages: discrad any partially sent host page
1673  *
1674  * Utility for the outgoing postcopy code.
1675  *
1676  * Discard any partially sent host-page size chunks, mark any partially
1677  * dirty host-page size chunks as all dirty.  In this case the host-page
1678  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1679  *
1680  * Returns zero on success
1681  *
1682  * @ms: current migration state
1683  * @block: block we want to work with
1684  */
1685 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1686 {
1687     PostcopyDiscardState *pds =
1688         postcopy_discard_send_init(ms, block->idstr);
1689
1690     /* First pass: Discard all partially sent host pages */
1691     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1692     /*
1693      * Second pass: Ensure that all partially dirty host pages are made
1694      * fully dirty.
1695      */
1696     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1697
1698     postcopy_discard_send_finish(ms, pds);
1699     return 0;
1700 }
1701
1702 /**
1703  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1704  *
1705  * Returns zero on success
1706  *
1707  * Transmit the set of pages to be discarded after precopy to the target
1708  * these are pages that:
1709  *     a) Have been previously transmitted but are now dirty again
1710  *     b) Pages that have never been transmitted, this ensures that
1711  *        any pages on the destination that have been mapped by background
1712  *        tasks get discarded (transparent huge pages is the specific concern)
1713  * Hopefully this is pretty sparse
1714  *
1715  * @ms: current migration state
1716  */
1717 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1718 {
1719     RAMState *rs = ram_state;
1720     RAMBlock *block;
1721     int ret;
1722
1723     rcu_read_lock();
1724
1725     /* This should be our last sync, the src is now paused */
1726     migration_bitmap_sync(rs);
1727
1728     /* Easiest way to make sure we don't resume in the middle of a host-page */
1729     rs->last_seen_block = NULL;
1730     rs->last_sent_block = NULL;
1731     rs->last_page = 0;
1732
1733     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1734         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1735         unsigned long *bitmap = block->bmap;
1736         unsigned long *unsentmap = block->unsentmap;
1737
1738         if (!unsentmap) {
1739             /* We don't have a safe way to resize the sentmap, so
1740              * if the bitmap was resized it will be NULL at this
1741              * point.
1742              */
1743             error_report("migration ram resized during precopy phase");
1744             rcu_read_unlock();
1745             return -EINVAL;
1746         }
1747         /* Deal with TPS != HPS and huge pages */
1748         ret = postcopy_chunk_hostpages(ms, block);
1749         if (ret) {
1750             rcu_read_unlock();
1751             return ret;
1752         }
1753
1754         /*
1755          * Update the unsentmap to be unsentmap = unsentmap | dirty
1756          */
1757         bitmap_or(unsentmap, unsentmap, bitmap, pages);
1758 #ifdef DEBUG_POSTCOPY
1759         ram_debug_dump_bitmap(unsentmap, true, pages);
1760 #endif
1761     }
1762     trace_ram_postcopy_send_discard_bitmap();
1763
1764     ret = postcopy_each_ram_send_discard(ms);
1765     rcu_read_unlock();
1766
1767     return ret;
1768 }
1769
1770 /**
1771  * ram_discard_range: discard dirtied pages at the beginning of postcopy
1772  *
1773  * Returns zero on success
1774  *
1775  * @rbname: name of the RAMBlock of the request. NULL means the
1776  *          same that last one.
1777  * @start: RAMBlock starting page
1778  * @length: RAMBlock size
1779  */
1780 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1781 {
1782     int ret = -1;
1783
1784     trace_ram_discard_range(rbname, start, length);
1785
1786     rcu_read_lock();
1787     RAMBlock *rb = qemu_ram_block_by_name(rbname);
1788
1789     if (!rb) {
1790         error_report("ram_discard_range: Failed to find block '%s'", rbname);
1791         goto err;
1792     }
1793
1794     ret = ram_block_discard_range(rb, start, length);
1795
1796 err:
1797     rcu_read_unlock();
1798
1799     return ret;
1800 }
1801
1802 static int ram_state_init(RAMState **rsp)
1803 {
1804     *rsp = g_new0(RAMState, 1);
1805
1806     qemu_mutex_init(&(*rsp)->bitmap_mutex);
1807     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1808     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1809
1810     if (migrate_use_xbzrle()) {
1811         XBZRLE_cache_lock();
1812         XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1813         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1814                                   TARGET_PAGE_SIZE,
1815                                   TARGET_PAGE_SIZE);
1816         if (!XBZRLE.cache) {
1817             XBZRLE_cache_unlock();
1818             error_report("Error creating cache");
1819             g_free(*rsp);
1820             *rsp = NULL;
1821             return -1;
1822         }
1823         XBZRLE_cache_unlock();
1824
1825         /* We prefer not to abort if there is no memory */
1826         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1827         if (!XBZRLE.encoded_buf) {
1828             error_report("Error allocating encoded_buf");
1829             g_free(*rsp);
1830             *rsp = NULL;
1831             return -1;
1832         }
1833
1834         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1835         if (!XBZRLE.current_buf) {
1836             error_report("Error allocating current_buf");
1837             g_free(XBZRLE.encoded_buf);
1838             XBZRLE.encoded_buf = NULL;
1839             g_free(*rsp);
1840             *rsp = NULL;
1841             return -1;
1842         }
1843     }
1844
1845     /* For memory_global_dirty_log_start below.  */
1846     qemu_mutex_lock_iothread();
1847
1848     qemu_mutex_lock_ramlist();
1849     rcu_read_lock();
1850     ram_state_reset(*rsp);
1851
1852     /* Skip setting bitmap if there is no RAM */
1853     if (ram_bytes_total()) {
1854         RAMBlock *block;
1855
1856         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1857             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1858
1859             block->bmap = bitmap_new(pages);
1860             bitmap_set(block->bmap, 0, pages);
1861             if (migrate_postcopy_ram()) {
1862                 block->unsentmap = bitmap_new(pages);
1863                 bitmap_set(block->unsentmap, 0, pages);
1864             }
1865         }
1866     }
1867
1868     /*
1869      * Count the total number of pages used by ram blocks not including any
1870      * gaps due to alignment or unplugs.
1871      */
1872     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1873
1874     memory_global_dirty_log_start();
1875     migration_bitmap_sync(*rsp);
1876     qemu_mutex_unlock_ramlist();
1877     qemu_mutex_unlock_iothread();
1878     rcu_read_unlock();
1879
1880     return 0;
1881 }
1882
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
1889
1890 /**
1891  * ram_save_setup: Setup RAM for migration
1892  *
1893  * Returns zero to indicate success and negative for error
1894  *
1895  * @f: QEMUFile where to send the data
1896  * @opaque: RAMState pointer
1897  */
1898 static int ram_save_setup(QEMUFile *f, void *opaque)
1899 {
1900     RAMState **rsp = opaque;
1901     RAMBlock *block;
1902
1903     /* migration has already setup the bitmap, reuse it. */
1904     if (!migration_in_colo_state()) {
1905         if (ram_state_init(rsp) != 0) {
1906             return -1;
1907         }
1908     }
1909     (*rsp)->f = f;
1910
1911     rcu_read_lock();
1912
1913     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1914
1915     RAMBLOCK_FOREACH(block) {
1916         qemu_put_byte(f, strlen(block->idstr));
1917         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1918         qemu_put_be64(f, block->used_length);
1919         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1920             qemu_put_be64(f, block->page_size);
1921         }
1922     }
1923
1924     rcu_read_unlock();
1925
1926     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1927     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1928
1929     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1930
1931     return 0;
1932 }
1933
1934 /**
1935  * ram_save_iterate: iterative stage for migration
1936  *
1937  * Returns zero to indicate success and negative for error
1938  *
1939  * @f: QEMUFile where to send the data
1940  * @opaque: RAMState pointer
1941  */
1942 static int ram_save_iterate(QEMUFile *f, void *opaque)
1943 {
1944     RAMState **temp = opaque;
1945     RAMState *rs = *temp;
1946     int ret;
1947     int i;
1948     int64_t t0;
1949     int done = 0;
1950
1951     rcu_read_lock();
1952     if (ram_list.version != rs->last_version) {
1953         ram_state_reset(rs);
1954     }
1955
1956     /* Read version before ram_list.blocks */
1957     smp_rmb();
1958
1959     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1960
1961     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1962     i = 0;
1963     while ((ret = qemu_file_rate_limit(f)) == 0) {
1964         int pages;
1965
1966         pages = ram_find_and_save_block(rs, false);
1967         /* no more pages to sent */
1968         if (pages == 0) {
1969             done = 1;
1970             break;
1971         }
1972         rs->iterations++;
1973
1974         /* we want to check in the 1st loop, just in case it was the 1st time
1975            and we had to sync the dirty bitmap.
1976            qemu_get_clock_ns() is a bit expensive, so we only check each some
1977            iterations
1978         */
1979         if ((i & 63) == 0) {
1980             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1981             if (t1 > MAX_WAIT) {
1982                 trace_ram_save_iterate_big_wait(t1, i);
1983                 break;
1984             }
1985         }
1986         i++;
1987     }
1988     flush_compressed_data(rs);
1989     rcu_read_unlock();
1990
1991     /*
1992      * Must occur before EOS (or any QEMUFile operation)
1993      * because of RDMA protocol.
1994      */
1995     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1996
1997     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1998     ram_counters.transferred += 8;
1999
2000     ret = qemu_file_get_error(f);
2001     if (ret < 0) {
2002         return ret;
2003     }
2004
2005     return done;
2006 }
2007
2008 /**
2009  * ram_save_complete: function called to send the remaining amount of ram
2010  *
2011  * Returns zero to indicate success
2012  *
2013  * Called with iothread lock
2014  *
2015  * @f: QEMUFile where to send the data
2016  * @opaque: RAMState pointer
2017  */
2018 static int ram_save_complete(QEMUFile *f, void *opaque)
2019 {
2020     RAMState **temp = opaque;
2021     RAMState *rs = *temp;
2022
2023     rcu_read_lock();
2024
2025     if (!migration_in_postcopy()) {
2026         migration_bitmap_sync(rs);
2027     }
2028
2029     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2030
2031     /* try transferring iterative blocks of memory */
2032
2033     /* flush all remaining blocks regardless of rate limiting */
2034     while (true) {
2035         int pages;
2036
2037         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2038         /* no more blocks to sent */
2039         if (pages == 0) {
2040             break;
2041         }
2042     }
2043
2044     flush_compressed_data(rs);
2045     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2046
2047     rcu_read_unlock();
2048
2049     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2050
2051     return 0;
2052 }
2053
2054 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2055                              uint64_t *non_postcopiable_pending,
2056                              uint64_t *postcopiable_pending)
2057 {
2058     RAMState **temp = opaque;
2059     RAMState *rs = *temp;
2060     uint64_t remaining_size;
2061
2062     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2063
2064     if (!migration_in_postcopy() &&
2065         remaining_size < max_size) {
2066         qemu_mutex_lock_iothread();
2067         rcu_read_lock();
2068         migration_bitmap_sync(rs);
2069         rcu_read_unlock();
2070         qemu_mutex_unlock_iothread();
2071         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2072     }
2073
2074     /* We can do postcopy, and all the data is postcopiable */
2075     *postcopiable_pending += remaining_size;
2076 }
2077
2078 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2079 {
2080     unsigned int xh_len;
2081     int xh_flags;
2082     uint8_t *loaded_data;
2083
2084     if (!xbzrle_decoded_buf) {
2085         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2086     }
2087     loaded_data = xbzrle_decoded_buf;
2088
2089     /* extract RLE header */
2090     xh_flags = qemu_get_byte(f);
2091     xh_len = qemu_get_be16(f);
2092
2093     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2094         error_report("Failed to load XBZRLE page - wrong compression!");
2095         return -1;
2096     }
2097
2098     if (xh_len > TARGET_PAGE_SIZE) {
2099         error_report("Failed to load XBZRLE page - len overflow!");
2100         return -1;
2101     }
2102     /* load data and decode */
2103     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2104
2105     /* decode RLE */
2106     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2107                              TARGET_PAGE_SIZE) == -1) {
2108         error_report("Failed to load XBZRLE page - decode error!");
2109         return -1;
2110     }
2111
2112     return 0;
2113 }
2114
2115 /**
2116  * ram_block_from_stream: read a RAMBlock id from the migration stream
2117  *
2118  * Must be called from within a rcu critical section.
2119  *
2120  * Returns a pointer from within the RCU-protected ram_list.
2121  *
2122  * @f: QEMUFile where to read the data from
2123  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2124  */
2125 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2126 {
2127     static RAMBlock *block = NULL;
2128     char id[256];
2129     uint8_t len;
2130
2131     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2132         if (!block) {
2133             error_report("Ack, bad migration stream!");
2134             return NULL;
2135         }
2136         return block;
2137     }
2138
2139     len = qemu_get_byte(f);
2140     qemu_get_buffer(f, (uint8_t *)id, len);
2141     id[len] = 0;
2142
2143     block = qemu_ram_block_by_name(id);
2144     if (!block) {
2145         error_report("Can't find block %s", id);
2146         return NULL;
2147     }
2148
2149     return block;
2150 }
2151
2152 static inline void *host_from_ram_block_offset(RAMBlock *block,
2153                                                ram_addr_t offset)
2154 {
2155     if (!offset_in_ramblock(block, offset)) {
2156         return NULL;
2157     }
2158
2159     return block->host + offset;
2160 }
2161
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  Memory that is already all zero is
 * left untouched when the fill byte is zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the write when it would not change anything */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2178
2179 static void *do_data_decompress(void *opaque)
2180 {
2181     DecompressParam *param = opaque;
2182     unsigned long pagesize;
2183     uint8_t *des;
2184     int len;
2185
2186     qemu_mutex_lock(&param->mutex);
2187     while (!param->quit) {
2188         if (param->des) {
2189             des = param->des;
2190             len = param->len;
2191             param->des = 0;
2192             qemu_mutex_unlock(&param->mutex);
2193
2194             pagesize = TARGET_PAGE_SIZE;
2195             /* uncompress() will return failed in some case, especially
2196              * when the page is dirted when doing the compression, it's
2197              * not a problem because the dirty page will be retransferred
2198              * and uncompress() won't break the data in other pages.
2199              */
2200             uncompress((Bytef *)des, &pagesize,
2201                        (const Bytef *)param->compbuf, len);
2202
2203             qemu_mutex_lock(&decomp_done_lock);
2204             param->done = true;
2205             qemu_cond_signal(&decomp_done_cond);
2206             qemu_mutex_unlock(&decomp_done_lock);
2207
2208             qemu_mutex_lock(&param->mutex);
2209         } else {
2210             qemu_cond_wait(&param->cond, &param->mutex);
2211         }
2212     }
2213     qemu_mutex_unlock(&param->mutex);
2214
2215     return NULL;
2216 }
2217
2218 static void wait_for_decompress_done(void)
2219 {
2220     int idx, thread_count;
2221
2222     if (!migrate_use_compression()) {
2223         return;
2224     }
2225
2226     thread_count = migrate_decompress_threads();
2227     qemu_mutex_lock(&decomp_done_lock);
2228     for (idx = 0; idx < thread_count; idx++) {
2229         while (!decomp_param[idx].done) {
2230             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2231         }
2232     }
2233     qemu_mutex_unlock(&decomp_done_lock);
2234 }
2235
2236 void migrate_decompress_threads_create(void)
2237 {
2238     int i, thread_count;
2239
2240     thread_count = migrate_decompress_threads();
2241     decompress_threads = g_new0(QemuThread, thread_count);
2242     decomp_param = g_new0(DecompressParam, thread_count);
2243     qemu_mutex_init(&decomp_done_lock);
2244     qemu_cond_init(&decomp_done_cond);
2245     for (i = 0; i < thread_count; i++) {
2246         qemu_mutex_init(&decomp_param[i].mutex);
2247         qemu_cond_init(&decomp_param[i].cond);
2248         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2249         decomp_param[i].done = true;
2250         decomp_param[i].quit = false;
2251         qemu_thread_create(decompress_threads + i, "decompress",
2252                            do_data_decompress, decomp_param + i,
2253                            QEMU_THREAD_JOINABLE);
2254     }
2255 }
2256
2257 void migrate_decompress_threads_join(void)
2258 {
2259     int i, thread_count;
2260
2261     thread_count = migrate_decompress_threads();
2262     for (i = 0; i < thread_count; i++) {
2263         qemu_mutex_lock(&decomp_param[i].mutex);
2264         decomp_param[i].quit = true;
2265         qemu_cond_signal(&decomp_param[i].cond);
2266         qemu_mutex_unlock(&decomp_param[i].mutex);
2267     }
2268     for (i = 0; i < thread_count; i++) {
2269         qemu_thread_join(decompress_threads + i);
2270         qemu_mutex_destroy(&decomp_param[i].mutex);
2271         qemu_cond_destroy(&decomp_param[i].cond);
2272         g_free(decomp_param[i].compbuf);
2273     }
2274     g_free(decompress_threads);
2275     g_free(decomp_param);
2276     decompress_threads = NULL;
2277     decomp_param = NULL;
2278 }
2279
2280 static void decompress_data_with_multi_threads(QEMUFile *f,
2281                                                void *host, int len)
2282 {
2283     int idx, thread_count;
2284
2285     thread_count = migrate_decompress_threads();
2286     qemu_mutex_lock(&decomp_done_lock);
2287     while (true) {
2288         for (idx = 0; idx < thread_count; idx++) {
2289             if (decomp_param[idx].done) {
2290                 decomp_param[idx].done = false;
2291                 qemu_mutex_lock(&decomp_param[idx].mutex);
2292                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2293                 decomp_param[idx].des = host;
2294                 decomp_param[idx].len = len;
2295                 qemu_cond_signal(&decomp_param[idx].cond);
2296                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2297                 break;
2298             }
2299         }
2300         if (idx < thread_count) {
2301             break;
2302         } else {
2303             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2304         }
2305     }
2306     qemu_mutex_unlock(&decomp_done_lock);
2307 }
2308
2309 /**
2310  * ram_postcopy_incoming_init: allocate postcopy data structures
2311  *
2312  * Returns 0 for success and negative if there was one error
2313  *
2314  * @mis: current migration incoming state
2315  *
2316  * Allocate data structures etc needed by incoming migration with
2317  * postcopy-ram. postcopy-ram's similarly names
2318  * postcopy_ram_incoming_init does the work.
2319  */
2320 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2321 {
2322     unsigned long ram_pages = last_ram_page();
2323
2324     return postcopy_ram_incoming_init(mis, ram_pages);
2325 }
2326
2327 /**
2328  * ram_load_postcopy: load a page in postcopy case
2329  *
2330  * Returns 0 for success or -errno in case of error
2331  *
2332  * Called in postcopy mode by ram_load().
2333  * rcu_read_lock is taken prior to this being called.
2334  *
2335  * @f: QEMUFile where to send the data
2336  */
2337 static int ram_load_postcopy(QEMUFile *f)
2338 {
2339     int flags = 0, ret = 0;
2340     bool place_needed = false;
2341     bool matching_page_sizes = false;
2342     MigrationIncomingState *mis = migration_incoming_get_current();
2343     /* Temporary page that is later 'placed' */
2344     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2345     void *last_host = NULL;
2346     bool all_zero = false;
2347
2348     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2349         ram_addr_t addr;
2350         void *host = NULL;
2351         void *page_buffer = NULL;
2352         void *place_source = NULL;
2353         RAMBlock *block = NULL;
2354         uint8_t ch;
2355
2356         addr = qemu_get_be64(f);
2357         flags = addr & ~TARGET_PAGE_MASK;
2358         addr &= TARGET_PAGE_MASK;
2359
2360         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2361         place_needed = false;
2362         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2363             block = ram_block_from_stream(f, flags);
2364
2365             host = host_from_ram_block_offset(block, addr);
2366             if (!host) {
2367                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2368                 ret = -EINVAL;
2369                 break;
2370             }
2371             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2372             /*
2373              * Postcopy requires that we place whole host pages atomically;
2374              * these may be huge pages for RAMBlocks that are backed by
2375              * hugetlbfs.
2376              * To make it atomic, the data is read into a temporary page
2377              * that's moved into place later.
2378              * The migration protocol uses,  possibly smaller, target-pages
2379              * however the source ensures it always sends all the components
2380              * of a host page in order.
2381              */
2382             page_buffer = postcopy_host_page +
2383                           ((uintptr_t)host & (block->page_size - 1));
2384             /* If all TP are zero then we can optimise the place */
2385             if (!((uintptr_t)host & (block->page_size - 1))) {
2386                 all_zero = true;
2387             } else {
2388                 /* not the 1st TP within the HP */
2389                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2390                     error_report("Non-sequential target page %p/%p",
2391                                   host, last_host);
2392                     ret = -EINVAL;
2393                     break;
2394                 }
2395             }
2396
2397
2398             /*
2399              * If it's the last part of a host page then we place the host
2400              * page
2401              */
2402             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2403                                      (block->page_size - 1)) == 0;
2404             place_source = postcopy_host_page;
2405         }
2406         last_host = host;
2407
2408         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2409         case RAM_SAVE_FLAG_ZERO:
2410             ch = qemu_get_byte(f);
2411             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2412             if (ch) {
2413                 all_zero = false;
2414             }
2415             break;
2416
2417         case RAM_SAVE_FLAG_PAGE:
2418             all_zero = false;
2419             if (!place_needed || !matching_page_sizes) {
2420                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2421             } else {
2422                 /* Avoids the qemu_file copy during postcopy, which is
2423                  * going to do a copy later; can only do it when we
2424                  * do this read in one go (matching page sizes)
2425                  */
2426                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2427                                          TARGET_PAGE_SIZE);
2428             }
2429             break;
2430         case RAM_SAVE_FLAG_EOS:
2431             /* normal exit */
2432             break;
2433         default:
2434             error_report("Unknown combination of migration flags: %#x"
2435                          " (postcopy mode)", flags);
2436             ret = -EINVAL;
2437         }
2438
2439         if (place_needed) {
2440             /* This gets called at the last target page in the host page */
2441             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2442
2443             if (all_zero) {
2444                 ret = postcopy_place_page_zero(mis, place_dest,
2445                                                block->page_size);
2446             } else {
2447                 ret = postcopy_place_page(mis, place_dest,
2448                                           place_source, block->page_size);
2449             }
2450         }
2451         if (!ret) {
2452             ret = qemu_file_get_error(f);
2453         }
2454     }
2455
2456     return ret;
2457 }
2458
2459 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2460 {
2461     int flags = 0, ret = 0;
2462     static uint64_t seq_iter;
2463     int len = 0;
2464     /*
2465      * If system is running in postcopy mode, page inserts to host memory must
2466      * be atomic
2467      */
2468     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2469     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2470     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2471
2472     seq_iter++;
2473
2474     if (version_id != 4) {
2475         ret = -EINVAL;
2476     }
2477
2478     /* This RCU critical section can be very long running.
2479      * When RCU reclaims in the code start to become numerous,
2480      * it will be necessary to reduce the granularity of this
2481      * critical section.
2482      */
2483     rcu_read_lock();
2484
2485     if (postcopy_running) {
2486         ret = ram_load_postcopy(f);
2487     }
2488
2489     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2490         ram_addr_t addr, total_ram_bytes;
2491         void *host = NULL;
2492         uint8_t ch;
2493
2494         addr = qemu_get_be64(f);
2495         flags = addr & ~TARGET_PAGE_MASK;
2496         addr &= TARGET_PAGE_MASK;
2497
2498         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2499                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2500             RAMBlock *block = ram_block_from_stream(f, flags);
2501
2502             host = host_from_ram_block_offset(block, addr);
2503             if (!host) {
2504                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2505                 ret = -EINVAL;
2506                 break;
2507             }
2508             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2509         }
2510
2511         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2512         case RAM_SAVE_FLAG_MEM_SIZE:
2513             /* Synchronize RAM block list */
2514             total_ram_bytes = addr;
2515             while (!ret && total_ram_bytes) {
2516                 RAMBlock *block;
2517                 char id[256];
2518                 ram_addr_t length;
2519
2520                 len = qemu_get_byte(f);
2521                 qemu_get_buffer(f, (uint8_t *)id, len);
2522                 id[len] = 0;
2523                 length = qemu_get_be64(f);
2524
2525                 block = qemu_ram_block_by_name(id);
2526                 if (block) {
2527                     if (length != block->used_length) {
2528                         Error *local_err = NULL;
2529
2530                         ret = qemu_ram_resize(block, length,
2531                                               &local_err);
2532                         if (local_err) {
2533                             error_report_err(local_err);
2534                         }
2535                     }
2536                     /* For postcopy we need to check hugepage sizes match */
2537                     if (postcopy_advised &&
2538                         block->page_size != qemu_host_page_size) {
2539                         uint64_t remote_page_size = qemu_get_be64(f);
2540                         if (remote_page_size != block->page_size) {
2541                             error_report("Mismatched RAM page size %s "
2542                                          "(local) %zd != %" PRId64,
2543                                          id, block->page_size,
2544                                          remote_page_size);
2545                             ret = -EINVAL;
2546                         }
2547                     }
2548                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2549                                           block->idstr);
2550                 } else {
2551                     error_report("Unknown ramblock \"%s\", cannot "
2552                                  "accept migration", id);
2553                     ret = -EINVAL;
2554                 }
2555
2556                 total_ram_bytes -= length;
2557             }
2558             break;
2559
2560         case RAM_SAVE_FLAG_ZERO:
2561             ch = qemu_get_byte(f);
2562             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2563             break;
2564
2565         case RAM_SAVE_FLAG_PAGE:
2566             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2567             break;
2568
2569         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2570             len = qemu_get_be32(f);
2571             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2572                 error_report("Invalid compressed data length: %d", len);
2573                 ret = -EINVAL;
2574                 break;
2575             }
2576             decompress_data_with_multi_threads(f, host, len);
2577             break;
2578
2579         case RAM_SAVE_FLAG_XBZRLE:
2580             if (load_xbzrle(f, addr, host) < 0) {
2581                 error_report("Failed to decompress XBZRLE page at "
2582                              RAM_ADDR_FMT, addr);
2583                 ret = -EINVAL;
2584                 break;
2585             }
2586             break;
2587         case RAM_SAVE_FLAG_EOS:
2588             /* normal exit */
2589             break;
2590         default:
2591             if (flags & RAM_SAVE_FLAG_HOOK) {
2592                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2593             } else {
2594                 error_report("Unknown combination of migration flags: %#x",
2595                              flags);
2596                 ret = -EINVAL;
2597             }
2598         }
2599         if (!ret) {
2600             ret = qemu_file_get_error(f);
2601         }
2602     }
2603
2604     wait_for_decompress_done();
2605     rcu_read_unlock();
2606     trace_ram_load_complete(ret, seq_iter);
2607     return ret;
2608 }
2609
2610 static SaveVMHandlers savevm_ram_handlers = {
2611     .save_live_setup = ram_save_setup,
2612     .save_live_iterate = ram_save_iterate,
2613     .save_live_complete_postcopy = ram_save_complete,
2614     .save_live_complete_precopy = ram_save_complete,
2615     .save_live_pending = ram_save_pending,
2616     .load_state = ram_load,
2617     .cleanup = ram_migration_cleanup,
2618 };
2619
2620 void ram_mig_init(void)
2621 {
2622     qemu_mutex_init(&XBZRLE.lock);
2623     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2624 }