migration/ram.c [qemu/ar7.git]
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "trace.h"
46 #include "exec/ram_addr.h"
47 #include "qemu/rcu_queue.h"
48 #include "migration/colo.h"
50 /***********************************************************/
51 /* ram save/restore */
53 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
54 * worked for pages that were filled with the same char. We switched
55 * it to only search for the zero value. And to avoid confusion with
56 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
59 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
60 #define RAM_SAVE_FLAG_ZERO 0x02
61 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
62 #define RAM_SAVE_FLAG_PAGE 0x08
63 #define RAM_SAVE_FLAG_EOS 0x10
64 #define RAM_SAVE_FLAG_CONTINUE 0x20
65 #define RAM_SAVE_FLAG_XBZRLE 0x40
66 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
67 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
71 return buffer_is_zero(p, size);
74 XBZRLECacheStats xbzrle_counters;
76 /* struct contains XBZRLE cache and a static page
77 used by the compression */
78 static struct {
79 /* buffer used for XBZRLE encoding */
80 uint8_t *encoded_buf;
81 /* buffer for storing page content */
82 uint8_t *current_buf;
83 /* Cache for XBZRLE, Protected by lock. */
84 PageCache *cache;
85 QemuMutex lock;
86 /* it will store a page full of zeros */
87 uint8_t *zero_target_page;
88 } XBZRLE;
90 /* buffer used for XBZRLE decoding */
91 static uint8_t *xbzrle_decoded_buf;
93 static void XBZRLE_cache_lock(void)
95 if (migrate_use_xbzrle())
96 qemu_mutex_lock(&XBZRLE.lock);
99 static void XBZRLE_cache_unlock(void)
101 if (migrate_use_xbzrle())
102 qemu_mutex_unlock(&XBZRLE.lock);
106 * xbzrle_cache_resize: resize the xbzrle cache
108 * This function is called from qmp_migrate_set_cache_size in the main
109 * thread, possibly while a migration is in progress. A running
110 * migration may be using the cache and might finish during this call,
111 * hence changes to the cache are protected by XBZRLE.lock().
113 * Returns the new_size or negative in case of error.
115 * @new_size: new cache size
117 int64_t xbzrle_cache_resize(int64_t new_size)
119 PageCache *new_cache;
120 int64_t ret;
122 if (new_size < TARGET_PAGE_SIZE) {
123 return -1;
126 XBZRLE_cache_lock();
128 if (XBZRLE.cache != NULL) {
129 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
130 goto out_new_size;
132 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
133 TARGET_PAGE_SIZE);
134 if (!new_cache) {
135 error_report("Error creating cache");
136 ret = -1;
137 goto out;
140 cache_fini(XBZRLE.cache);
141 XBZRLE.cache = new_cache;
144 out_new_size:
145 ret = pow2floor(new_size);
146 out:
147 XBZRLE_cache_unlock();
148 return ret;
152 * An outstanding page request, on the source, having been received
153 * and queued
155 struct RAMSrcPageRequest {
156 RAMBlock *rb;
157 hwaddr offset;
158 hwaddr len;
160 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
163 /* State of RAM for migration */
164 struct RAMState {
165 /* QEMUFile used for this migration */
166 QEMUFile *f;
167 /* Last block that we have visited searching for dirty pages */
168 RAMBlock *last_seen_block;
169 /* Last block from where we have sent data */
170 RAMBlock *last_sent_block;
171 /* Last dirty target page we have sent */
172 ram_addr_t last_page;
173 /* last ram version we have seen */
174 uint32_t last_version;
175 /* We are in the first round */
176 bool ram_bulk_stage;
177 /* How many times we have dirty too many pages */
178 int dirty_rate_high_cnt;
179 /* these variables are used for bitmap sync */
180 /* last time we did a full bitmap_sync */
181 int64_t time_last_bitmap_sync;
182 /* bytes transferred at start_time */
183 uint64_t bytes_xfer_prev;
184 /* number of dirty pages since start_time */
185 uint64_t num_dirty_pages_period;
186 /* xbzrle misses since the beginning of the period */
187 uint64_t xbzrle_cache_miss_prev;
188 /* number of iterations at the beginning of period */
189 uint64_t iterations_prev;
190 /* Iterations since start */
191 uint64_t iterations;
192 /* number of dirty bits in the bitmap */
193 uint64_t migration_dirty_pages;
194 /* protects modification of the bitmap */
195 QemuMutex bitmap_mutex;
196 /* The RAMBlock used in the last src_page_requests */
197 RAMBlock *last_req_rb;
198 /* Queue of outstanding page requests from the destination */
199 QemuMutex src_page_req_mutex;
200 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
202 typedef struct RAMState RAMState;
204 static RAMState *ram_state;
206 uint64_t ram_bytes_remaining(void)
208 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
211 MigrationStats ram_counters;
213 /* used by the search for pages to send */
214 struct PageSearchStatus {
215 /* Current block being searched */
216 RAMBlock *block;
217 /* Current page to search from */
218 unsigned long page;
219 /* Set once we wrap around */
220 bool complete_round;
222 typedef struct PageSearchStatus PageSearchStatus;
224 struct CompressParam {
225 bool done;
226 bool quit;
227 QEMUFile *file;
228 QemuMutex mutex;
229 QemuCond cond;
230 RAMBlock *block;
231 ram_addr_t offset;
233 typedef struct CompressParam CompressParam;
235 struct DecompressParam {
236 bool done;
237 bool quit;
238 QemuMutex mutex;
239 QemuCond cond;
240 void *des;
241 uint8_t *compbuf;
242 int len;
244 typedef struct DecompressParam DecompressParam;
246 static CompressParam *comp_param;
247 static QemuThread *compress_threads;
248 /* comp_done_cond is used to wake up the migration thread when
249 * one of the compression threads has finished the compression.
250 * comp_done_lock is used to co-work with comp_done_cond.
252 static QemuMutex comp_done_lock;
253 static QemuCond comp_done_cond;
254 /* The empty QEMUFileOps will be used by file in CompressParam */
255 static const QEMUFileOps empty_ops = { };
257 static DecompressParam *decomp_param;
258 static QemuThread *decompress_threads;
259 static QemuMutex decomp_done_lock;
260 static QemuCond decomp_done_cond;
262 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
263 ram_addr_t offset);
265 static void *do_data_compress(void *opaque)
267 CompressParam *param = opaque;
268 RAMBlock *block;
269 ram_addr_t offset;
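/* Worker loop: wait until the migration thread posts a (block, offset)
 * pair, compress that page into this thread's private QEMUFile buffer,
 * then mark ourselves done and signal comp_done_cond so the buffered
 * output can be flushed into the migration stream. */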
271 qemu_mutex_lock(&param->mutex);
272 while (!param->quit) {
273 if (param->block) {
274 block = param->block;
275 offset = param->offset;
276 param->block = NULL;
277 qemu_mutex_unlock(&param->mutex);
279 do_compress_ram_page(param->file, block, offset);
281 qemu_mutex_lock(&comp_done_lock);
282 param->done = true;
283 qemu_cond_signal(&comp_done_cond);
284 qemu_mutex_unlock(&comp_done_lock);
286 qemu_mutex_lock(&param->mutex);
287 } else {
288 qemu_cond_wait(&param->cond, &param->mutex);
291 qemu_mutex_unlock(&param->mutex);
293 return NULL;
296 static inline void terminate_compression_threads(void)
298 int idx, thread_count;
300 thread_count = migrate_compress_threads();
302 for (idx = 0; idx < thread_count; idx++) {
303 qemu_mutex_lock(&comp_param[idx].mutex);
304 comp_param[idx].quit = true;
305 qemu_cond_signal(&comp_param[idx].cond);
306 qemu_mutex_unlock(&comp_param[idx].mutex);
310 void migrate_compress_threads_join(void)
312 int i, thread_count;
314 if (!migrate_use_compression()) {
315 return;
317 terminate_compression_threads();
318 thread_count = migrate_compress_threads();
319 for (i = 0; i < thread_count; i++) {
320 qemu_thread_join(compress_threads + i);
321 qemu_fclose(comp_param[i].file);
322 qemu_mutex_destroy(&comp_param[i].mutex);
323 qemu_cond_destroy(&comp_param[i].cond);
325 qemu_mutex_destroy(&comp_done_lock);
326 qemu_cond_destroy(&comp_done_cond);
327 g_free(compress_threads);
328 g_free(comp_param);
329 compress_threads = NULL;
330 comp_param = NULL;
333 void migrate_compress_threads_create(void)
335 int i, thread_count;
337 if (!migrate_use_compression()) {
338 return;
340 thread_count = migrate_compress_threads();
341 compress_threads = g_new0(QemuThread, thread_count);
342 comp_param = g_new0(CompressParam, thread_count);
343 qemu_cond_init(&comp_done_cond);
344 qemu_mutex_init(&comp_done_lock);
345 for (i = 0; i < thread_count; i++) {
346 /* comp_param[i].file is just used as a dummy buffer to save data,
347 * set its ops to empty.
349 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
350 comp_param[i].done = true;
351 comp_param[i].quit = false;
352 qemu_mutex_init(&comp_param[i].mutex);
353 qemu_cond_init(&comp_param[i].cond);
354 qemu_thread_create(compress_threads + i, "compress",
355 do_data_compress, comp_param + i,
356 QEMU_THREAD_JOINABLE);
361 * save_page_header: write page header to wire
363 * If this is the 1st block, it also writes the block identification
365 * Returns the number of bytes written
367 * @f: QEMUFile where to send the data
368 * @block: block that contains the page we want to send
369 * @offset: offset inside the block for the page
370 * in the lower bits, it contains flags
372 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
373 ram_addr_t offset)
375 size_t size, len;
377 if (block == rs->last_sent_block) {
378 offset |= RAM_SAVE_FLAG_CONTINUE;
380 qemu_put_be64(f, offset);
381 size = 8;
383 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
384 len = strlen(block->idstr);
385 qemu_put_byte(f, len);
386 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
387 size += 1 + len;
388 rs->last_sent_block = block;
390 return size;
394 * mig_throttle_guest_down: throttle down the guest
396 * Reduce amount of guest cpu execution to hopefully slow down memory
397 * writes. If guest dirty memory rate is reduced below the rate at
398 * which we can transfer pages to the destination then we should be
399 * able to complete migration. Some workloads dirty memory way too
400 * fast and will not effectively converge, even with auto-converge.
402 static void mig_throttle_guest_down(void)
404 MigrationState *s = migrate_get_current();
405 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
406 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
408 /* We have not started throttling yet. Let's start it. */
409 if (!cpu_throttle_active()) {
410 cpu_throttle_set(pct_initial);
411 } else {
412 /* Throttling already on, just increase the rate */
413 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
418 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
420 * @rs: current RAM state
421 * @current_addr: address for the zero page
423 * Update the xbzrle cache to reflect a page that's been sent as all 0.
424 * The important thing is that a stale (not-yet-0'd) page be replaced
425 * by the new data.
426 * As a bonus, if the page wasn't in the cache it gets added so that
427 * when a small write is made into the 0'd page it gets XBZRLE sent.
429 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
431 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
432 return;
435 /* We don't care if this fails to allocate a new cache page
436 * as long as it updated an old one */
437 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
438 ram_counters.dirty_sync_count);
441 #define ENCODING_FLAG_XBZRLE 0x1
444 * save_xbzrle_page: compress and send current page
446 * Returns: 1 means that we wrote the page
447 * 0 means that page is identical to the one already sent
448 * -1 means that xbzrle would be longer than normal
450 * @rs: current RAM state
451 * @current_data: pointer to the address of the page contents
452 * @current_addr: addr of the page
453 * @block: block that contains the page we want to send
454 * @offset: offset inside the block for the page
455 * @last_stage: if we are at the completion stage
457 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
458 ram_addr_t current_addr, RAMBlock *block,
459 ram_addr_t offset, bool last_stage)
461 int encoded_len = 0, bytes_xbzrle;
462 uint8_t *prev_cached_page;
464 if (!cache_is_cached(XBZRLE.cache, current_addr,
465 ram_counters.dirty_sync_count)) {
466 xbzrle_counters.cache_miss++;
467 if (!last_stage) {
468 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
469 ram_counters.dirty_sync_count) == -1) {
470 return -1;
471 } else {
472 /* update *current_data when the page has been
473 inserted into cache */
474 *current_data = get_cached_data(XBZRLE.cache, current_addr);
477 return -1;
480 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
482 /* save current buffer into memory */
483 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
485 /* XBZRLE encoding (if there is no overflow) */
486 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
487 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
488 TARGET_PAGE_SIZE);
489 if (encoded_len == 0) {
490 trace_save_xbzrle_page_skipping();
491 return 0;
492 } else if (encoded_len == -1) {
493 trace_save_xbzrle_page_overflow();
494 xbzrle_counters.overflow++;
495 /* update data in the cache */
496 if (!last_stage) {
497 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
498 *current_data = prev_cached_page;
500 return -1;
503 /* we need to update the data in the cache, in order to get the same data */
504 if (!last_stage) {
505 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
508 /* Send XBZRLE based compressed page */
509 bytes_xbzrle = save_page_header(rs, rs->f, block,
510 offset | RAM_SAVE_FLAG_XBZRLE);
511 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
512 qemu_put_be16(rs->f, encoded_len);
513 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
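/* account for the encoded payload plus the 1-byte ENCODING_FLAG_XBZRLE
 * and the 2-byte encoded_len written just above */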
514 bytes_xbzrle += encoded_len + 1 + 2;
515 xbzrle_counters.pages++;
516 xbzrle_counters.bytes += bytes_xbzrle;
517 ram_counters.transferred += bytes_xbzrle;
519 return 1;
523 * migration_bitmap_find_dirty: find the next dirty page from start
525 * Called with rcu_read_lock() to protect migration_bitmap
527 * Returns the byte offset within memory region of the start of a dirty page
529 * @rs: current RAM state
530 * @rb: RAMBlock where to search for dirty pages
531 * @start: page where we start the search
533 static inline
534 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
535 unsigned long start)
537 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
538 unsigned long *bitmap = rb->bmap;
539 unsigned long next;
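/* During the bulk (first) round every page is treated as dirty, so the
 * bitmap scan can be skipped and we simply advance to the next page. */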
541 if (rs->ram_bulk_stage && start > 0) {
542 next = start + 1;
543 } else {
544 next = find_next_bit(bitmap, size, start);
547 return next;
550 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
551 RAMBlock *rb,
552 unsigned long page)
554 bool ret;
556 ret = test_and_clear_bit(page, rb->bmap);
558 if (ret) {
559 rs->migration_dirty_pages--;
561 return ret;
564 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
565 ram_addr_t start, ram_addr_t length)
567 rs->migration_dirty_pages +=
568 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
569 &rs->num_dirty_pages_period);
573 * ram_pagesize_summary: calculate all the pagesizes of a VM
575 * Returns a summary bitmap of the page sizes of all RAMBlocks
577 * For VMs with just normal pages this is equivalent to the host page
578 * size. If it's got some huge pages then it's the OR of all the
579 * different page sizes.
581 uint64_t ram_pagesize_summary(void)
583 RAMBlock *block;
584 uint64_t summary = 0;
586 RAMBLOCK_FOREACH(block) {
587 summary |= block->page_size;
590 return summary;
593 static void migration_bitmap_sync(RAMState *rs)
595 RAMBlock *block;
596 int64_t end_time;
597 uint64_t bytes_xfer_now;
599 ram_counters.dirty_sync_count++;
601 if (!rs->time_last_bitmap_sync) {
602 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
605 trace_migration_bitmap_sync_start();
606 memory_global_dirty_log_sync();
608 qemu_mutex_lock(&rs->bitmap_mutex);
609 rcu_read_lock();
610 RAMBLOCK_FOREACH(block) {
611 migration_bitmap_sync_range(rs, block, 0, block->used_length);
613 rcu_read_unlock();
614 qemu_mutex_unlock(&rs->bitmap_mutex);
616 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
618 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
620 /* more than 1 second = 1000 milliseconds */
621 if (end_time > rs->time_last_bitmap_sync + 1000) {
622 /* calculate period counters */
623 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
624 / (end_time - rs->time_last_bitmap_sync);
625 bytes_xfer_now = ram_counters.transferred;
627 if (migrate_auto_converge()) {
628 /* The following detection logic can be refined later. For now:
629 Check to see if the dirtied bytes is 50% more than the approx.
630 amount of bytes that just got transferred since the last time we
631 were in this routine. If that happens twice, start or increase
632 throttling */
634 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
635 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
636 (++rs->dirty_rate_high_cnt >= 2)) {
637 trace_migration_throttle();
638 rs->dirty_rate_high_cnt = 0;
639 mig_throttle_guest_down();
643 if (migrate_use_xbzrle()) {
644 if (rs->iterations_prev != rs->iterations) {
645 xbzrle_counters.cache_miss_rate =
646 (double)(xbzrle_counters.cache_miss -
647 rs->xbzrle_cache_miss_prev) /
648 (rs->iterations - rs->iterations_prev);
650 rs->iterations_prev = rs->iterations;
651 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
654 /* reset period counters */
655 rs->time_last_bitmap_sync = end_time;
656 rs->num_dirty_pages_period = 0;
657 rs->bytes_xfer_prev = bytes_xfer_now;
659 if (migrate_use_events()) {
660 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
665 * save_zero_page: send the zero page to the stream
667 * Returns the number of pages written.
669 * @rs: current RAM state
670 * @block: block that contains the page we want to send
671 * @offset: offset inside the block for the page
672 * @p: pointer to the page
674 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
675 uint8_t *p)
677 int pages = -1;
679 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
680 ram_counters.duplicate++;
681 ram_counters.transferred +=
682 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
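/* the fill byte; the destination memsets the whole page with it
 * (see ram_handle_compressed), and it is always 0 here */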
683 qemu_put_byte(rs->f, 0);
684 ram_counters.transferred += 1;
685 pages = 1;
688 return pages;
691 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
693 if (!migrate_release_ram() || !migration_in_postcopy()) {
694 return;
697 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
701 * ram_save_page: send the given page to the stream
703 * Returns the number of pages written.
704 * < 0 - error
705 * >=0 - Number of pages written - this might legally be 0
706 * if xbzrle noticed the page was the same.
708 * @rs: current RAM state
709 * @block: block that contains the page we want to send
710 * @offset: offset inside the block for the page
711 * @last_stage: if we are at the completion stage
713 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
715 int pages = -1;
716 uint64_t bytes_xmit;
717 ram_addr_t current_addr;
718 uint8_t *p;
719 int ret;
720 bool send_async = true;
721 RAMBlock *block = pss->block;
722 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
724 p = block->host + offset;
725 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
727 /* When in doubt, send the page as normal */
728 bytes_xmit = 0;
729 ret = ram_control_save_page(rs->f, block->offset,
730 offset, TARGET_PAGE_SIZE, &bytes_xmit);
731 if (bytes_xmit) {
732 ram_counters.transferred += bytes_xmit;
733 pages = 1;
736 XBZRLE_cache_lock();
738 current_addr = block->offset + offset;
740 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
741 if (ret != RAM_SAVE_CONTROL_DELAYED) {
742 if (bytes_xmit > 0) {
743 ram_counters.normal++;
744 } else if (bytes_xmit == 0) {
745 ram_counters.duplicate++;
748 } else {
749 pages = save_zero_page(rs, block, offset, p);
750 if (pages > 0) {
751 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
752 * page would be stale
754 xbzrle_cache_zero_page(rs, current_addr);
755 ram_release_pages(block->idstr, offset, pages);
756 } else if (!rs->ram_bulk_stage &&
757 !migration_in_postcopy() && migrate_use_xbzrle()) {
758 pages = save_xbzrle_page(rs, &p, current_addr, block,
759 offset, last_stage);
760 if (!last_stage) {
761 /* Can't send this cached data async, since the cache page
762 * might get updated before it gets to the wire
764 send_async = false;
769 /* XBZRLE overflow or normal page */
770 if (pages == -1) {
771 ram_counters.transferred +=
772 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
773 if (send_async) {
774 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
775 migrate_release_ram() &
776 migration_in_postcopy());
777 } else {
778 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
780 ram_counters.transferred += TARGET_PAGE_SIZE;
781 pages = 1;
782 ram_counters.normal++;
785 XBZRLE_cache_unlock();
787 return pages;
790 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
791 ram_addr_t offset)
793 RAMState *rs = ram_state;
794 int bytes_sent, blen;
795 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
797 bytes_sent = save_page_header(rs, f, block, offset |
798 RAM_SAVE_FLAG_COMPRESS_PAGE);
799 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
800 migrate_compress_level());
801 if (blen < 0) {
802 bytes_sent = 0;
803 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
804 error_report("compressed data failed!");
805 } else {
806 bytes_sent += blen;
807 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
810 return bytes_sent;
813 static void flush_compressed_data(RAMState *rs)
815 int idx, len, thread_count;
817 if (!migrate_use_compression()) {
818 return;
820 thread_count = migrate_compress_threads();
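/* First wait until every compression thread has finished its current
 * page, then move each thread's buffered output into the migration
 * stream. */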
822 qemu_mutex_lock(&comp_done_lock);
823 for (idx = 0; idx < thread_count; idx++) {
824 while (!comp_param[idx].done) {
825 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
828 qemu_mutex_unlock(&comp_done_lock);
830 for (idx = 0; idx < thread_count; idx++) {
831 qemu_mutex_lock(&comp_param[idx].mutex);
832 if (!comp_param[idx].quit) {
833 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
834 ram_counters.transferred += len;
836 qemu_mutex_unlock(&comp_param[idx].mutex);
840 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
841 ram_addr_t offset)
843 param->block = block;
844 param->offset = offset;
847 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
848 ram_addr_t offset)
850 int idx, thread_count, bytes_xmit = -1, pages = -1;
852 thread_count = migrate_compress_threads();
853 qemu_mutex_lock(&comp_done_lock);
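/* Look for an idle compression thread: flush whatever it produced last
 * time, hand it the new (block, offset) and wake it up. If all threads
 * are busy, wait on comp_done_cond and retry. */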
854 while (true) {
855 for (idx = 0; idx < thread_count; idx++) {
856 if (comp_param[idx].done) {
857 comp_param[idx].done = false;
858 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
859 qemu_mutex_lock(&comp_param[idx].mutex);
860 set_compress_params(&comp_param[idx], block, offset);
861 qemu_cond_signal(&comp_param[idx].cond);
862 qemu_mutex_unlock(&comp_param[idx].mutex);
863 pages = 1;
864 ram_counters.normal++;
865 ram_counters.transferred += bytes_xmit;
866 break;
869 if (pages > 0) {
870 break;
871 } else {
872 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
875 qemu_mutex_unlock(&comp_done_lock);
877 return pages;
881 * ram_save_compressed_page: compress the given page and send it to the stream
883 * Returns the number of pages written.
885 * @rs: current RAM state
886 * @block: block that contains the page we want to send
887 * @offset: offset inside the block for the page
888 * @last_stage: if we are at the completion stage
890 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
891 bool last_stage)
893 int pages = -1;
894 uint64_t bytes_xmit = 0;
895 uint8_t *p;
896 int ret, blen;
897 RAMBlock *block = pss->block;
898 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
900 p = block->host + offset;
902 ret = ram_control_save_page(rs->f, block->offset,
903 offset, TARGET_PAGE_SIZE, &bytes_xmit);
904 if (bytes_xmit) {
905 ram_counters.transferred += bytes_xmit;
906 pages = 1;
908 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
909 if (ret != RAM_SAVE_CONTROL_DELAYED) {
910 if (bytes_xmit > 0) {
911 ram_counters.normal++;
912 } else if (bytes_xmit == 0) {
913 ram_counters.duplicate++;
916 } else {
917 /* When starting the process of a new block, the first page of
918 * the block should be sent out before other pages in the same
919 * block, and all the pages in last block should have been sent
920 * out, keeping this order is important, because the 'cont' flag
921 * is used to avoid resending the block name.
923 if (block != rs->last_sent_block) {
924 flush_compressed_data(rs);
925 pages = save_zero_page(rs, block, offset, p);
926 if (pages == -1) {
927 /* Make sure the first page is sent out before other pages */
928 bytes_xmit = save_page_header(rs, rs->f, block, offset |
929 RAM_SAVE_FLAG_COMPRESS_PAGE);
930 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
931 migrate_compress_level());
932 if (blen > 0) {
933 ram_counters.transferred += bytes_xmit + blen;
934 ram_counters.normal++;
935 pages = 1;
936 } else {
937 qemu_file_set_error(rs->f, blen);
938 error_report("compressed data failed!");
941 if (pages > 0) {
942 ram_release_pages(block->idstr, offset, pages);
944 } else {
945 pages = save_zero_page(rs, block, offset, p);
946 if (pages == -1) {
947 pages = compress_page_with_multi_thread(rs, block, offset);
948 } else {
949 ram_release_pages(block->idstr, offset, pages);
954 return pages;
958 * find_dirty_block: find the next dirty page and update any state
959 * associated with the search process.
961 * Returns if a page is found
963 * @rs: current RAM state
964 * @pss: data about the state of the current dirty page scan
965 * @again: set to false if the search has scanned the whole of RAM
967 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
969 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
970 if (pss->complete_round && pss->block == rs->last_seen_block &&
971 pss->page >= rs->last_page) {
973 * We've been once around the RAM and haven't found anything.
974 * Give up.
976 *again = false;
977 return false;
979 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
980 /* Didn't find anything in this RAM Block */
981 pss->page = 0;
982 pss->block = QLIST_NEXT_RCU(pss->block, next);
983 if (!pss->block) {
984 /* Hit the end of the list */
985 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
986 /* Flag that we've looped */
987 pss->complete_round = true;
988 rs->ram_bulk_stage = false;
989 if (migrate_use_xbzrle()) {
990 /* If xbzrle is on, stop using the data compression at this
991 * point. In theory, xbzrle can do better than compression.
993 flush_compressed_data(rs);
996 /* Didn't find anything this time, but try again on the new block */
997 *again = true;
998 return false;
999 } else {
1000 /* Can go around again, but... */
1001 *again = true;
1002 /* We've found something so probably don't need to */
1003 return true;
1008 * unqueue_page: gets a page of the queue
1010 * Helper for 'get_queued_page' - gets a page off the queue
1012 * Returns the block of the page (or NULL if none available)
1014 * @rs: current RAM state
1015 * @offset: used to return the offset within the RAMBlock
1017 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1019 RAMBlock *block = NULL;
1021 qemu_mutex_lock(&rs->src_page_req_mutex);
1022 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1023 struct RAMSrcPageRequest *entry =
1024 QSIMPLEQ_FIRST(&rs->src_page_requests);
1025 block = entry->rb;
1026 *offset = entry->offset;
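/* Hand back one target page of the request; shrink the request or, if
 * this was its last page, drop the entry altogether. */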
1028 if (entry->len > TARGET_PAGE_SIZE) {
1029 entry->len -= TARGET_PAGE_SIZE;
1030 entry->offset += TARGET_PAGE_SIZE;
1031 } else {
1032 memory_region_unref(block->mr);
1033 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1034 g_free(entry);
1037 qemu_mutex_unlock(&rs->src_page_req_mutex);
1039 return block;
1043 * get_queued_page: unqueue a page from the postcopy requests
1045 * Skips pages that are already sent (!dirty)
1047 * Returns if a queued page is found
1049 * @rs: current RAM state
1050 * @pss: data about the state of the current dirty page scan
1052 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1054 RAMBlock *block;
1055 ram_addr_t offset;
1056 bool dirty;
1058 do {
1059 block = unqueue_page(rs, &offset);
1061 * We're sending this page, and since it's postcopy nothing else
1062 * will dirty it, and we must make sure it doesn't get sent again
1063 * even if this queue request was received after the background
1064 * search already sent it.
1066 if (block) {
1067 unsigned long page;
1069 page = offset >> TARGET_PAGE_BITS;
1070 dirty = test_bit(page, block->bmap);
1071 if (!dirty) {
1072 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1073 page, test_bit(page, block->unsentmap));
1074 } else {
1075 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1079 } while (block && !dirty);
1081 if (block) {
1083 * As soon as we start servicing pages out of order, then we have
1084 * to kill the bulk stage, since the bulk stage assumes
1085 * in (migration_bitmap_find_and_reset_dirty) that every page is
1086 * dirty, that's no longer true.
1088 rs->ram_bulk_stage = false;
1091 * We want the background search to continue from the queued page
1092 * since the guest is likely to want other pages near to the page
1093 * it just requested.
1095 pss->block = block;
1096 pss->page = offset >> TARGET_PAGE_BITS;
1099 return !!block;
1103 * migration_page_queue_free: drop any remaining pages in the ram
1104 * request queue
1106 * It should be empty at the end anyway, but in error cases there may
1107 * be some left. In case any page is left, we drop it.
1110 static void migration_page_queue_free(RAMState *rs)
1112 struct RAMSrcPageRequest *mspr, *next_mspr;
1113 /* This queue generally should be empty - but in the case of a failed
1114 * migration might have some droppings in.
1116 rcu_read_lock();
1117 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1118 memory_region_unref(mspr->rb->mr);
1119 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1120 g_free(mspr);
1122 rcu_read_unlock();
1126 * ram_save_queue_pages: queue the page for transmission
1128 * A request from postcopy destination for example.
1130 * Returns zero on success or negative on error
1132 * @rbname: Name of the RAMBlock of the request. NULL means the
1133 * same as the last one.
1134 * @start: starting address from the start of the RAMBlock
1135 * @len: length (in bytes) to send
1137 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1139 RAMBlock *ramblock;
1140 RAMState *rs = ram_state;
1142 ram_counters.postcopy_requests++;
1143 rcu_read_lock();
1144 if (!rbname) {
1145 /* Reuse last RAMBlock */
1146 ramblock = rs->last_req_rb;
1148 if (!ramblock) {
1150 * Shouldn't happen, we can't reuse the last RAMBlock if
1151 * it's the 1st request.
1153 error_report("ram_save_queue_pages no previous block");
1154 goto err;
1156 } else {
1157 ramblock = qemu_ram_block_by_name(rbname);
1159 if (!ramblock) {
1160 /* We shouldn't be asked for a non-existent RAMBlock */
1161 error_report("ram_save_queue_pages no block '%s'", rbname);
1162 goto err;
1164 rs->last_req_rb = ramblock;
1166 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1167 if (start+len > ramblock->used_length) {
1168 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1169 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1170 __func__, start, len, ramblock->used_length);
1171 goto err;
1174 struct RAMSrcPageRequest *new_entry =
1175 g_malloc0(sizeof(struct RAMSrcPageRequest));
1176 new_entry->rb = ramblock;
1177 new_entry->offset = start;
1178 new_entry->len = len;
1180 memory_region_ref(ramblock->mr);
1181 qemu_mutex_lock(&rs->src_page_req_mutex);
1182 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1183 qemu_mutex_unlock(&rs->src_page_req_mutex);
1184 rcu_read_unlock();
1186 return 0;
1188 err:
1189 rcu_read_unlock();
1190 return -1;
1194 * ram_save_target_page: save one target page
1196 * Returns the number of pages written
1198 * @rs: current RAM state
1199 * @ms: current migration state
1200 * @pss: data about the page we want to send
1201 * @last_stage: if we are at the completion stage
1203 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1204 bool last_stage)
1206 int res = 0;
1208 /* Check if the page is dirty and if so, send it */
1209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1211 * If xbzrle is on, stop using the data compression after first
1212 * round of migration even if compression is enabled. In theory,
1213 * xbzrle can do better than compression.
1215 if (migrate_use_compression() &&
1216 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1217 res = ram_save_compressed_page(rs, pss, last_stage);
1218 } else {
1219 res = ram_save_page(rs, pss, last_stage);
1222 if (res < 0) {
1223 return res;
1225 if (pss->block->unsentmap) {
1226 clear_bit(pss->page, pss->block->unsentmap);
1230 return res;
1234 * ram_save_host_page: save a whole host page
1236 * Starting at *offset send pages up to the end of the current host
1237 * page. It's valid for the initial offset to point into the middle of
1238 * a host page in which case the remainder of the hostpage is sent.
1239 * Only dirty target pages are sent. Note that the host page size may
1240 * be a huge page for this block.
1241 * The saving stops at the boundary of the used_length of the block
1242 * if the RAMBlock isn't a multiple of the host page size.
1244 * Returns the number of pages written or negative on error
1246 * @rs: current RAM state
1247 * @ms: current migration state
1248 * @pss: data about the page we want to send
1249 * @last_stage: if we are at the completion stage
1251 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1252 bool last_stage)
1254 int tmppages, pages = 0;
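/* number of target pages that make up one host page of this RAMBlock */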
1255 size_t pagesize_bits =
1256 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1258 do {
1259 tmppages = ram_save_target_page(rs, pss, last_stage);
1260 if (tmppages < 0) {
1261 return tmppages;
1264 pages += tmppages;
1265 pss->page++;
1266 } while ((pss->page & (pagesize_bits - 1)) &&
1267 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1269 /* The offset we leave with is the last one we looked at */
1270 pss->page--;
1271 return pages;
1275 * ram_find_and_save_block: finds a dirty page and sends it to f
1277 * Called within an RCU critical section.
1279 * Returns the number of pages written where zero means no dirty pages
1281 * @rs: current RAM state
1282 * @last_stage: if we are at the completion stage
1284 * On systems where host-page-size > target-page-size it will send all the
1285 * pages in a host page that are dirty.
1288 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1290 PageSearchStatus pss;
1291 int pages = 0;
1292 bool again, found;
1294 /* No dirty page as there is zero RAM */
1295 if (!ram_bytes_total()) {
1296 return pages;
1299 pss.block = rs->last_seen_block;
1300 pss.page = rs->last_page;
1301 pss.complete_round = false;
1303 if (!pss.block) {
1304 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1307 do {
1308 again = true;
1309 found = get_queued_page(rs, &pss);
1311 if (!found) {
1312 /* priority queue empty, so just search for something dirty */
1313 found = find_dirty_block(rs, &pss, &again);
1316 if (found) {
1317 pages = ram_save_host_page(rs, &pss, last_stage);
1319 } while (!pages && again);
1321 rs->last_seen_block = pss.block;
1322 rs->last_page = pss.page;
1324 return pages;
1327 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1329 uint64_t pages = size / TARGET_PAGE_SIZE;
1331 if (zero) {
1332 ram_counters.duplicate += pages;
1333 } else {
1334 ram_counters.normal += pages;
1335 ram_counters.transferred += size;
1336 qemu_update_position(f, size);
1340 uint64_t ram_bytes_total(void)
1342 RAMBlock *block;
1343 uint64_t total = 0;
1345 rcu_read_lock();
1346 RAMBLOCK_FOREACH(block) {
1347 total += block->used_length;
1349 rcu_read_unlock();
1350 return total;
1353 void free_xbzrle_decoded_buf(void)
1355 g_free(xbzrle_decoded_buf);
1356 xbzrle_decoded_buf = NULL;
1359 static void ram_migration_cleanup(void *opaque)
1361 RAMState **rsp = opaque;
1362 RAMBlock *block;
1364 /* The caller holds the iothread lock or is in a bh, so there is
1365 * no writing race against this migration_bitmap
1367 memory_global_dirty_log_stop();
1369 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1370 g_free(block->bmap);
1371 block->bmap = NULL;
1372 g_free(block->unsentmap);
1373 block->unsentmap = NULL;
1376 XBZRLE_cache_lock();
1377 if (XBZRLE.cache) {
1378 cache_fini(XBZRLE.cache);
1379 g_free(XBZRLE.encoded_buf);
1380 g_free(XBZRLE.current_buf);
1381 g_free(XBZRLE.zero_target_page);
1382 XBZRLE.cache = NULL;
1383 XBZRLE.encoded_buf = NULL;
1384 XBZRLE.current_buf = NULL;
1385 XBZRLE.zero_target_page = NULL;
1387 XBZRLE_cache_unlock();
1388 migration_page_queue_free(*rsp);
1389 g_free(*rsp);
1390 *rsp = NULL;
1393 static void ram_state_reset(RAMState *rs)
1395 rs->last_seen_block = NULL;
1396 rs->last_sent_block = NULL;
1397 rs->last_page = 0;
1398 rs->last_version = ram_list.version;
1399 rs->ram_bulk_stage = true;
1402 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1405 * 'expected' is the value you expect the bitmap mostly to be full
1406 * of; it won't bother printing lines that are all this value.
1407 * If 'todump' is null the migration bitmap is dumped.
1409 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1410 unsigned long pages)
1412 int64_t cur;
1413 int64_t linelen = 128;
1414 char linebuf[129];
1416 for (cur = 0; cur < pages; cur += linelen) {
1417 int64_t curb;
1418 bool found = false;
1420 * Last line; catch the case where the line length
1421 * is longer than remaining ram
1423 if (cur + linelen > pages) {
1424 linelen = pages - cur;
1426 for (curb = 0; curb < linelen; curb++) {
1427 bool thisbit = test_bit(cur + curb, todump);
1428 linebuf[curb] = thisbit ? '1' : '.';
1429 found = found || (thisbit != expected);
1431 if (found) {
1432 linebuf[curb] = '\0';
1433 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1438 /* **** functions for postcopy ***** */
1440 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1442 struct RAMBlock *block;
1444 RAMBLOCK_FOREACH(block) {
1445 unsigned long *bitmap = block->bmap;
1446 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1447 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1449 while (run_start < range) {
1450 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1451 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1452 (run_end - run_start) << TARGET_PAGE_BITS);
1453 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1459 * postcopy_send_discard_bm_ram: discard a RAMBlock
1461 * Returns zero on success
1463 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1464 * Note: At this point the 'unsentmap' is the processed bitmap combined
1465 * with the dirtymap; so a '1' means it's either dirty or unsent.
1467 * @ms: current migration state
1468 * @pds: state for postcopy
1469 * @block: RAMBlock to discard
1472 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1473 PostcopyDiscardState *pds,
1474 RAMBlock *block)
1476 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1477 unsigned long current;
1478 unsigned long *unsentmap = block->unsentmap;
1480 for (current = 0; current < end; ) {
1481 unsigned long one = find_next_bit(unsentmap, end, current);
1483 if (one <= end) {
1484 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1485 unsigned long discard_length;
1487 if (zero >= end) {
1488 discard_length = end - one;
1489 } else {
1490 discard_length = zero - one;
1492 if (discard_length) {
1493 postcopy_discard_send_range(ms, pds, one, discard_length);
1495 current = one + discard_length;
1496 } else {
1497 current = one;
1501 return 0;
1505 * postcopy_each_ram_send_discard: discard all RAMBlocks
1507 * Returns 0 for success or negative for error
1509 * Utility for the outgoing postcopy code.
1510 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1511 * passing it bitmap indexes and name.
1512 * (qemu_ram_foreach_block ends up passing unscaled lengths
1513 * which would mean postcopy code would have to deal with target page)
1515 * @ms: current migration state
1517 static int postcopy_each_ram_send_discard(MigrationState *ms)
1519 struct RAMBlock *block;
1520 int ret;
1522 RAMBLOCK_FOREACH(block) {
1523 PostcopyDiscardState *pds =
1524 postcopy_discard_send_init(ms, block->idstr);
1527 * Postcopy sends chunks of bitmap over the wire, but it
1528 * just needs indexes at this point, avoids it having
1529 * target page specific code.
1531 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1532 postcopy_discard_send_finish(ms, pds);
1533 if (ret) {
1534 return ret;
1538 return 0;
1542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1544 * Helper for postcopy_chunk_hostpages; it's called twice to
1545 * canonicalize the two bitmaps, that are similar, but one is
1546 * inverted.
1548 * Postcopy requires that all target pages in a hostpage are dirty or
1549 * clean, not a mix. This function canonicalizes the bitmaps.
1551 * @ms: current migration state
1552 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1553 * otherwise we need to canonicalize partially dirty host pages
1554 * @block: block that contains the page we want to canonicalize
1555 * @pds: state for postcopy
1557 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1558 RAMBlock *block,
1559 PostcopyDiscardState *pds)
1561 RAMState *rs = ram_state;
1562 unsigned long *bitmap = block->bmap;
1563 unsigned long *unsentmap = block->unsentmap;
1564 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1565 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1566 unsigned long run_start;
1568 if (block->page_size == TARGET_PAGE_SIZE) {
1569 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1570 return;
1573 if (unsent_pass) {
1574 /* Find a sent page */
1575 run_start = find_next_zero_bit(unsentmap, pages, 0);
1576 } else {
1577 /* Find a dirty page */
1578 run_start = find_next_bit(bitmap, pages, 0);
1581 while (run_start < pages) {
1582 bool do_fixup = false;
1583 unsigned long fixup_start_addr;
1584 unsigned long host_offset;
1587 * If the start of this run of pages is in the middle of a host
1588 * page, then we need to fixup this host page.
1590 host_offset = run_start % host_ratio;
1591 if (host_offset) {
1592 do_fixup = true;
1593 run_start -= host_offset;
1594 fixup_start_addr = run_start;
1595 /* For the next pass */
1596 run_start = run_start + host_ratio;
1597 } else {
1598 /* Find the end of this run */
1599 unsigned long run_end;
1600 if (unsent_pass) {
1601 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1602 } else {
1603 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1606 * If the end isn't at the start of a host page, then the
1607 * run doesn't finish at the end of a host page
1608 * and we need to discard.
1610 host_offset = run_end % host_ratio;
1611 if (host_offset) {
1612 do_fixup = true;
1613 fixup_start_addr = run_end - host_offset;
1615 * This host page has gone, the next loop iteration starts
1616 * from after the fixup
1618 run_start = fixup_start_addr + host_ratio;
1619 } else {
1621 * No discards on this iteration, next loop starts from
1622 * next sent/dirty page
1624 run_start = run_end + 1;
1628 if (do_fixup) {
1629 unsigned long page;
1631 /* Tell the destination to discard this page */
1632 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1633 /* For the unsent_pass we:
1634 * discard partially sent pages
1635 * For the !unsent_pass (dirty) we:
1636 * discard partially dirty pages that were sent
1637 * (any partially sent pages were already discarded
1638 * by the previous unsent_pass)
1640 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1641 host_ratio);
1644 /* Clean up the bitmap */
1645 for (page = fixup_start_addr;
1646 page < fixup_start_addr + host_ratio; page++) {
1647 /* All pages in this host page are now not sent */
1648 set_bit(page, unsentmap);
1651 * Remark them as dirty, updating the count for any pages
1652 * that weren't previously dirty.
1654 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1658 if (unsent_pass) {
1659 /* Find the next sent page for the next iteration */
1660 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1661 } else {
1662 /* Find the next dirty page for the next iteration */
1663 run_start = find_next_bit(bitmap, pages, run_start);
1669 * postcopy_chunk_hostpages: discard any partially sent host page
1671 * Utility for the outgoing postcopy code.
1673 * Discard any partially sent host-page size chunks, mark any partially
1674 * dirty host-page size chunks as all dirty. In this case the host-page
1675 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1677 * Returns zero on success
1679 * @ms: current migration state
1680 * @block: block we want to work with
1682 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1684 PostcopyDiscardState *pds =
1685 postcopy_discard_send_init(ms, block->idstr);
1687 /* First pass: Discard all partially sent host pages */
1688 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1690 * Second pass: Ensure that all partially dirty host pages are made
1691 * fully dirty.
1693 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1695 postcopy_discard_send_finish(ms, pds);
1696 return 0;
1700 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1702 * Returns zero on success
1704 * Transmit the set of pages to be discarded after precopy to the target;
1705 * these are pages that:
1706 * a) Have been previously transmitted but are now dirty again
1707 * b) Pages that have never been transmitted, this ensures that
1708 * any pages on the destination that have been mapped by background
1709 * tasks get discarded (transparent huge pages is the specific concern)
1710 * Hopefully this is pretty sparse
1712 * @ms: current migration state
1714 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1716 RAMState *rs = ram_state;
1717 RAMBlock *block;
1718 int ret;
1720 rcu_read_lock();
1722 /* This should be our last sync, the src is now paused */
1723 migration_bitmap_sync(rs);
1725 /* Easiest way to make sure we don't resume in the middle of a host-page */
1726 rs->last_seen_block = NULL;
1727 rs->last_sent_block = NULL;
1728 rs->last_page = 0;
1730 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1731 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1732 unsigned long *bitmap = block->bmap;
1733 unsigned long *unsentmap = block->unsentmap;
1735 if (!unsentmap) {
1736 /* We don't have a safe way to resize the sentmap, so
1737 * if the bitmap was resized it will be NULL at this
1738 * point.
1740 error_report("migration ram resized during precopy phase");
1741 rcu_read_unlock();
1742 return -EINVAL;
1744 /* Deal with TPS != HPS and huge pages */
1745 ret = postcopy_chunk_hostpages(ms, block);
1746 if (ret) {
1747 rcu_read_unlock();
1748 return ret;
1752 * Update the unsentmap to be unsentmap = unsentmap | dirty
1754 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1755 #ifdef DEBUG_POSTCOPY
1756 ram_debug_dump_bitmap(unsentmap, true, pages);
1757 #endif
1759 trace_ram_postcopy_send_discard_bitmap();
1761 ret = postcopy_each_ram_send_discard(ms);
1762 rcu_read_unlock();
1764 return ret;
1768 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1770 * Returns zero on success
1772 * @rbname: name of the RAMBlock of the request. NULL means the
1773 * same as the last one.
1774 * @start: RAMBlock starting page
1775 * @length: RAMBlock size
1777 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1779 int ret = -1;
1781 trace_ram_discard_range(rbname, start, length);
1783 rcu_read_lock();
1784 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1786 if (!rb) {
1787 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1788 goto err;
1791 ret = ram_block_discard_range(rb, start, length);
1793 err:
1794 rcu_read_unlock();
1796 return ret;
1799 static int ram_state_init(RAMState **rsp)
1801 *rsp = g_new0(RAMState, 1);
1803 qemu_mutex_init(&(*rsp)->bitmap_mutex);
1804 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1805 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1807 if (migrate_use_xbzrle()) {
1808 XBZRLE_cache_lock();
1809 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1810 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1811 TARGET_PAGE_SIZE,
1812 TARGET_PAGE_SIZE);
1813 if (!XBZRLE.cache) {
1814 XBZRLE_cache_unlock();
1815 error_report("Error creating cache");
1816 g_free(*rsp);
1817 *rsp = NULL;
1818 return -1;
1820 XBZRLE_cache_unlock();
1822 /* We prefer not to abort if there is no memory */
1823 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1824 if (!XBZRLE.encoded_buf) {
1825 error_report("Error allocating encoded_buf");
1826 g_free(*rsp);
1827 *rsp = NULL;
1828 return -1;
1831 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1832 if (!XBZRLE.current_buf) {
1833 error_report("Error allocating current_buf");
1834 g_free(XBZRLE.encoded_buf);
1835 XBZRLE.encoded_buf = NULL;
1836 g_free(*rsp);
1837 *rsp = NULL;
1838 return -1;
1842 /* For memory_global_dirty_log_start below. */
1843 qemu_mutex_lock_iothread();
1845 qemu_mutex_lock_ramlist();
1846 rcu_read_lock();
1847 ram_state_reset(*rsp);
1849 /* Skip setting bitmap if there is no RAM */
1850 if (ram_bytes_total()) {
1851 RAMBlock *block;
1853 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1854 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1856 block->bmap = bitmap_new(pages);
1857 bitmap_set(block->bmap, 0, pages);
1858 if (migrate_postcopy_ram()) {
1859 block->unsentmap = bitmap_new(pages);
1860 bitmap_set(block->unsentmap, 0, pages);
1866 * Count the total number of pages used by ram blocks not including any
1867 * gaps due to alignment or unplugs.
1869 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1871 memory_global_dirty_log_start();
1872 migration_bitmap_sync(*rsp);
1873 qemu_mutex_unlock_ramlist();
1874 qemu_mutex_unlock_iothread();
1875 rcu_read_unlock();
1877 return 0;
1881 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1882 * a long-running RCU critical section. When rcu-reclaims in the code
1883 * start to become numerous it will be necessary to reduce the
1884 * granularity of these critical sections.
1888 * ram_save_setup: Setup RAM for migration
1890 * Returns zero to indicate success and negative for error
1892 * @f: QEMUFile where to send the data
1893 * @opaque: RAMState pointer
1895 static int ram_save_setup(QEMUFile *f, void *opaque)
1897 RAMState **rsp = opaque;
1898 RAMBlock *block;
1900 /* migration has already set up the bitmap, reuse it. */
1901 if (!migration_in_colo_state()) {
1902 if (ram_state_init(rsp) != 0) {
1903 return -1;
1906 (*rsp)->f = f;
1908 rcu_read_lock();
1910 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1912 RAMBLOCK_FOREACH(block) {
1913 qemu_put_byte(f, strlen(block->idstr));
1914 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1915 qemu_put_be64(f, block->used_length);
1916 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1917 qemu_put_be64(f, block->page_size);
1921 rcu_read_unlock();
1923 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1924 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1926 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1928 return 0;
1932 * ram_save_iterate: iterative stage for migration
1934 * Returns zero to indicate success and negative for error
1936 * @f: QEMUFile where to send the data
1937 * @opaque: RAMState pointer
1939 static int ram_save_iterate(QEMUFile *f, void *opaque)
1941 RAMState **temp = opaque;
1942 RAMState *rs = *temp;
1943 int ret;
1944 int i;
1945 int64_t t0;
1946 int done = 0;
1948 rcu_read_lock();
1949 if (ram_list.version != rs->last_version) {
1950 ram_state_reset(rs);
1953 /* Read version before ram_list.blocks */
1954 smp_rmb();
1956 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1958 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1959 i = 0;
1960 while ((ret = qemu_file_rate_limit(f)) == 0) {
1961 int pages;
1963 pages = ram_find_and_save_block(rs, false);
1964 /* no more pages to send */
1965 if (pages == 0) {
1966 done = 1;
1967 break;
1969 rs->iterations++;
1971 /* we want to check in the 1st loop, just in case it was the 1st time
1972 and we had to sync the dirty bitmap.
1973 qemu_clock_get_ns() is a bit expensive, so we only check every few
1974 iterations
1976 if ((i & 63) == 0) {
1977 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1978 if (t1 > MAX_WAIT) {
1979 trace_ram_save_iterate_big_wait(t1, i);
1980 break;
1983 i++;
1985 flush_compressed_data(rs);
1986 rcu_read_unlock();
1989 * Must occur before EOS (or any QEMUFile operation)
1990 * because of RDMA protocol.
1992 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1994 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1995 ram_counters.transferred += 8;
1997 ret = qemu_file_get_error(f);
1998 if (ret < 0) {
1999 return ret;
2002 return done;
2006 * ram_save_complete: function called to send the remaining amount of ram
2008 * Returns zero to indicate success
2010 * Called with iothread lock
2012 * @f: QEMUFile where to send the data
2013 * @opaque: RAMState pointer
2015 static int ram_save_complete(QEMUFile *f, void *opaque)
2017 RAMState **temp = opaque;
2018 RAMState *rs = *temp;
2020 rcu_read_lock();
2022 if (!migration_in_postcopy()) {
2023 migration_bitmap_sync(rs);
2026 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2028 /* try transferring iterative blocks of memory */
2030 /* flush all remaining blocks regardless of rate limiting */
2031 while (true) {
2032 int pages;
2034 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2035 /* no more blocks to send */
2036 if (pages == 0) {
2037 break;
2041 flush_compressed_data(rs);
2042 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2044 rcu_read_unlock();
2046 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2048 return 0;
2051 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2052 uint64_t *non_postcopiable_pending,
2053 uint64_t *postcopiable_pending)
2055 RAMState **temp = opaque;
2056 RAMState *rs = *temp;
2057 uint64_t remaining_size;
2059 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2061 if (!migration_in_postcopy() &&
2062 remaining_size < max_size) {
2063 qemu_mutex_lock_iothread();
2064 rcu_read_lock();
2065 migration_bitmap_sync(rs);
2066 rcu_read_unlock();
2067 qemu_mutex_unlock_iothread();
2068 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2071 /* We can do postcopy, and all the data is postcopiable */
2072 *postcopiable_pending += remaining_size;
2075 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2077 unsigned int xh_len;
2078 int xh_flags;
2079 uint8_t *loaded_data;
2081 if (!xbzrle_decoded_buf) {
2082 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2084 loaded_data = xbzrle_decoded_buf;
2086 /* extract RLE header */
2087 xh_flags = qemu_get_byte(f);
2088 xh_len = qemu_get_be16(f);
2090 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2091 error_report("Failed to load XBZRLE page - wrong compression!");
2092 return -1;
2095 if (xh_len > TARGET_PAGE_SIZE) {
2096 error_report("Failed to load XBZRLE page - len overflow!");
2097 return -1;
2099 /* load data and decode */
2100 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2102 /* decode RLE */
2103 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2104 TARGET_PAGE_SIZE) == -1) {
2105 error_report("Failed to load XBZRLE page - decode error!");
2106 return -1;
2109 return 0;
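/*
 * Illustrative sketch, not part of the original file: load_xbzrle() above
 * implies the wire layout of an XBZRLE page record,
 *   <xh_flags:1 byte == ENCODING_FLAG_XBZRLE><xh_len:be16><xh_len encoded bytes>
 * so the sender side (the XBZRLE save path earlier in this file) has to emit
 * roughly the following; encoded_buf/encoded_len are placeholder names:
 */
#if 0   /* sketch only, never compiled */
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, encoded_buf, encoded_len);
#endif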
2113 * ram_block_from_stream: read a RAMBlock id from the migration stream
2115 * Must be called from within an RCU critical section.
2117 * Returns a pointer from within the RCU-protected ram_list.
2119 * @f: QEMUFile where to read the data from
2120 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2122 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2124 static RAMBlock *block = NULL;
2125 char id[256];
2126 uint8_t len;
2128 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2129 if (!block) {
2130 error_report("Ack, bad migration stream!");
2131 return NULL;
2133 return block;
2136 len = qemu_get_byte(f);
2137 qemu_get_buffer(f, (uint8_t *)id, len);
2138 id[len] = 0;
2140 block = qemu_ram_block_by_name(id);
2141 if (!block) {
2142 error_report("Can't find block %s", id);
2143 return NULL;
2146 return block;
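/*
 * Illustrative sketch, not part of the original file: because 'block' above
 * is static, a run of pages from the same RAMBlock needs its idstr on the
 * wire only once; later records set RAM_SAVE_FLAG_CONTINUE and omit it.
 * From the reader's side:
 */
#if 0   /* sketch only, never compiled */
    /* first page of a run: <len:1><idstr:len> follows the addr/flags word */
    RAMBlock *rb = ram_block_from_stream(f, flags);
    /* subsequent pages of the run: no idstr, the cached block is reused */
    rb = ram_block_from_stream(f, flags | RAM_SAVE_FLAG_CONTINUE);
#endif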
2149 static inline void *host_from_ram_block_offset(RAMBlock *block,
2150 ram_addr_t offset)
2152 if (!offset_in_ramblock(block, offset)) {
2153 return NULL;
2156 return block->host + offset;
2160 * ram_handle_compressed: handle the zero page case
2162 * If a page (or a whole RDMA chunk) has been
2163 * determined to be zero, then zap it.
2165 * @host: host address for the zero page
2166 * @ch: the byte the page is filled with; only zero is supported
2167 * @size: size of the zero page
2169 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2171 if (ch != 0 || !is_zero_range(host, size)) {
2172 memset(host, ch, size);
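/*
 * Illustrative sketch, not part of the original file: the condition above
 * only writes to the destination when it actually has to, so pages that
 * already read as zero (e.g. never-touched anonymous memory) are left
 * untouched instead of being dirtied by a redundant memset.  Spelled out:
 */
#if 0   /* sketch only, never compiled */
    if (ch == 0 && is_zero_range(host, size)) {
        /* already zero: do nothing, avoid faulting/dirtying the page */
    } else {
        memset(host, ch, size);
    }
#endif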
2176 static void *do_data_decompress(void *opaque)
2178 DecompressParam *param = opaque;
2179 unsigned long pagesize;
2180 uint8_t *des;
2181 int len;
2183 qemu_mutex_lock(&param->mutex);
2184 while (!param->quit) {
2185 if (param->des) {
2186 des = param->des;
2187 len = param->len;
2188 param->des = 0;
2189 qemu_mutex_unlock(&param->mutex);
2191 pagesize = TARGET_PAGE_SIZE;
2192 /* uncompress() may fail in some cases, especially when the
2193 * page was dirtied while it was being compressed.  That is not
2194 * a problem: the dirty page will be retransmitted, and
2195 * uncompress() won't corrupt the data in other pages.
2197 uncompress((Bytef *)des, &pagesize,
2198 (const Bytef *)param->compbuf, len);
2200 qemu_mutex_lock(&decomp_done_lock);
2201 param->done = true;
2202 qemu_cond_signal(&decomp_done_cond);
2203 qemu_mutex_unlock(&decomp_done_lock);
2205 qemu_mutex_lock(&param->mutex);
2206 } else {
2207 qemu_cond_wait(&param->cond, &param->mutex);
2210 qemu_mutex_unlock(&param->mutex);
2212 return NULL;
2215 static void wait_for_decompress_done(void)
2217 int idx, thread_count;
2219 if (!migrate_use_compression()) {
2220 return;
2223 thread_count = migrate_decompress_threads();
2224 qemu_mutex_lock(&decomp_done_lock);
2225 for (idx = 0; idx < thread_count; idx++) {
2226 while (!decomp_param[idx].done) {
2227 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2230 qemu_mutex_unlock(&decomp_done_lock);
2233 void migrate_decompress_threads_create(void)
2235 int i, thread_count;
2237 if (!migrate_use_compression()) {
2238 return;
2240 thread_count = migrate_decompress_threads();
2241 decompress_threads = g_new0(QemuThread, thread_count);
2242 decomp_param = g_new0(DecompressParam, thread_count);
2243 qemu_mutex_init(&decomp_done_lock);
2244 qemu_cond_init(&decomp_done_cond);
2245 for (i = 0; i < thread_count; i++) {
2246 qemu_mutex_init(&decomp_param[i].mutex);
2247 qemu_cond_init(&decomp_param[i].cond);
2248 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2249 decomp_param[i].done = true;
2250 decomp_param[i].quit = false;
2251 qemu_thread_create(decompress_threads + i, "decompress",
2252 do_data_decompress, decomp_param + i,
2253 QEMU_THREAD_JOINABLE);
2257 void migrate_decompress_threads_join(void)
2259 int i, thread_count;
2261 if (!migrate_use_compression()) {
2262 return;
2264 thread_count = migrate_decompress_threads();
2265 for (i = 0; i < thread_count; i++) {
2266 qemu_mutex_lock(&decomp_param[i].mutex);
2267 decomp_param[i].quit = true;
2268 qemu_cond_signal(&decomp_param[i].cond);
2269 qemu_mutex_unlock(&decomp_param[i].mutex);
2271 for (i = 0; i < thread_count; i++) {
2272 qemu_thread_join(decompress_threads + i);
2273 qemu_mutex_destroy(&decomp_param[i].mutex);
2274 qemu_cond_destroy(&decomp_param[i].cond);
2275 g_free(decomp_param[i].compbuf);
2277 g_free(decompress_threads);
2278 g_free(decomp_param);
2279 decompress_threads = NULL;
2280 decomp_param = NULL;
2283 static void decompress_data_with_multi_threads(QEMUFile *f,
2284 void *host, int len)
2286 int idx, thread_count;
2288 thread_count = migrate_decompress_threads();
2289 qemu_mutex_lock(&decomp_done_lock);
2290 while (true) {
2291 for (idx = 0; idx < thread_count; idx++) {
2292 if (decomp_param[idx].done) {
2293 decomp_param[idx].done = false;
2294 qemu_mutex_lock(&decomp_param[idx].mutex);
2295 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2296 decomp_param[idx].des = host;
2297 decomp_param[idx].len = len;
2298 qemu_cond_signal(&decomp_param[idx].cond);
2299 qemu_mutex_unlock(&decomp_param[idx].mutex);
2300 break;
2303 if (idx < thread_count) {
2304 break;
2305 } else {
2306 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2309 qemu_mutex_unlock(&decomp_done_lock);
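/*
 * Illustrative sketch, not part of the original file: the hand-off between
 * decompress_data_with_multi_threads() (producer) and do_data_decompress()
 * (worker) in the code above uses one condition variable per direction; the
 * producer waits on decomp_done_cond whenever no worker slot is idle.
 * The two signalling steps, condensed:
 */
#if 0   /* sketch only, never compiled */
    /* producer -> worker: hand a page to an idle worker slot idx */
    qemu_mutex_lock(&decomp_param[idx].mutex);
    decomp_param[idx].done = false;
    /* ...fill decomp_param[idx].compbuf / .des / .len here... */
    qemu_cond_signal(&decomp_param[idx].cond);
    qemu_mutex_unlock(&decomp_param[idx].mutex);

    /* worker -> producer: report the slot idle again once decompressed */
    qemu_mutex_lock(&decomp_done_lock);
    decomp_param[idx].done = true;
    qemu_cond_signal(&decomp_done_cond);
    qemu_mutex_unlock(&decomp_done_lock);
#endif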
2313 * ram_postcopy_incoming_init: allocate postcopy data structures
2315 * Returns 0 for success and negative on error
2317 * @mis: current migration incoming state
2319 * Allocate the data structures etc. needed by incoming migration with
2320 * postcopy-ram; postcopy-ram's similarly named
2321 * postcopy_ram_incoming_init() does the work.
2323 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2325 unsigned long ram_pages = last_ram_page();
2327 return postcopy_ram_incoming_init(mis, ram_pages);
2331 * ram_load_postcopy: load a page in postcopy case
2333 * Returns 0 for success or -errno in case of error
2335 * Called in postcopy mode by ram_load().
2336 * rcu_read_lock is taken prior to this being called.
2338 * @f: QEMUFile to read the data from
2340 static int ram_load_postcopy(QEMUFile *f)
2342 int flags = 0, ret = 0;
2343 bool place_needed = false;
2344 bool matching_page_sizes = false;
2345 MigrationIncomingState *mis = migration_incoming_get_current();
2346 /* Temporary page that is later 'placed' */
2347 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2348 void *last_host = NULL;
2349 bool all_zero = false;
2351 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2352 ram_addr_t addr;
2353 void *host = NULL;
2354 void *page_buffer = NULL;
2355 void *place_source = NULL;
2356 RAMBlock *block = NULL;
2357 uint8_t ch;
2359 addr = qemu_get_be64(f);
2360 flags = addr & ~TARGET_PAGE_MASK;
2361 addr &= TARGET_PAGE_MASK;
2363 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2364 place_needed = false;
2365 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2366 block = ram_block_from_stream(f, flags);
2368 host = host_from_ram_block_offset(block, addr);
2369 if (!host) {
2370 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2371 ret = -EINVAL;
2372 break;
2374 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2376 * Postcopy requires that we place whole host pages atomically;
2377 * these may be huge pages for RAMBlocks that are backed by
2378 * hugetlbfs.
2379 * To make it atomic, the data is read into a temporary page
2380 * that's moved into place later.
2381 * The migration protocol uses, possibly smaller, target pages;
2382 * however, the source ensures it always sends all the components
2383 * of a host page in order.
2385 page_buffer = postcopy_host_page +
2386 ((uintptr_t)host & (block->page_size - 1));
2387 /* If all target pages are zero then we can optimise the placement */
2388 if (!((uintptr_t)host & (block->page_size - 1))) {
2389 all_zero = true;
2390 } else {
2391 /* not the first target page within the host page */
2392 if (host != (last_host + TARGET_PAGE_SIZE)) {
2393 error_report("Non-sequential target page %p/%p",
2394 host, last_host);
2395 ret = -EINVAL;
2396 break;
2402 * If it's the last part of a host page then we place the host
2403 * page
2405 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2406 (block->page_size - 1)) == 0;
2407 place_source = postcopy_host_page;
2409 last_host = host;
2411 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2412 case RAM_SAVE_FLAG_ZERO:
2413 ch = qemu_get_byte(f);
2414 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2415 if (ch) {
2416 all_zero = false;
2418 break;
2420 case RAM_SAVE_FLAG_PAGE:
2421 all_zero = false;
2422 if (!place_needed || !matching_page_sizes) {
2423 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2424 } else {
2425 /* Avoid the copy out of the qemu_file buffer here, since
2426 * postcopy is going to copy the page into place later anyway;
2427 * this only works when we read the page in one go (matching page sizes).
2429 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2430 TARGET_PAGE_SIZE);
2432 break;
2433 case RAM_SAVE_FLAG_EOS:
2434 /* normal exit */
2435 break;
2436 default:
2437 error_report("Unknown combination of migration flags: %#x"
2438 " (postcopy mode)", flags);
2439 ret = -EINVAL;
2442 if (place_needed) {
2443 /* This gets called at the last target page in the host page */
2444 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2446 if (all_zero) {
2447 ret = postcopy_place_page_zero(mis, place_dest,
2448 block->page_size);
2449 } else {
2450 ret = postcopy_place_page(mis, place_dest,
2451 place_source, block->page_size);
2454 if (!ret) {
2455 ret = qemu_file_get_error(f);
2459 return ret;
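/*
 * Illustrative sketch, not part of the original file: for a RAMBlock backed
 * by 2MB hugepages and 4KB target pages, the loop above stages 512
 * consecutive target pages into postcopy_host_page and only places the host
 * page once its last target page has been read.  The placement step alone:
 */
#if 0   /* sketch only, never compiled */
    bool last_tp = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                    (block->page_size - 1)) == 0;
    if (last_tp) {
        /* 'host' points at the last target page; rewind to the host page start */
        void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
        ret = postcopy_place_page(mis, place_dest, postcopy_host_page,
                                  block->page_size);
    }
#endif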
2462 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2464 int flags = 0, ret = 0, invalid_flags = 0;
2465 static uint64_t seq_iter;
2466 int len = 0;
2468 * If the system is running in postcopy mode, page inserts into host memory
2469 * must be atomic.
2471 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2472 /* ADVISE comes earlier; it indicates the source has the postcopy capability enabled */
2473 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2475 seq_iter++;
2477 if (version_id != 4) {
2478 ret = -EINVAL;
2481 if (!migrate_use_compression()) {
2482 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2484 /* This RCU critical section can be very long running.
2485 * If RCU reclamation in this code starts to become frequent,
2486 * it will be necessary to reduce the granularity of this
2487 * critical section.
2489 rcu_read_lock();
2491 if (postcopy_running) {
2492 ret = ram_load_postcopy(f);
2495 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2496 ram_addr_t addr, total_ram_bytes;
2497 void *host = NULL;
2498 uint8_t ch;
2500 addr = qemu_get_be64(f);
2501 flags = addr & ~TARGET_PAGE_MASK;
2502 addr &= TARGET_PAGE_MASK;
2504 if (flags & invalid_flags) {
2505 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2506 error_report("Received an unexpected compressed page");
2509 ret = -EINVAL;
2510 break;
2513 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2514 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2515 RAMBlock *block = ram_block_from_stream(f, flags);
2517 host = host_from_ram_block_offset(block, addr);
2518 if (!host) {
2519 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2520 ret = -EINVAL;
2521 break;
2523 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2526 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2527 case RAM_SAVE_FLAG_MEM_SIZE:
2528 /* Synchronize RAM block list */
2529 total_ram_bytes = addr;
2530 while (!ret && total_ram_bytes) {
2531 RAMBlock *block;
2532 char id[256];
2533 ram_addr_t length;
2535 len = qemu_get_byte(f);
2536 qemu_get_buffer(f, (uint8_t *)id, len);
2537 id[len] = 0;
2538 length = qemu_get_be64(f);
2540 block = qemu_ram_block_by_name(id);
2541 if (block) {
2542 if (length != block->used_length) {
2543 Error *local_err = NULL;
2545 ret = qemu_ram_resize(block, length,
2546 &local_err);
2547 if (local_err) {
2548 error_report_err(local_err);
2551 /* For postcopy we need to check that hugepage sizes match */
2552 if (postcopy_advised &&
2553 block->page_size != qemu_host_page_size) {
2554 uint64_t remote_page_size = qemu_get_be64(f);
2555 if (remote_page_size != block->page_size) {
2556 error_report("Mismatched RAM page size %s "
2557 "(local) %zd != %" PRId64,
2558 id, block->page_size,
2559 remote_page_size);
2560 ret = -EINVAL;
2563 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2564 block->idstr);
2565 } else {
2566 error_report("Unknown ramblock \"%s\", cannot "
2567 "accept migration", id);
2568 ret = -EINVAL;
2571 total_ram_bytes -= length;
2573 break;
2575 case RAM_SAVE_FLAG_ZERO:
2576 ch = qemu_get_byte(f);
2577 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2578 break;
2580 case RAM_SAVE_FLAG_PAGE:
2581 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2582 break;
2584 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2585 len = qemu_get_be32(f);
2586 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2587 error_report("Invalid compressed data length: %d", len);
2588 ret = -EINVAL;
2589 break;
2591 decompress_data_with_multi_threads(f, host, len);
2592 break;
2594 case RAM_SAVE_FLAG_XBZRLE:
2595 if (load_xbzrle(f, addr, host) < 0) {
2596 error_report("Failed to decompress XBZRLE page at "
2597 RAM_ADDR_FMT, addr);
2598 ret = -EINVAL;
2599 break;
2601 break;
2602 case RAM_SAVE_FLAG_EOS:
2603 /* normal exit */
2604 break;
2605 default:
2606 if (flags & RAM_SAVE_FLAG_HOOK) {
2607 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2608 } else {
2609 error_report("Unknown combination of migration flags: %#x",
2610 flags);
2611 ret = -EINVAL;
2614 if (!ret) {
2615 ret = qemu_file_get_error(f);
2619 wait_for_decompress_done();
2620 rcu_read_unlock();
2621 trace_ram_load_complete(ret, seq_iter);
2622 return ret;
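/*
 * Illustrative sketch, not part of the original file: every record parsed by
 * ram_load() above starts with a single be64 word that packs the page offset
 * and the RAM_SAVE_FLAG_* bits into the sub-page bits:
 */
#if 0   /* sketch only, never compiled */
    uint64_t header = qemu_get_be64(f);
    int flags       = header & ~TARGET_PAGE_MASK;  /* low bits: flags   */
    ram_addr_t addr = header & TARGET_PAGE_MASK;   /* high bits: offset */
    /*
     * The payload then depends on flags: a block id list for _MEM_SIZE, a
     * single fill byte for _ZERO, TARGET_PAGE_SIZE raw bytes for _PAGE,
     * <len:be32><len bytes> for _COMPRESS_PAGE, an XBZRLE record for
     * _XBZRLE, and nothing for _EOS.
     */
#endif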
2625 static SaveVMHandlers savevm_ram_handlers = {
2626 .save_live_setup = ram_save_setup,
2627 .save_live_iterate = ram_save_iterate,
2628 .save_live_complete_postcopy = ram_save_complete,
2629 .save_live_complete_precopy = ram_save_complete,
2630 .save_live_pending = ram_save_pending,
2631 .load_state = ram_load,
2632 .cleanup = ram_migration_cleanup,
2635 void ram_mig_init(void)
2637 qemu_mutex_init(&XBZRLE.lock);
2638 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);