ram: Move dup_pages into RAMState
migration/ram.c  (qemu/kevin.git)
blob cdd56b7c3394407880142ffa6a0cb3c136cbc6b1
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
48 /***********************************************************/
49 /* ram save/restore */
51 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52 #define RAM_SAVE_FLAG_COMPRESS 0x02
53 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
54 #define RAM_SAVE_FLAG_PAGE 0x08
55 #define RAM_SAVE_FLAG_EOS 0x10
56 #define RAM_SAVE_FLAG_CONTINUE 0x20
57 #define RAM_SAVE_FLAG_XBZRLE 0x40
58 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
59 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
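
/*
 * A minimal sketch (not part of ram.c itself) of how the flags above travel
 * on the wire: save_page_header() below ORs them into the low bits of the
 * target-page-aligned page offset and emits a single be64 word, and the
 * receiver splits them apart again.  The example_* helpers are hypothetical
 * names used only for illustration.
 */
static inline uint64_t example_pack_offset(ram_addr_t offset, uint64_t flags)
{
    /* the low bits of a page-aligned offset are free, so flags fit there */
    return (offset & TARGET_PAGE_MASK) | flags;
}

static inline void example_unpack_offset(uint64_t wire, ram_addr_t *offset,
                                         uint64_t *flags)
{
    *flags  = wire & ~TARGET_PAGE_MASK;
    *offset = wire & TARGET_PAGE_MASK;
}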
61 static uint8_t *ZERO_TARGET_PAGE;
63 static inline bool is_zero_range(uint8_t *p, uint64_t size)
65 return buffer_is_zero(p, size);
68 /* This struct contains the XBZRLE cache and a static page
69 used by the compression */
70 static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78 } XBZRLE;
80 /* buffer used for XBZRLE decoding */
81 static uint8_t *xbzrle_decoded_buf;
83 static void XBZRLE_cache_lock(void)
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
89 static void XBZRLE_cache_unlock(void)
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
95 /**
96 * xbzrle_cache_resize: resize the xbzrle cache
98 * This function is called from qmp_migrate_set_cache_size in the main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
103 * Returns the new_size or negative in case of error.
105 * @new_size: new cache size
107 int64_t xbzrle_cache_resize(int64_t new_size)
109 PageCache *new_cache;
110 int64_t ret;
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
116 XBZRLE_cache_lock();
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
134 out_new_size:
135 ret = pow2floor(new_size);
136 out:
137 XBZRLE_cache_unlock();
138 return ret;
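
/*
 * Usage sketch, assuming a caller similar to qmp_migrate_set_cache_size():
 * the returned value is the size actually used (rounded down to a power of
 * two), or negative on failure.  example_resize_cache() is a hypothetical
 * wrapper, not a real QEMU function.
 */
static int64_t example_resize_cache(int64_t requested_bytes)
{
    int64_t actual = xbzrle_cache_resize(requested_bytes);

    if (actual < 0) {
        error_report("XBZRLE cache resize to %" PRId64 " bytes failed",
                     requested_bytes);
    }
    return actual;   /* e.g. a request of 5 MiB ends up as 4 MiB */
}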
141 /* State of RAM for migration */
142 struct RAMState {
143 /* Last block that we have visited searching for dirty pages */
144 RAMBlock *last_seen_block;
145 /* Last block from where we have sent data */
146 RAMBlock *last_sent_block;
147 /* Last offset we have sent data from */
148 ram_addr_t last_offset;
149 /* last ram version we have seen */
150 uint32_t last_version;
151 /* We are in the first round */
152 bool ram_bulk_stage;
153 /* How many times we have dirty too many pages */
154 int dirty_rate_high_cnt;
155 /* How many times we have synchronized the bitmap */
156 uint64_t bitmap_sync_count;
157 /* these variables are used for bitmap sync */
158 /* last time we did a full bitmap_sync */
159 int64_t time_last_bitmap_sync;
160 /* bytes transferred at start_time */
161 uint64_t bytes_xfer_prev;
162 /* number of dirty pages since start_time */
163 uint64_t num_dirty_pages_period;
164 /* xbzrle misses since the beginning of the period */
165 uint64_t xbzrle_cache_miss_prev;
166 /* number of iterations at the beginning of period */
167 uint64_t iterations_prev;
168 /* Accounting fields */
169 /* number of zero pages. It used to be pages filled by the same char. */
170 uint64_t zero_pages;
172 typedef struct RAMState RAMState;
174 static RAMState ram_state;
176 /* accounting for migration statistics */
177 typedef struct AccountingInfo {
178 uint64_t skipped_pages;
179 uint64_t norm_pages;
180 uint64_t iterations;
181 uint64_t xbzrle_bytes;
182 uint64_t xbzrle_pages;
183 uint64_t xbzrle_cache_miss;
184 double xbzrle_cache_miss_rate;
185 uint64_t xbzrle_overflows;
186 } AccountingInfo;
188 static AccountingInfo acct_info;
190 static void acct_clear(void)
192 memset(&acct_info, 0, sizeof(acct_info));
195 uint64_t dup_mig_bytes_transferred(void)
197 return ram_state.zero_pages * TARGET_PAGE_SIZE;
200 uint64_t dup_mig_pages_transferred(void)
202 return ram_state.zero_pages;
205 uint64_t skipped_mig_bytes_transferred(void)
207 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
210 uint64_t skipped_mig_pages_transferred(void)
212 return acct_info.skipped_pages;
215 uint64_t norm_mig_bytes_transferred(void)
217 return acct_info.norm_pages * TARGET_PAGE_SIZE;
220 uint64_t norm_mig_pages_transferred(void)
222 return acct_info.norm_pages;
225 uint64_t xbzrle_mig_bytes_transferred(void)
227 return acct_info.xbzrle_bytes;
230 uint64_t xbzrle_mig_pages_transferred(void)
232 return acct_info.xbzrle_pages;
235 uint64_t xbzrle_mig_pages_cache_miss(void)
237 return acct_info.xbzrle_cache_miss;
240 double xbzrle_mig_cache_miss_rate(void)
242 return acct_info.xbzrle_cache_miss_rate;
245 uint64_t xbzrle_mig_pages_overflow(void)
247 return acct_info.xbzrle_overflows;
250 static QemuMutex migration_bitmap_mutex;
251 static uint64_t migration_dirty_pages;
253 /* used by the search for pages to send */
254 struct PageSearchStatus {
255 /* Current block being searched */
256 RAMBlock *block;
257 /* Current offset to search from */
258 ram_addr_t offset;
259 /* Set once we wrap around */
260 bool complete_round;
262 typedef struct PageSearchStatus PageSearchStatus;
264 static struct BitmapRcu {
265 struct rcu_head rcu;
266 /* Main migration bitmap */
267 unsigned long *bmap;
268 /* bitmap of pages that haven't been sent even once
269 * only maintained and used in postcopy at the moment
270 * where it's used to send the dirtymap at the start
271 * of the postcopy phase
273 unsigned long *unsentmap;
274 } *migration_bitmap_rcu;
276 struct CompressParam {
277 bool done;
278 bool quit;
279 QEMUFile *file;
280 QemuMutex mutex;
281 QemuCond cond;
282 RAMBlock *block;
283 ram_addr_t offset;
285 typedef struct CompressParam CompressParam;
287 struct DecompressParam {
288 bool done;
289 bool quit;
290 QemuMutex mutex;
291 QemuCond cond;
292 void *des;
293 uint8_t *compbuf;
294 int len;
296 typedef struct DecompressParam DecompressParam;
298 static CompressParam *comp_param;
299 static QemuThread *compress_threads;
300 /* comp_done_cond is used to wake up the migration thread when
301 * one of the compression threads has finished the compression.
302 * comp_done_lock is used together with comp_done_cond.
304 static QemuMutex comp_done_lock;
305 static QemuCond comp_done_cond;
306 /* The empty QEMUFileOps will be used by file in CompressParam */
307 static const QEMUFileOps empty_ops = { };
309 static bool compression_switch;
310 static DecompressParam *decomp_param;
311 static QemuThread *decompress_threads;
312 static QemuMutex decomp_done_lock;
313 static QemuCond decomp_done_cond;
315 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
316 ram_addr_t offset);
318 static void *do_data_compress(void *opaque)
320 CompressParam *param = opaque;
321 RAMBlock *block;
322 ram_addr_t offset;
324 qemu_mutex_lock(&param->mutex);
325 while (!param->quit) {
326 if (param->block) {
327 block = param->block;
328 offset = param->offset;
329 param->block = NULL;
330 qemu_mutex_unlock(&param->mutex);
332 do_compress_ram_page(param->file, block, offset);
334 qemu_mutex_lock(&comp_done_lock);
335 param->done = true;
336 qemu_cond_signal(&comp_done_cond);
337 qemu_mutex_unlock(&comp_done_lock);
339 qemu_mutex_lock(&param->mutex);
340 } else {
341 qemu_cond_wait(&param->cond, &param->mutex);
344 qemu_mutex_unlock(&param->mutex);
346 return NULL;
349 static inline void terminate_compression_threads(void)
351 int idx, thread_count;
353 thread_count = migrate_compress_threads();
355 for (idx = 0; idx < thread_count; idx++) {
356 qemu_mutex_lock(&comp_param[idx].mutex);
357 comp_param[idx].quit = true;
358 qemu_cond_signal(&comp_param[idx].cond);
359 qemu_mutex_unlock(&comp_param[idx].mutex);
363 void migrate_compress_threads_join(void)
365 int i, thread_count;
367 if (!migrate_use_compression()) {
368 return;
370 terminate_compression_threads();
371 thread_count = migrate_compress_threads();
372 for (i = 0; i < thread_count; i++) {
373 qemu_thread_join(compress_threads + i);
374 qemu_fclose(comp_param[i].file);
375 qemu_mutex_destroy(&comp_param[i].mutex);
376 qemu_cond_destroy(&comp_param[i].cond);
378 qemu_mutex_destroy(&comp_done_lock);
379 qemu_cond_destroy(&comp_done_cond);
380 g_free(compress_threads);
381 g_free(comp_param);
382 compress_threads = NULL;
383 comp_param = NULL;
386 void migrate_compress_threads_create(void)
388 int i, thread_count;
390 if (!migrate_use_compression()) {
391 return;
393 compression_switch = true;
394 thread_count = migrate_compress_threads();
395 compress_threads = g_new0(QemuThread, thread_count);
396 comp_param = g_new0(CompressParam, thread_count);
397 qemu_cond_init(&comp_done_cond);
398 qemu_mutex_init(&comp_done_lock);
399 for (i = 0; i < thread_count; i++) {
400 /* comp_param[i].file is just used as a dummy buffer to save data,
401 * set its ops to empty.
403 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
404 comp_param[i].done = true;
405 comp_param[i].quit = false;
406 qemu_mutex_init(&comp_param[i].mutex);
407 qemu_cond_init(&comp_param[i].cond);
408 qemu_thread_create(compress_threads + i, "compress",
409 do_data_compress, comp_param + i,
410 QEMU_THREAD_JOINABLE);
415 * save_page_header: write page header to wire
417 * If this is the first page sent from its block, it also writes the block identification
419 * Returns the number of bytes written
421 * @f: QEMUFile where to send the data
422 * @block: block that contains the page we want to send
423 * @offset: offset inside the block for the page
424 * (the lower bits contain the RAM_SAVE_FLAG_* flags)
426 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
428 size_t size, len;
430 qemu_put_be64(f, offset);
431 size = 8;
433 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
434 len = strlen(block->idstr);
435 qemu_put_byte(f, len);
436 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
437 size += 1 + len;
439 return size;
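
/*
 * Sketch of the matching reader (hypothetical and simplified): first the
 * be64 offset-plus-flags word, then, when RAM_SAVE_FLAG_CONTINUE is absent,
 * a one-byte length followed by the RAMBlock idstr.
 */
static void example_read_page_header(QEMUFile *f, uint64_t *addr,
                                     char *idstr)
{
    *addr = qemu_get_be64(f);

    if (!(*addr & RAM_SAVE_FLAG_CONTINUE)) {
        size_t len = qemu_get_byte(f);

        /* idstr is assumed to hold at least 256 bytes */
        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';
    }
}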
443 * mig_throttle_guest_down: throttle down the guest
445 * Reduce amount of guest cpu execution to hopefully slow down memory
446 * writes. If guest dirty memory rate is reduced below the rate at
447 * which we can transfer pages to the destination then we should be
448 * able to complete migration. Some workloads dirty memory way too
449 * fast and will not effectively converge, even with auto-converge.
451 static void mig_throttle_guest_down(void)
453 MigrationState *s = migrate_get_current();
454 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
455 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
457 /* We have not started throttling yet. Let's start it. */
458 if (!cpu_throttle_active()) {
459 cpu_throttle_set(pct_initial);
460 } else {
461 /* Throttling already on, just increase the rate */
462 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
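
/*
 * Worked example, assuming the usual defaults rather than values taken from
 * this file: with cpu_throttle_initial = 20 and cpu_throttle_increment = 10,
 * successive calls throttle the vCPUs at 20%, 30%, 40%, ... until the dirty
 * rate drops below the transfer rate or migration completes.  A hypothetical
 * stand-alone model of that progression:
 */
static int example_throttle_pct(int ncalls)
{
    /* assumes initial = 20, increment = 10; example_throttle_pct(3) == 40 */
    return ncalls <= 0 ? 0 : 20 + (ncalls - 1) * 10;
}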
467 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
469 * @rs: current RAM state
470 * @current_addr: address for the zero page
472 * Update the xbzrle cache to reflect a page that's been sent as all 0.
473 * The important thing is that a stale (not-yet-0'd) page be replaced
474 * by the new data.
475 * As a bonus, if the page wasn't in the cache it gets added so that
476 * when a small write is made into the 0'd page it gets XBZRLE sent.
478 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
480 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
481 return;
484 /* We don't care if this fails to allocate a new cache page
485 * as long as it updated an old one */
486 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
487 rs->bitmap_sync_count);
490 #define ENCODING_FLAG_XBZRLE 0x1
493 * save_xbzrle_page: compress and send current page
495 * Returns: 1 means that we wrote the page
496 * 0 means that page is identical to the one already sent
497 * -1 means that xbzrle would be longer than normal
499 * @rs: current RAM state
500 * @f: QEMUFile where to send the data
501 * @current_data: pointer to the address of the page contents
502 * @current_addr: addr of the page
503 * @block: block that contains the page we want to send
504 * @offset: offset inside the block for the page
505 * @last_stage: if we are at the completion stage
506 * @bytes_transferred: increase it with the number of transferred bytes
508 static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
509 ram_addr_t current_addr, RAMBlock *block,
510 ram_addr_t offset, bool last_stage,
511 uint64_t *bytes_transferred)
513 int encoded_len = 0, bytes_xbzrle;
514 uint8_t *prev_cached_page;
516 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
517 acct_info.xbzrle_cache_miss++;
518 if (!last_stage) {
519 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
520 rs->bitmap_sync_count) == -1) {
521 return -1;
522 } else {
523 /* update *current_data when the page has been
524 inserted into cache */
525 *current_data = get_cached_data(XBZRLE.cache, current_addr);
528 return -1;
531 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
533 /* save current buffer into memory */
534 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
536 /* XBZRLE encoding (if there is no overflow) */
537 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
538 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
539 TARGET_PAGE_SIZE);
540 if (encoded_len == 0) {
541 trace_save_xbzrle_page_skipping();
542 return 0;
543 } else if (encoded_len == -1) {
544 trace_save_xbzrle_page_overflow();
545 acct_info.xbzrle_overflows++;
546 /* update data in the cache */
547 if (!last_stage) {
548 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
549 *current_data = prev_cached_page;
551 return -1;
554 /* we need to update the data in the cache, in order to get the same data */
555 if (!last_stage) {
556 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
559 /* Send XBZRLE based compressed page */
560 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
561 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
562 qemu_put_be16(f, encoded_len);
563 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
564 bytes_xbzrle += encoded_len + 1 + 2;
565 acct_info.xbzrle_pages++;
566 acct_info.xbzrle_bytes += bytes_xbzrle;
567 *bytes_transferred += bytes_xbzrle;
569 return 1;
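
/*
 * Sketch of what save_xbzrle_page() puts on the wire (the reader below is
 * hypothetical and simplified): the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, one ENCODING_FLAG_XBZRLE byte, a be16 encoded
 * length, and then encoded_len bytes of XBZRLE data, which is where the
 * "encoded_len + 1 + 2" accounting above comes from.
 */
static int example_read_xbzrle_payload(QEMUFile *f, uint8_t *buf,
                                       size_t bufsize)
{
    size_t encoded_len;

    if (qemu_get_byte(f) != ENCODING_FLAG_XBZRLE) {
        return -1;                     /* unexpected sub-encoding */
    }
    encoded_len = qemu_get_be16(f);
    if (encoded_len > bufsize) {
        return -1;
    }
    qemu_get_buffer(f, buf, encoded_len);
    return encoded_len;
}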
573 * migration_bitmap_find_dirty: find the next dirty page from start
575 * Called with rcu_read_lock() to protect migration_bitmap
577 * Returns the byte offset within memory region of the start of a dirty page
579 * @rs: current RAM state
580 * @rb: RAMBlock where to search for dirty pages
581 * @start: starting address (typically so we can continue from previous page)
582 * @ram_addr_abs: pointer into which to store the address of the dirty page
583 * within the global ram_addr space
585 static inline
586 ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
587 ram_addr_t start,
588 ram_addr_t *ram_addr_abs)
590 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
591 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
592 uint64_t rb_size = rb->used_length;
593 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
594 unsigned long *bitmap;
596 unsigned long next;
598 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
599 if (rs->ram_bulk_stage && nr > base) {
600 next = nr + 1;
601 } else {
602 next = find_next_bit(bitmap, size, nr);
605 *ram_addr_abs = next << TARGET_PAGE_BITS;
606 return (next - base) << TARGET_PAGE_BITS;
609 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
611 bool ret;
612 int nr = addr >> TARGET_PAGE_BITS;
613 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
615 ret = test_and_clear_bit(nr, bitmap);
617 if (ret) {
618 migration_dirty_pages--;
620 return ret;
623 static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
624 ram_addr_t length)
626 unsigned long *bitmap;
627 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
628 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
629 start, length, &rs->num_dirty_pages_period);
632 static void migration_bitmap_sync_init(RAMState *rs)
634 rs->time_last_bitmap_sync = 0;
635 rs->bytes_xfer_prev = 0;
636 rs->num_dirty_pages_period = 0;
637 rs->xbzrle_cache_miss_prev = 0;
638 rs->iterations_prev = 0;
642 * ram_pagesize_summary: calculate all the pagesizes of a VM
644 * Returns a summary bitmap of the page sizes of all RAMBlocks
646 * For VMs with just normal pages this is equivalent to the host page
647 * size. If it's got some huge pages then it's the OR of all the
648 * different page sizes.
650 uint64_t ram_pagesize_summary(void)
652 RAMBlock *block;
653 uint64_t summary = 0;
655 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
656 summary |= block->page_size;
659 return summary;
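
/*
 * Worked example: a guest whose RAMBlocks all use 4 KiB pages except for one
 * block backed by 2 MiB huge pages yields
 *     summary = 0x1000 | 0x200000 = 0x201000
 * so each distinct page size (always a power of two) shows up as its own bit.
 */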
662 static void migration_bitmap_sync(RAMState *rs)
664 RAMBlock *block;
665 MigrationState *s = migrate_get_current();
666 int64_t end_time;
667 uint64_t bytes_xfer_now;
669 rs->bitmap_sync_count++;
671 if (!rs->bytes_xfer_prev) {
672 rs->bytes_xfer_prev = ram_bytes_transferred();
675 if (!rs->time_last_bitmap_sync) {
676 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
679 trace_migration_bitmap_sync_start();
680 memory_global_dirty_log_sync();
682 qemu_mutex_lock(&migration_bitmap_mutex);
683 rcu_read_lock();
684 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
685 migration_bitmap_sync_range(rs, block->offset, block->used_length);
687 rcu_read_unlock();
688 qemu_mutex_unlock(&migration_bitmap_mutex);
690 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
692 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
694 /* more than 1 second = 1000 milliseconds */
695 if (end_time > rs->time_last_bitmap_sync + 1000) {
696 if (migrate_auto_converge()) {
697 /* The following detection logic can be refined later. For now:
698 Check to see if the dirtied bytes are 50% more than the approx.
699 amount of bytes that just got transferred since the last time we
700 were in this routine. If that happens twice, start or increase
701 throttling */
702 bytes_xfer_now = ram_bytes_transferred();
704 if (s->dirty_pages_rate &&
705 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
706 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
707 (rs->dirty_rate_high_cnt++ >= 2)) {
708 trace_migration_throttle();
709 rs->dirty_rate_high_cnt = 0;
710 mig_throttle_guest_down();
712 rs->bytes_xfer_prev = bytes_xfer_now;
715 if (migrate_use_xbzrle()) {
716 if (rs->iterations_prev != acct_info.iterations) {
717 acct_info.xbzrle_cache_miss_rate =
718 (double)(acct_info.xbzrle_cache_miss -
719 rs->xbzrle_cache_miss_prev) /
720 (acct_info.iterations - rs->iterations_prev);
722 rs->iterations_prev = acct_info.iterations;
723 rs->xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
725 s->dirty_pages_rate = rs->num_dirty_pages_period * 1000
726 / (end_time - rs->time_last_bitmap_sync);
727 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
728 rs->time_last_bitmap_sync = end_time;
729 rs->num_dirty_pages_period = 0;
731 s->dirty_sync_count = rs->bitmap_sync_count;
732 if (migrate_use_events()) {
733 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
738 * save_zero_page: send the zero page to the stream
740 * Returns the number of pages written.
742 * @rs: current RAM state
743 * @f: QEMUFile where to send the data
744 * @block: block that contains the page we want to send
745 * @offset: offset inside the block for the page
746 * @p: pointer to the page
747 * @bytes_transferred: increase it with the number of transferred bytes
749 static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
750 ram_addr_t offset,
751 uint8_t *p, uint64_t *bytes_transferred)
753 int pages = -1;
755 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
756 rs->zero_pages++;
757 *bytes_transferred += save_page_header(f, block,
758 offset | RAM_SAVE_FLAG_COMPRESS);
759 qemu_put_byte(f, 0);
760 *bytes_transferred += 1;
761 pages = 1;
764 return pages;
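
/*
 * On the wire a zero page is therefore just the page header with
 * RAM_SAVE_FLAG_COMPRESS set plus a single fill byte (0).  A hypothetical,
 * much simplified receiver could reconstruct the page like this:
 */
static void example_load_zero_page(QEMUFile *f, uint8_t *host_page)
{
    uint8_t fill = qemu_get_byte(f);

    memset(host_page, fill, TARGET_PAGE_SIZE);
}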
767 static void ram_release_pages(MigrationState *ms, const char *rbname,
768 uint64_t offset, int pages)
770 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
771 return;
774 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
778 * ram_save_page: send the given page to the stream
780 * Returns the number of pages written.
781 * < 0 - error
782 * >=0 - Number of pages written - this might legally be 0
783 * if xbzrle noticed the page was the same.
785 * @rs: current RAM state
786 * @ms: current migration state
787 * @f: QEMUFile where to send the data
788 * @block: block that contains the page we want to send
789 * @offset: offset inside the block for the page
790 * @last_stage: if we are at the completion stage
791 * @bytes_transferred: increase it with the number of transferred bytes
793 static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
794 PageSearchStatus *pss, bool last_stage,
795 uint64_t *bytes_transferred)
797 int pages = -1;
798 uint64_t bytes_xmit;
799 ram_addr_t current_addr;
800 uint8_t *p;
801 int ret;
802 bool send_async = true;
803 RAMBlock *block = pss->block;
804 ram_addr_t offset = pss->offset;
806 p = block->host + offset;
808 /* When in doubt, send the page as a normal page */
809 bytes_xmit = 0;
810 ret = ram_control_save_page(f, block->offset,
811 offset, TARGET_PAGE_SIZE, &bytes_xmit);
812 if (bytes_xmit) {
813 *bytes_transferred += bytes_xmit;
814 pages = 1;
817 XBZRLE_cache_lock();
819 current_addr = block->offset + offset;
821 if (block == rs->last_sent_block) {
822 offset |= RAM_SAVE_FLAG_CONTINUE;
824 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
825 if (ret != RAM_SAVE_CONTROL_DELAYED) {
826 if (bytes_xmit > 0) {
827 acct_info.norm_pages++;
828 } else if (bytes_xmit == 0) {
829 rs->zero_pages++;
832 } else {
833 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
834 if (pages > 0) {
835 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
836 * page would be stale
838 xbzrle_cache_zero_page(rs, current_addr);
839 ram_release_pages(ms, block->idstr, pss->offset, pages);
840 } else if (!rs->ram_bulk_stage &&
841 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
842 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
843 offset, last_stage, bytes_transferred);
844 if (!last_stage) {
845 /* Can't send this cached data async, since the cache page
846 * might get updated before it gets to the wire
848 send_async = false;
853 /* XBZRLE overflow or normal page */
854 if (pages == -1) {
855 *bytes_transferred += save_page_header(f, block,
856 offset | RAM_SAVE_FLAG_PAGE);
857 if (send_async) {
858 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
859 migrate_release_ram() &
860 migration_in_postcopy(ms));
861 } else {
862 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
864 *bytes_transferred += TARGET_PAGE_SIZE;
865 pages = 1;
866 acct_info.norm_pages++;
869 XBZRLE_cache_unlock();
871 return pages;
874 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
875 ram_addr_t offset)
877 int bytes_sent, blen;
878 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
880 bytes_sent = save_page_header(f, block, offset |
881 RAM_SAVE_FLAG_COMPRESS_PAGE);
882 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
883 migrate_compress_level());
884 if (blen < 0) {
885 bytes_sent = 0;
886 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
887 error_report("compressed data failed!");
888 } else {
889 bytes_sent += blen;
890 ram_release_pages(migrate_get_current(), block->idstr,
891 offset & TARGET_PAGE_MASK, 1);
894 return bytes_sent;
897 static uint64_t bytes_transferred;
899 static void flush_compressed_data(QEMUFile *f)
901 int idx, len, thread_count;
903 if (!migrate_use_compression()) {
904 return;
906 thread_count = migrate_compress_threads();
908 qemu_mutex_lock(&comp_done_lock);
909 for (idx = 0; idx < thread_count; idx++) {
910 while (!comp_param[idx].done) {
911 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
914 qemu_mutex_unlock(&comp_done_lock);
916 for (idx = 0; idx < thread_count; idx++) {
917 qemu_mutex_lock(&comp_param[idx].mutex);
918 if (!comp_param[idx].quit) {
919 len = qemu_put_qemu_file(f, comp_param[idx].file);
920 bytes_transferred += len;
922 qemu_mutex_unlock(&comp_param[idx].mutex);
926 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
927 ram_addr_t offset)
929 param->block = block;
930 param->offset = offset;
933 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
934 ram_addr_t offset,
935 uint64_t *bytes_transferred)
937 int idx, thread_count, bytes_xmit = -1, pages = -1;
939 thread_count = migrate_compress_threads();
940 qemu_mutex_lock(&comp_done_lock);
941 while (true) {
942 for (idx = 0; idx < thread_count; idx++) {
943 if (comp_param[idx].done) {
944 comp_param[idx].done = false;
945 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
946 qemu_mutex_lock(&comp_param[idx].mutex);
947 set_compress_params(&comp_param[idx], block, offset);
948 qemu_cond_signal(&comp_param[idx].cond);
949 qemu_mutex_unlock(&comp_param[idx].mutex);
950 pages = 1;
951 acct_info.norm_pages++;
952 *bytes_transferred += bytes_xmit;
953 break;
956 if (pages > 0) {
957 break;
958 } else {
959 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
962 qemu_mutex_unlock(&comp_done_lock);
964 return pages;
968 * ram_save_compressed_page: compress the given page and send it to the stream
970 * Returns the number of pages written.
972 * @rs: current RAM state
973 * @ms: current migration state
974 * @f: QEMUFile where to send the data
975 * @block: block that contains the page we want to send
976 * @offset: offset inside the block for the page
977 * @last_stage: if we are at the completion stage
978 * @bytes_transferred: increase it with the number of transferred bytes
980 static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
981 QEMUFile *f,
982 PageSearchStatus *pss, bool last_stage,
983 uint64_t *bytes_transferred)
985 int pages = -1;
986 uint64_t bytes_xmit = 0;
987 uint8_t *p;
988 int ret, blen;
989 RAMBlock *block = pss->block;
990 ram_addr_t offset = pss->offset;
992 p = block->host + offset;
994 ret = ram_control_save_page(f, block->offset,
995 offset, TARGET_PAGE_SIZE, &bytes_xmit);
996 if (bytes_xmit) {
997 *bytes_transferred += bytes_xmit;
998 pages = 1;
1000 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1001 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1002 if (bytes_xmit > 0) {
1003 acct_info.norm_pages++;
1004 } else if (bytes_xmit == 0) {
1005 rs->zero_pages++;
1008 } else {
1009 /* When starting the process of a new block, the first page of
1010 * the block should be sent out before other pages in the same
1011 * block, and all the pages in last block should have been sent
1012 * out, keeping this order is important, because the 'cont' flag
1013 * is used to avoid resending the block name.
1015 if (block != rs->last_sent_block) {
1016 flush_compressed_data(f);
1017 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
1018 if (pages == -1) {
1019 /* Make sure the first page is sent out before other pages */
1020 bytes_xmit = save_page_header(f, block, offset |
1021 RAM_SAVE_FLAG_COMPRESS_PAGE);
1022 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1023 migrate_compress_level());
1024 if (blen > 0) {
1025 *bytes_transferred += bytes_xmit + blen;
1026 acct_info.norm_pages++;
1027 pages = 1;
1028 } else {
1029 qemu_file_set_error(f, blen);
1030 error_report("compressed data failed!");
1033 if (pages > 0) {
1034 ram_release_pages(ms, block->idstr, pss->offset, pages);
1036 } else {
1037 offset |= RAM_SAVE_FLAG_CONTINUE;
1038 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
1039 if (pages == -1) {
1040 pages = compress_page_with_multi_thread(f, block, offset,
1041 bytes_transferred);
1042 } else {
1043 ram_release_pages(ms, block->idstr, pss->offset, pages);
1048 return pages;
1052 * find_dirty_block: find the next dirty page and update any state
1053 * associated with the search process.
1055 * Returns whether a page was found
1057 * @rs: current RAM state
1058 * @f: QEMUFile where to send the data
1059 * @pss: data about the state of the current dirty page scan
1060 * @again: set to false if the search has scanned the whole of RAM
1061 * @ram_addr_abs: pointer into which to store the address of the dirty page
1062 * within the global ram_addr space
1064 static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
1065 bool *again, ram_addr_t *ram_addr_abs)
1067 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
1068 ram_addr_abs);
1069 if (pss->complete_round && pss->block == rs->last_seen_block &&
1070 pss->offset >= rs->last_offset) {
1072 * We've been once around the RAM and haven't found anything.
1073 * Give up.
1075 *again = false;
1076 return false;
1078 if (pss->offset >= pss->block->used_length) {
1079 /* Didn't find anything in this RAM Block */
1080 pss->offset = 0;
1081 pss->block = QLIST_NEXT_RCU(pss->block, next);
1082 if (!pss->block) {
1083 /* Hit the end of the list */
1084 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1085 /* Flag that we've looped */
1086 pss->complete_round = true;
1087 rs->ram_bulk_stage = false;
1088 if (migrate_use_xbzrle()) {
1089 /* If xbzrle is on, stop using the data compression at this
1090 * point. In theory, xbzrle can do better than compression.
1092 flush_compressed_data(f);
1093 compression_switch = false;
1096 /* Didn't find anything this time, but try again on the new block */
1097 *again = true;
1098 return false;
1099 } else {
1100 /* Can go around again, but... */
1101 *again = true;
1102 /* We've found something so probably don't need to */
1103 return true;
1108 * unqueue_page: gets a page off the queue
1110 * Helper for 'get_queued_page' - gets a page off the queue
1112 * Returns the block of the page (or NULL if none available)
1114 * @ms: current migration state
1115 * @offset: used to return the offset within the RAMBlock
1116 * @ram_addr_abs: pointer into which to store the address of the dirty page
1117 * within the global ram_addr space
1119 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1120 ram_addr_t *ram_addr_abs)
1122 RAMBlock *block = NULL;
1124 qemu_mutex_lock(&ms->src_page_req_mutex);
1125 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1126 struct MigrationSrcPageRequest *entry =
1127 QSIMPLEQ_FIRST(&ms->src_page_requests);
1128 block = entry->rb;
1129 *offset = entry->offset;
1130 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1131 TARGET_PAGE_MASK;
1133 if (entry->len > TARGET_PAGE_SIZE) {
1134 entry->len -= TARGET_PAGE_SIZE;
1135 entry->offset += TARGET_PAGE_SIZE;
1136 } else {
1137 memory_region_unref(block->mr);
1138 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1139 g_free(entry);
1142 qemu_mutex_unlock(&ms->src_page_req_mutex);
1144 return block;
1148 * get_queued_page: unqueue a page from the postocpy requests
1150 * Skips pages that are already sent (!dirty)
1152 * Returns whether a queued page was found
1154 * @rs: current RAM state
1155 * @ms: current migration state
1156 * @pss: data about the state of the current dirty page scan
1157 * @ram_addr_abs: pointer into which to store the address of the dirty page
1158 * within the global ram_addr space
1160 static bool get_queued_page(RAMState *rs, MigrationState *ms,
1161 PageSearchStatus *pss,
1162 ram_addr_t *ram_addr_abs)
1164 RAMBlock *block;
1165 ram_addr_t offset;
1166 bool dirty;
1168 do {
1169 block = unqueue_page(ms, &offset, ram_addr_abs);
1171 * We're sending this page, and since it's postcopy nothing else
1172 * will dirty it, and we must make sure it doesn't get sent again
1173 * even if this queue request was received after the background
1174 * search already sent it.
1176 if (block) {
1177 unsigned long *bitmap;
1178 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1179 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1180 if (!dirty) {
1181 trace_get_queued_page_not_dirty(
1182 block->idstr, (uint64_t)offset,
1183 (uint64_t)*ram_addr_abs,
1184 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1185 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1186 } else {
1187 trace_get_queued_page(block->idstr,
1188 (uint64_t)offset,
1189 (uint64_t)*ram_addr_abs);
1193 } while (block && !dirty);
1195 if (block) {
1197 * As soon as we start servicing pages out of order, then we have
1198 * to kill the bulk stage, since the bulk stage assumes
1199 * in (migration_bitmap_find_dirty) that every page is
1200 * dirty, that's no longer true.
1202 rs->ram_bulk_stage = false;
1205 * We want the background search to continue from the queued page
1206 * since the guest is likely to want other pages near to the page
1207 * it just requested.
1209 pss->block = block;
1210 pss->offset = offset;
1213 return !!block;
1217 * migration_page_queue_free: drop any remaining pages in the ram
1218 * request queue
1220 * It should be empty at the end anyway, but in error cases there may
1221 * be some left; in that case any remaining pages are dropped here.
1223 * @ms: current migration state
1225 void migration_page_queue_free(MigrationState *ms)
1227 struct MigrationSrcPageRequest *mspr, *next_mspr;
1228 /* This queue generally should be empty - but in the case of a failed
1229 * migration it may have some entries left behind.
1231 rcu_read_lock();
1232 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1233 memory_region_unref(mspr->rb->mr);
1234 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1235 g_free(mspr);
1237 rcu_read_unlock();
1241 * ram_save_queue_pages: queue the page for transmission
1243 * A request from postcopy destination for example.
1245 * Returns zero on success or negative on error
1247 * @ms: current migration state
1248 * @rbname: Name of the RAMBlock of the request. NULL means the
1249 * same as the last one.
1250 * @start: starting address from the start of the RAMBlock
1251 * @len: length (in bytes) to send
1253 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1254 ram_addr_t start, ram_addr_t len)
1256 RAMBlock *ramblock;
1258 ms->postcopy_requests++;
1259 rcu_read_lock();
1260 if (!rbname) {
1261 /* Reuse last RAMBlock */
1262 ramblock = ms->last_req_rb;
1264 if (!ramblock) {
1266 * Shouldn't happen, we can't reuse the last RAMBlock if
1267 * it's the 1st request.
1269 error_report("ram_save_queue_pages no previous block");
1270 goto err;
1272 } else {
1273 ramblock = qemu_ram_block_by_name(rbname);
1275 if (!ramblock) {
1276 /* We shouldn't be asked for a non-existent RAMBlock */
1277 error_report("ram_save_queue_pages no block '%s'", rbname);
1278 goto err;
1280 ms->last_req_rb = ramblock;
1282 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1283 if (start+len > ramblock->used_length) {
1284 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1285 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1286 __func__, start, len, ramblock->used_length);
1287 goto err;
1290 struct MigrationSrcPageRequest *new_entry =
1291 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1292 new_entry->rb = ramblock;
1293 new_entry->offset = start;
1294 new_entry->len = len;
1296 memory_region_ref(ramblock->mr);
1297 qemu_mutex_lock(&ms->src_page_req_mutex);
1298 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1299 qemu_mutex_unlock(&ms->src_page_req_mutex);
1300 rcu_read_unlock();
1302 return 0;
1304 err:
1305 rcu_read_unlock();
1306 return -1;
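
/*
 * Usage sketch: on the source side, a postcopy page request from the
 * destination ends up queued through this function.  The block name and
 * offset below are made up for illustration.
 */
static int example_queue_requested_page(MigrationState *ms)
{
    /* queue one target page at offset 0x200000 of the block "pc.ram" */
    return ram_save_queue_pages(ms, "pc.ram", 0x200000, TARGET_PAGE_SIZE);
}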
1310 * ram_save_target_page: save one target page
1312 * Returns the number of pages written
1314 * @rs: current RAM state
1315 * @ms: current migration state
1316 * @f: QEMUFile where to send the data
1317 * @pss: data about the page we want to send
1318 * @last_stage: if we are at the completion stage
1319 * @bytes_transferred: increase it with the number of transferred bytes
1320 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
1322 static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1323 PageSearchStatus *pss,
1324 bool last_stage,
1325 uint64_t *bytes_transferred,
1326 ram_addr_t dirty_ram_abs)
1328 int res = 0;
1330 /* Check whether the page is dirty and, if it is, send it */
1331 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1332 unsigned long *unsentmap;
1333 if (compression_switch && migrate_use_compression()) {
1334 res = ram_save_compressed_page(rs, ms, f, pss,
1335 last_stage,
1336 bytes_transferred);
1337 } else {
1338 res = ram_save_page(rs, ms, f, pss, last_stage,
1339 bytes_transferred);
1342 if (res < 0) {
1343 return res;
1345 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1346 if (unsentmap) {
1347 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1349 /* Only update last_sent_block if a block was actually sent; xbzrle
1350 * might have decided the page was identical so didn't bother writing
1351 * to the stream.
1353 if (res > 0) {
1354 rs->last_sent_block = pss->block;
1358 return res;
1362 * ram_save_host_page: save a whole host page
1364 * Starting at *offset send pages up to the end of the current host
1365 * page. It's valid for the initial offset to point into the middle of
1366 * a host page in which case the remainder of the hostpage is sent.
1367 * Only dirty target pages are sent. Note that the host page size may
1368 * be a huge page for this block.
1370 * Returns the number of pages written or negative on error
1372 * @rs: current RAM state
1373 * @ms: current migration state
1374 * @f: QEMUFile where to send the data
1375 * @pss: data about the page we want to send
1376 * @last_stage: if we are at the completion stage
1377 * @bytes_transferred: increase it with the number of transferred bytes
1378 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1380 static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1381 PageSearchStatus *pss,
1382 bool last_stage,
1383 uint64_t *bytes_transferred,
1384 ram_addr_t dirty_ram_abs)
1386 int tmppages, pages = 0;
1387 size_t pagesize = qemu_ram_pagesize(pss->block);
1389 do {
1390 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
1391 bytes_transferred, dirty_ram_abs);
1392 if (tmppages < 0) {
1393 return tmppages;
1396 pages += tmppages;
1397 pss->offset += TARGET_PAGE_SIZE;
1398 dirty_ram_abs += TARGET_PAGE_SIZE;
1399 } while (pss->offset & (pagesize - 1));
1401 /* The offset we leave with is the last one we looked at */
1402 pss->offset -= TARGET_PAGE_SIZE;
1403 return pages;
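
/*
 * Worked example: for a RAMBlock backed by 2 MiB huge pages with 4 KiB
 * target pages, pagesize is 0x200000, so the loop above can call
 * ram_save_target_page() up to 512 times before pss->offset reaches the
 * next host-page boundary; for ordinary blocks (pagesize == TARGET_PAGE_SIZE)
 * it runs exactly once.
 */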
1407 * ram_find_and_save_block: finds a dirty page and sends it to f
1409 * Called within an RCU critical section.
1411 * Returns the number of pages written where zero means no dirty pages
1413 * @rs: current RAM state
1414 * @f: QEMUFile where to send the data
1415 * @last_stage: if we are at the completion stage
1416 * @bytes_transferred: increase it with the number of transferred bytes
1418 * On systems where host-page-size > target-page-size it will send all the
1419 * pages in a host page that are dirty.
1422 static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
1423 uint64_t *bytes_transferred)
1425 PageSearchStatus pss;
1426 MigrationState *ms = migrate_get_current();
1427 int pages = 0;
1428 bool again, found;
1429 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1430 ram_addr_t space */
1432 /* No dirty page as there is zero RAM */
1433 if (!ram_bytes_total()) {
1434 return pages;
1437 pss.block = rs->last_seen_block;
1438 pss.offset = rs->last_offset;
1439 pss.complete_round = false;
1441 if (!pss.block) {
1442 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1445 do {
1446 again = true;
1447 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
1449 if (!found) {
1450 /* priority queue empty, so just search for something dirty */
1451 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
1454 if (found) {
1455 pages = ram_save_host_page(rs, ms, f, &pss,
1456 last_stage, bytes_transferred,
1457 dirty_ram_abs);
1459 } while (!pages && again);
1461 rs->last_seen_block = pss.block;
1462 rs->last_offset = pss.offset;
1464 return pages;
1467 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1469 uint64_t pages = size / TARGET_PAGE_SIZE;
1470 RAMState *rs = &ram_state;
1472 if (zero) {
1473 rs->zero_pages += pages;
1474 } else {
1475 acct_info.norm_pages += pages;
1476 bytes_transferred += size;
1477 qemu_update_position(f, size);
1481 static ram_addr_t ram_save_remaining(void)
1483 return migration_dirty_pages;
1486 uint64_t ram_bytes_remaining(void)
1488 return ram_save_remaining() * TARGET_PAGE_SIZE;
1491 uint64_t ram_bytes_transferred(void)
1493 return bytes_transferred;
1496 uint64_t ram_bytes_total(void)
1498 RAMBlock *block;
1499 uint64_t total = 0;
1501 rcu_read_lock();
1502 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1503 total += block->used_length;
1504 rcu_read_unlock();
1505 return total;
1508 void free_xbzrle_decoded_buf(void)
1510 g_free(xbzrle_decoded_buf);
1511 xbzrle_decoded_buf = NULL;
1514 static void migration_bitmap_free(struct BitmapRcu *bmap)
1516 g_free(bmap->bmap);
1517 g_free(bmap->unsentmap);
1518 g_free(bmap);
1521 static void ram_migration_cleanup(void *opaque)
1523 /* the caller must hold the iothread lock or be in a bh, so there is
1524 * no write race against this migration_bitmap
1526 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1527 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1528 if (bitmap) {
1529 memory_global_dirty_log_stop();
1530 call_rcu(bitmap, migration_bitmap_free, rcu);
1533 XBZRLE_cache_lock();
1534 if (XBZRLE.cache) {
1535 cache_fini(XBZRLE.cache);
1536 g_free(XBZRLE.encoded_buf);
1537 g_free(XBZRLE.current_buf);
1538 g_free(ZERO_TARGET_PAGE);
1539 XBZRLE.cache = NULL;
1540 XBZRLE.encoded_buf = NULL;
1541 XBZRLE.current_buf = NULL;
1543 XBZRLE_cache_unlock();
1546 static void ram_state_reset(RAMState *rs)
1548 rs->last_seen_block = NULL;
1549 rs->last_sent_block = NULL;
1550 rs->last_offset = 0;
1551 rs->last_version = ram_list.version;
1552 rs->ram_bulk_stage = true;
1555 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1557 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1559 /* called in qemu main thread, so there is
1560 * no writing race against this migration_bitmap
1562 if (migration_bitmap_rcu) {
1563 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1564 bitmap = g_new(struct BitmapRcu, 1);
1565 bitmap->bmap = bitmap_new(new);
1567 /* prevent bits in the migration_bitmap from being set
1568 * by migration_bitmap_sync_range() at the same time.
1569 * It is safe for migration if bits in the migration_bitmap are only
1570 * cleared at the same time.
1572 qemu_mutex_lock(&migration_bitmap_mutex);
1573 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1574 bitmap_set(bitmap->bmap, old, new - old);
1576 /* We don't have a way to safely extend the unsentmap
1577 * with RCU, so mark it as missing; entry to postcopy
1578 * will then fail.
1580 bitmap->unsentmap = NULL;
1582 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1583 qemu_mutex_unlock(&migration_bitmap_mutex);
1584 migration_dirty_pages += new - old;
1585 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1590 * 'expected' is the value you expect the bitmap mostly to be full
1591 * of; it won't bother printing lines that are all this value.
1592 * If 'todump' is null the migration bitmap is dumped.
1594 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1596 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1598 int64_t cur;
1599 int64_t linelen = 128;
1600 char linebuf[129];
1602 if (!todump) {
1603 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1606 for (cur = 0; cur < ram_pages; cur += linelen) {
1607 int64_t curb;
1608 bool found = false;
1610 * Last line; catch the case where the line length
1611 * is longer than remaining ram
1613 if (cur + linelen > ram_pages) {
1614 linelen = ram_pages - cur;
1616 for (curb = 0; curb < linelen; curb++) {
1617 bool thisbit = test_bit(cur + curb, todump);
1618 linebuf[curb] = thisbit ? '1' : '.';
1619 found = found || (thisbit != expected);
1621 if (found) {
1622 linebuf[curb] = '\0';
1623 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1628 /* **** functions for postcopy ***** */
1630 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1632 struct RAMBlock *block;
1633 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1635 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1636 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1637 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1638 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1640 while (run_start < range) {
1641 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1642 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1643 (run_end - run_start) << TARGET_PAGE_BITS);
1644 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1650 * postcopy_send_discard_bm_ram: discard a RAMBlock
1652 * Returns zero on success
1654 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1655 * Note: At this point the 'unsentmap' is the processed bitmap combined
1656 * with the dirtymap; so a '1' means it's either dirty or unsent.
1658 * @ms: current migration state
1659 * @pds: state for postcopy
1660 * @start: RAMBlock starting page
1661 * @length: RAMBlock size
1663 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1664 PostcopyDiscardState *pds,
1665 unsigned long start,
1666 unsigned long length)
1668 unsigned long end = start + length; /* one after the end */
1669 unsigned long current;
1670 unsigned long *unsentmap;
1672 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1673 for (current = start; current < end; ) {
1674 unsigned long one = find_next_bit(unsentmap, end, current);
1676 if (one <= end) {
1677 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1678 unsigned long discard_length;
1680 if (zero >= end) {
1681 discard_length = end - one;
1682 } else {
1683 discard_length = zero - one;
1685 if (discard_length) {
1686 postcopy_discard_send_range(ms, pds, one, discard_length);
1688 current = one + discard_length;
1689 } else {
1690 current = one;
1694 return 0;
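
/*
 * Worked example: if the unsentmap bits relative to 'start' are
 * 0 1 1 1 0 0 1 0, the loop above first finds the run at bits 1-3 and sends
 * one discard covering pages start+1 .. start+3, then finds bit 6 and sends
 * a second discard for the single page start+6.
 */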
1698 * postcopy_each_ram_send_discard: discard all RAMBlocks
1700 * Returns 0 for success or negative for error
1702 * Utility for the outgoing postcopy code.
1703 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1704 * passing it bitmap indexes and name.
1705 * (qemu_ram_foreach_block ends up passing unscaled lengths
1706 * which would mean postcopy code would have to deal with target page)
1708 * @ms: current migration state
1710 static int postcopy_each_ram_send_discard(MigrationState *ms)
1712 struct RAMBlock *block;
1713 int ret;
1715 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1716 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1717 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1718 first,
1719 block->idstr);
1722 * Postcopy sends chunks of bitmap over the wire, but it
1723 * just needs indexes at this point, avoids it having
1724 * target page specific code.
1726 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1727 block->used_length >> TARGET_PAGE_BITS);
1728 postcopy_discard_send_finish(ms, pds);
1729 if (ret) {
1730 return ret;
1734 return 0;
1738 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1740 * Helper for postcopy_chunk_hostpages; it's called twice to
1741 * canonicalize the two bitmaps, that are similar, but one is
1742 * inverted.
1744 * Postcopy requires that all target pages in a hostpage are dirty or
1745 * clean, not a mix. This function canonicalizes the bitmaps.
1747 * @ms: current migration state
1748 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1749 * otherwise we need to canonicalize partially dirty host pages
1750 * @block: block that contains the page we want to canonicalize
1751 * @pds: state for postcopy
1753 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1754 RAMBlock *block,
1755 PostcopyDiscardState *pds)
1757 unsigned long *bitmap;
1758 unsigned long *unsentmap;
1759 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1760 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1761 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1762 unsigned long last = first + (len - 1);
1763 unsigned long run_start;
1765 if (block->page_size == TARGET_PAGE_SIZE) {
1766 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1767 return;
1770 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1771 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1773 if (unsent_pass) {
1774 /* Find a sent page */
1775 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1776 } else {
1777 /* Find a dirty page */
1778 run_start = find_next_bit(bitmap, last + 1, first);
1781 while (run_start <= last) {
1782 bool do_fixup = false;
1783 unsigned long fixup_start_addr;
1784 unsigned long host_offset;
1787 * If the start of this run of pages is in the middle of a host
1788 * page, then we need to fixup this host page.
1790 host_offset = run_start % host_ratio;
1791 if (host_offset) {
1792 do_fixup = true;
1793 run_start -= host_offset;
1794 fixup_start_addr = run_start;
1795 /* For the next pass */
1796 run_start = run_start + host_ratio;
1797 } else {
1798 /* Find the end of this run */
1799 unsigned long run_end;
1800 if (unsent_pass) {
1801 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1802 } else {
1803 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1806 * If the end isn't at the start of a host page, then the
1807 * run doesn't finish at the end of a host page
1808 * and we need to discard.
1810 host_offset = run_end % host_ratio;
1811 if (host_offset) {
1812 do_fixup = true;
1813 fixup_start_addr = run_end - host_offset;
1815 * This host page has gone, the next loop iteration starts
1816 * from after the fixup
1818 run_start = fixup_start_addr + host_ratio;
1819 } else {
1821 * No discards on this iteration, next loop starts from
1822 * next sent/dirty page
1824 run_start = run_end + 1;
1828 if (do_fixup) {
1829 unsigned long page;
1831 /* Tell the destination to discard this page */
1832 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1833 /* For the unsent_pass we:
1834 * discard partially sent pages
1835 * For the !unsent_pass (dirty) we:
1836 * discard partially dirty pages that were sent
1837 * (any partially sent pages were already discarded
1838 * by the previous unsent_pass)
1840 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1841 host_ratio);
1844 /* Clean up the bitmap */
1845 for (page = fixup_start_addr;
1846 page < fixup_start_addr + host_ratio; page++) {
1847 /* All pages in this host page are now not sent */
1848 set_bit(page, unsentmap);
1851 * Remark them as dirty, updating the count for any pages
1852 * that weren't previously dirty.
1854 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1858 if (unsent_pass) {
1859 /* Find the next sent page for the next iteration */
1860 run_start = find_next_zero_bit(unsentmap, last + 1,
1861 run_start);
1862 } else {
1863 /* Find the next dirty page for the next iteration */
1864 run_start = find_next_bit(bitmap, last + 1, run_start);
1870 * postcopy_chunk_hostpages: discard any partially sent host page
1872 * Utility for the outgoing postcopy code.
1874 * Discard any partially sent host-page size chunks, mark any partially
1875 * dirty host-page size chunks as all dirty. In this case the host-page
1876 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1878 * Returns zero on success
1880 * @ms: current migration state
1882 static int postcopy_chunk_hostpages(MigrationState *ms)
1884 RAMState *rs = &ram_state;
1885 struct RAMBlock *block;
1887 /* Easiest way to make sure we don't resume in the middle of a host-page */
1888 rs->last_seen_block = NULL;
1889 rs->last_sent_block = NULL;
1890 rs->last_offset = 0;
1892 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1893 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1895 PostcopyDiscardState *pds =
1896 postcopy_discard_send_init(ms, first, block->idstr);
1898 /* First pass: Discard all partially sent host pages */
1899 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1901 * Second pass: Ensure that all partially dirty host pages are made
1902 * fully dirty.
1904 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1906 postcopy_discard_send_finish(ms, pds);
1907 } /* ram_list loop */
1909 return 0;
1913 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1915 * Returns zero on success
1917 * Transmit the set of pages to be discarded after precopy to the target
1918 * these are pages that:
1919 * a) Have been previously transmitted but are now dirty again
1920 * b) Pages that have never been transmitted, this ensures that
1921 * any pages on the destination that have been mapped by background
1922 * tasks get discarded (transparent huge pages is the specific concern)
1923 * Hopefully this is pretty sparse
1925 * @ms: current migration state
1927 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1929 int ret;
1930 unsigned long *bitmap, *unsentmap;
1932 rcu_read_lock();
1934 /* This should be our last sync, the src is now paused */
1935 migration_bitmap_sync(&ram_state);
1937 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1938 if (!unsentmap) {
1939 /* We don't have a safe way to resize the unsentmap, so
1940 * if the bitmap was resized it will be NULL at this
1941 * point.
1943 error_report("migration ram resized during precopy phase");
1944 rcu_read_unlock();
1945 return -EINVAL;
1948 /* Deal with TPS != HPS and huge pages */
1949 ret = postcopy_chunk_hostpages(ms);
1950 if (ret) {
1951 rcu_read_unlock();
1952 return ret;
1956 * Update the unsentmap to be unsentmap = unsentmap | dirty
1958 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1959 bitmap_or(unsentmap, unsentmap, bitmap,
1960 last_ram_offset() >> TARGET_PAGE_BITS);
1963 trace_ram_postcopy_send_discard_bitmap();
1964 #ifdef DEBUG_POSTCOPY
1965 ram_debug_dump_bitmap(unsentmap, true);
1966 #endif
1968 ret = postcopy_each_ram_send_discard(ms);
1969 rcu_read_unlock();
1971 return ret;
1975 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1977 * Returns zero on success
1979 * @mis: current migration incoming state
1980 * @rbname: name of the RAMBlock of the request. NULL means the
1981 * same as the last one.
1982 * @start: RAMBlock starting page
1983 * @length: RAMBlock size
1985 int ram_discard_range(MigrationIncomingState *mis,
1986 const char *rbname,
1987 uint64_t start, size_t length)
1989 int ret = -1;
1991 trace_ram_discard_range(rbname, start, length);
1993 rcu_read_lock();
1994 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1996 if (!rb) {
1997 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1998 goto err;
2001 ret = ram_block_discard_range(rb, start, length);
2003 err:
2004 rcu_read_unlock();
2006 return ret;
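/*
 * A hypothetical call site, dropping a single host page from a block named
 * "pc.ram"; the block name and 'offset' are made up for illustration only:
 *
 *     if (ram_discard_range(mis, "pc.ram", offset, qemu_host_page_size)) {
 *         error_report("postcopy: discard of one host page failed");
 *     }
 */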
2009 static int ram_save_init_globals(RAMState *rs)
2011 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
2013 rs->dirty_rate_high_cnt = 0;
2014 rs->bitmap_sync_count = 0;
2015 rs->zero_pages = 0;
2016 migration_bitmap_sync_init(rs);
2017 qemu_mutex_init(&migration_bitmap_mutex);
2019 if (migrate_use_xbzrle()) {
2020 XBZRLE_cache_lock();
2021 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
2022 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2023 TARGET_PAGE_SIZE,
2024 TARGET_PAGE_SIZE);
2025 if (!XBZRLE.cache) {
2026 XBZRLE_cache_unlock();
2027 error_report("Error creating cache");
2028 return -1;
2030 XBZRLE_cache_unlock();
2032 /* We prefer not to abort if there is no memory */
2033 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2034 if (!XBZRLE.encoded_buf) {
2035 error_report("Error allocating encoded_buf");
2036 return -1;
2039 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2040 if (!XBZRLE.current_buf) {
2041 error_report("Error allocating current_buf");
2042 g_free(XBZRLE.encoded_buf);
2043 XBZRLE.encoded_buf = NULL;
2044 return -1;
2047 acct_clear();
2050 /* For memory_global_dirty_log_start below. */
2051 qemu_mutex_lock_iothread();
2053 qemu_mutex_lock_ramlist();
2054 rcu_read_lock();
2055 bytes_transferred = 0;
2056 ram_state_reset(rs);
2058 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
2059 /* Skip setting bitmap if there is no RAM */
2060 if (ram_bytes_total()) {
2061 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2062 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2063 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2065 if (migrate_postcopy_ram()) {
2066 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2067 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2072 * Count the total number of pages used by ram blocks not including any
2073 * gaps due to alignment or unplugs.
2075 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2077 memory_global_dirty_log_start();
2078 migration_bitmap_sync(rs);
2079 qemu_mutex_unlock_ramlist();
2080 qemu_mutex_unlock_iothread();
2081 rcu_read_unlock();
2083 return 0;
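/*
 * Rough sizing of the XBZRLE cache created above, using example numbers
 * only (a 64 MiB cache with 4 KiB target pages):
 *
 *     size_t cache_bytes = 64 * 1024 * 1024;   // migrate_xbzrle_cache_size()
 *     size_t cache_slots = cache_bytes / 4096; // 16384 page-sized slots
 *     // cache_init(cache_slots, TARGET_PAGE_SIZE) returns NULL on failure,
 *     // which is treated as a fatal setup error here.
 */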
2087 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2088  * a long-running RCU critical section. When RCU reclaims in the code
2089  * start to become numerous, it will be necessary to reduce the
2090 * granularity of these critical sections.
2094 * ram_save_setup: Setup RAM for migration
2096 * Returns zero to indicate success and negative for error
2098 * @f: QEMUFile where to send the data
2099 * @opaque: RAMState pointer
2101 static int ram_save_setup(QEMUFile *f, void *opaque)
2103 RAMState *rs = opaque;
2104 RAMBlock *block;
2106  /* migration has already set up the bitmap, reuse it. */
2107 if (!migration_in_colo_state()) {
2108 if (ram_save_init_globals(rs) < 0) {
2109 return -1;
2113 rcu_read_lock();
2115 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2117 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2118 qemu_put_byte(f, strlen(block->idstr));
2119 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2120 qemu_put_be64(f, block->used_length);
2121 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2122 qemu_put_be64(f, block->page_size);
2126 rcu_read_unlock();
2128 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2129 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2131 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2133 return 0;
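/*
 * Wire layout produced by the setup above, written here as the destination
 * reads it back (cf. the RAM_SAVE_FLAG_MEM_SIZE branch of ram_load());
 * a summary only:
 *
 *     total = qemu_get_be64(f);                 // ram_bytes_total() | MEM_SIZE
 *     for each block:
 *         len  = qemu_get_byte(f);              // strlen(block->idstr)
 *         qemu_get_buffer(f, (uint8_t *)id, len);
 *         used = qemu_get_be64(f);              // block->used_length
 *         // plus one extra be64 page size when postcopy is enabled and the
 *         // block's page size differs from qemu_host_page_size
 *     eos = qemu_get_be64(f);                   // RAM_SAVE_FLAG_EOS
 */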
2137 * ram_save_iterate: iterative stage for migration
2139 * Returns zero to indicate success and negative for error
2141 * @f: QEMUFile where to send the data
2142 * @opaque: RAMState pointer
2144 static int ram_save_iterate(QEMUFile *f, void *opaque)
2146 RAMState *rs = opaque;
2147 int ret;
2148 int i;
2149 int64_t t0;
2150 int done = 0;
2152 rcu_read_lock();
2153 if (ram_list.version != rs->last_version) {
2154 ram_state_reset(rs);
2157 /* Read version before ram_list.blocks */
2158 smp_rmb();
2160 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2162 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2163 i = 0;
2164 while ((ret = qemu_file_rate_limit(f)) == 0) {
2165 int pages;
2167 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
2168  /* no more pages to send */
2169 if (pages == 0) {
2170 done = 1;
2171 break;
2173 acct_info.iterations++;
2175 /* we want to check in the 1st loop, just in case it was the 1st time
2176 and we had to sync the dirty bitmap.
2177  qemu_clock_get_ns() is a bit expensive, so we only check once every
2178  few iterations
2180 if ((i & 63) == 0) {
2181 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2182 if (t1 > MAX_WAIT) {
2183 trace_ram_save_iterate_big_wait(t1, i);
2184 break;
2187 i++;
2189 flush_compressed_data(f);
2190 rcu_read_unlock();
2193 * Must occur before EOS (or any QEMUFile operation)
2194 * because of RDMA protocol.
2196 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2198 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2199 bytes_transferred += 8;
2201 ret = qemu_file_get_error(f);
2202 if (ret < 0) {
2203 return ret;
2206 return done;
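/*
 * A hypothetical driver for the iterative stage, showing how the return
 * values above are meant to be consumed; the real caller is the generic
 * savevm code, and handle_error() is a placeholder:
 *
 *     do {
 *         ret = ram_save_iterate(f, rs);   // 0: rate limit hit, more to do
 *     } while (ret == 0);
 *     if (ret < 0) {
 *         handle_error(ret);               // QEMUFile or stream error
 *     }                                    // ret == 1: no dirty pages were left
 */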
2210  * ram_save_complete: function called to send the remaining amount of RAM
2212 * Returns zero to indicate success
2214  * Called with the iothread lock held
2216 * @f: QEMUFile where to send the data
2217 * @opaque: RAMState pointer
2219 static int ram_save_complete(QEMUFile *f, void *opaque)
2221 RAMState *rs = opaque;
2223 rcu_read_lock();
2225 if (!migration_in_postcopy(migrate_get_current())) {
2226 migration_bitmap_sync(rs);
2229 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2231 /* try transferring iterative blocks of memory */
2233 /* flush all remaining blocks regardless of rate limiting */
2234 while (true) {
2235 int pages;
2237 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
2238 &bytes_transferred);
2239  /* no more blocks to send */
2240 if (pages == 0) {
2241 break;
2245 flush_compressed_data(f);
2246 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2248 rcu_read_unlock();
2250 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2252 return 0;
2255 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2256 uint64_t *non_postcopiable_pending,
2257 uint64_t *postcopiable_pending)
2259 RAMState *rs = opaque;
2260 uint64_t remaining_size;
2262 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2264 if (!migration_in_postcopy(migrate_get_current()) &&
2265 remaining_size < max_size) {
2266 qemu_mutex_lock_iothread();
2267 rcu_read_lock();
2268 migration_bitmap_sync(rs);
2269 rcu_read_unlock();
2270 qemu_mutex_unlock_iothread();
2271 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2274 /* We can do postcopy, and all the data is postcopiable */
2275 *postcopiable_pending += remaining_size;
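/*
 * Sketch of how the two counters filled in above are usually interpreted by
 * the migration core (an assumption based on the split here; the
 * authoritative logic lives in migration.c):
 *
 *     if (non_postcopiable_pending + postcopiable_pending <= max_size) {
 *         // small enough: stop the guest and run ram_save_complete()
 *     } else if (postcopy_allowed && non_postcopiable_pending <= max_size) {
 *         // only postcopiable data is left: postcopy may start now
 *     }
 */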
2278 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2280 unsigned int xh_len;
2281 int xh_flags;
2282 uint8_t *loaded_data;
2284 if (!xbzrle_decoded_buf) {
2285 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2287 loaded_data = xbzrle_decoded_buf;
2289 /* extract RLE header */
2290 xh_flags = qemu_get_byte(f);
2291 xh_len = qemu_get_be16(f);
2293 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2294 error_report("Failed to load XBZRLE page - wrong compression!");
2295 return -1;
2298 if (xh_len > TARGET_PAGE_SIZE) {
2299 error_report("Failed to load XBZRLE page - len overflow!");
2300 return -1;
2302 /* load data and decode */
2303 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2305 /* decode RLE */
2306 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2307 TARGET_PAGE_SIZE) == -1) {
2308 error_report("Failed to load XBZRLE page - decode error!");
2309 return -1;
2312 return 0;
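/*
 * On-the-wire layout consumed above for one XBZRLE page (a restatement of
 * the reads in this function, not a separate specification):
 *
 *     uint8_t  xh_flags;       // must be ENCODING_FLAG_XBZRLE
 *     uint16_t xh_len;         // big-endian, at most TARGET_PAGE_SIZE
 *     uint8_t  data[xh_len];   // delta stream, decoded against 'host'
 */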
2316 * ram_block_from_stream: read a RAMBlock id from the migration stream
2318  * Must be called from within an RCU critical section.
2320 * Returns a pointer from within the RCU-protected ram_list.
2322 * @f: QEMUFile where to read the data from
2323 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2325 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2327 static RAMBlock *block = NULL;
2328 char id[256];
2329 uint8_t len;
2331 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2332 if (!block) {
2333 error_report("Ack, bad migration stream!");
2334 return NULL;
2336 return block;
2339 len = qemu_get_byte(f);
2340 qemu_get_buffer(f, (uint8_t *)id, len);
2341 id[len] = 0;
2343 block = qemu_ram_block_by_name(id);
2344 if (!block) {
2345 error_report("Can't find block %s", id);
2346 return NULL;
2349 return block;
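/*
 * Block addressing on the wire, as read above: with RAM_SAVE_FLAG_CONTINUE
 * set, the page belongs to the same block as the previous one and no name
 * is sent; otherwise the name follows the page header:
 *
 *     uint8_t len;        // strlen() of the block idstr, at most 255
 *     char    id[len];    // idstr bytes; the reader appends the NUL itself
 */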
2352 static inline void *host_from_ram_block_offset(RAMBlock *block,
2353 ram_addr_t offset)
2355 if (!offset_in_ramblock(block, offset)) {
2356 return NULL;
2359 return block->host + offset;
2363 * ram_handle_compressed: handle the zero page case
2365 * If a page (or a whole RDMA chunk) has been
2366 * determined to be zero, then zap it.
2368 * @host: host address for the zero page
2369  * @ch: the byte the page is filled with; only zero is supported
2370 * @size: size of the zero page
2372 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2374 if (ch != 0 || !is_zero_range(host, size)) {
2375 memset(host, ch, size);
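/*
 * The guard above avoids writing to pages that are already zero, so the
 * destination does not needlessly dirty (and hence allocate) them. The
 * usual caller is the RAM_SAVE_FLAG_COMPRESS path of ram_load():
 *
 *     ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 */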
2379 static void *do_data_decompress(void *opaque)
2381 DecompressParam *param = opaque;
2382 unsigned long pagesize;
2383 uint8_t *des;
2384 int len;
2386 qemu_mutex_lock(&param->mutex);
2387 while (!param->quit) {
2388 if (param->des) {
2389 des = param->des;
2390 len = param->len;
2391 param->des = 0;
2392 qemu_mutex_unlock(&param->mutex);
2394 pagesize = TARGET_PAGE_SIZE;
2395  /* uncompress() can fail in some cases, especially when the
2396  * page was dirtied while being compressed. That is not a
2397  * problem because the dirty page will be retransferred and
2398  * uncompress() won't break the data in other pages.
2400 uncompress((Bytef *)des, &pagesize,
2401 (const Bytef *)param->compbuf, len);
2403 qemu_mutex_lock(&decomp_done_lock);
2404 param->done = true;
2405 qemu_cond_signal(&decomp_done_cond);
2406 qemu_mutex_unlock(&decomp_done_lock);
2408 qemu_mutex_lock(&param->mutex);
2409 } else {
2410 qemu_cond_wait(&param->cond, &param->mutex);
2413 qemu_mutex_unlock(&param->mutex);
2415 return NULL;
2418 static void wait_for_decompress_done(void)
2420 int idx, thread_count;
2422 if (!migrate_use_compression()) {
2423 return;
2426 thread_count = migrate_decompress_threads();
2427 qemu_mutex_lock(&decomp_done_lock);
2428 for (idx = 0; idx < thread_count; idx++) {
2429 while (!decomp_param[idx].done) {
2430 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2433 qemu_mutex_unlock(&decomp_done_lock);
2436 void migrate_decompress_threads_create(void)
2438 int i, thread_count;
2440 thread_count = migrate_decompress_threads();
2441 decompress_threads = g_new0(QemuThread, thread_count);
2442 decomp_param = g_new0(DecompressParam, thread_count);
2443 qemu_mutex_init(&decomp_done_lock);
2444 qemu_cond_init(&decomp_done_cond);
2445 for (i = 0; i < thread_count; i++) {
2446 qemu_mutex_init(&decomp_param[i].mutex);
2447 qemu_cond_init(&decomp_param[i].cond);
2448 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2449 decomp_param[i].done = true;
2450 decomp_param[i].quit = false;
2451 qemu_thread_create(decompress_threads + i, "decompress",
2452 do_data_decompress, decomp_param + i,
2453 QEMU_THREAD_JOINABLE);
2457 void migrate_decompress_threads_join(void)
2459 int i, thread_count;
2461 thread_count = migrate_decompress_threads();
2462 for (i = 0; i < thread_count; i++) {
2463 qemu_mutex_lock(&decomp_param[i].mutex);
2464 decomp_param[i].quit = true;
2465 qemu_cond_signal(&decomp_param[i].cond);
2466 qemu_mutex_unlock(&decomp_param[i].mutex);
2468 for (i = 0; i < thread_count; i++) {
2469 qemu_thread_join(decompress_threads + i);
2470 qemu_mutex_destroy(&decomp_param[i].mutex);
2471 qemu_cond_destroy(&decomp_param[i].cond);
2472 g_free(decomp_param[i].compbuf);
2474 g_free(decompress_threads);
2475 g_free(decomp_param);
2476 decompress_threads = NULL;
2477 decomp_param = NULL;
2480 static void decompress_data_with_multi_threads(QEMUFile *f,
2481 void *host, int len)
2483 int idx, thread_count;
2485 thread_count = migrate_decompress_threads();
2486 qemu_mutex_lock(&decomp_done_lock);
2487 while (true) {
2488 for (idx = 0; idx < thread_count; idx++) {
2489 if (decomp_param[idx].done) {
2490 decomp_param[idx].done = false;
2491 qemu_mutex_lock(&decomp_param[idx].mutex);
2492 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2493 decomp_param[idx].des = host;
2494 decomp_param[idx].len = len;
2495 qemu_cond_signal(&decomp_param[idx].cond);
2496 qemu_mutex_unlock(&decomp_param[idx].mutex);
2497 break;
2500 if (idx < thread_count) {
2501 break;
2502 } else {
2503 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2506 qemu_mutex_unlock(&decomp_done_lock);
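/*
 * Hand-off protocol between the feeder above and do_data_decompress(),
 * written out step by step (a summary of the locking in the code, not new
 * behaviour):
 *
 *     feeder:  done = false; lock(param->mutex); read compbuf from stream;
 *              des = host; len = n; signal(param->cond); unlock(param->mutex);
 *     worker:  wake on param->cond; uncompress(des, ..., compbuf, len);
 *              lock(decomp_done_lock); done = true;
 *              signal(decomp_done_cond); unlock(decomp_done_lock);
 */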
2510 * ram_postcopy_incoming_init: allocate postcopy data structures
2512  * Returns 0 for success and negative on error
2514 * @mis: current migration incoming state
2516  * Allocate the data structures etc. needed by incoming migration with
2517  * postcopy-ram. postcopy-ram's similarly named
2518 * postcopy_ram_incoming_init does the work.
2520 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2522 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2524 return postcopy_ram_incoming_init(mis, ram_pages);
2528 * ram_load_postcopy: load a page in postcopy case
2530 * Returns 0 for success or -errno in case of error
2532 * Called in postcopy mode by ram_load().
2533 * rcu_read_lock is taken prior to this being called.
2535  * @f: QEMUFile to read the data from
2537 static int ram_load_postcopy(QEMUFile *f)
2539 int flags = 0, ret = 0;
2540 bool place_needed = false;
2541 bool matching_page_sizes = false;
2542 MigrationIncomingState *mis = migration_incoming_get_current();
2543 /* Temporary page that is later 'placed' */
2544 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2545 void *last_host = NULL;
2546 bool all_zero = false;
2548 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2549 ram_addr_t addr;
2550 void *host = NULL;
2551 void *page_buffer = NULL;
2552 void *place_source = NULL;
2553 RAMBlock *block = NULL;
2554 uint8_t ch;
2556 addr = qemu_get_be64(f);
2557 flags = addr & ~TARGET_PAGE_MASK;
2558 addr &= TARGET_PAGE_MASK;
2560 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2561 place_needed = false;
2562 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2563 block = ram_block_from_stream(f, flags);
2565 host = host_from_ram_block_offset(block, addr);
2566 if (!host) {
2567 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2568 ret = -EINVAL;
2569 break;
2571 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2573 * Postcopy requires that we place whole host pages atomically;
2574 * these may be huge pages for RAMBlocks that are backed by
2575 * hugetlbfs.
2576 * To make it atomic, the data is read into a temporary page
2577 * that's moved into place later.
2578  * The migration protocol uses, possibly smaller, target pages;
2579  * however, the source ensures it always sends all the components
2580 * of a host page in order.
2582 page_buffer = postcopy_host_page +
2583 ((uintptr_t)host & (block->page_size - 1));
2584 /* If all TP are zero then we can optimise the place */
2585 if (!((uintptr_t)host & (block->page_size - 1))) {
2586 all_zero = true;
2587 } else {
2588 /* not the 1st TP within the HP */
2589 if (host != (last_host + TARGET_PAGE_SIZE)) {
2590 error_report("Non-sequential target page %p/%p",
2591 host, last_host);
2592 ret = -EINVAL;
2593 break;
2599 * If it's the last part of a host page then we place the host
2600 * page
2602 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2603 (block->page_size - 1)) == 0;
2604 place_source = postcopy_host_page;
2606 last_host = host;
2608 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2609 case RAM_SAVE_FLAG_COMPRESS:
2610 ch = qemu_get_byte(f);
2611 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2612 if (ch) {
2613 all_zero = false;
2615 break;
2617 case RAM_SAVE_FLAG_PAGE:
2618 all_zero = false;
2619 if (!place_needed || !matching_page_sizes) {
2620 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2621 } else {
2622  /* Avoid the extra qemu_file copy, since postcopy is going
2623  * to copy the page again when it is placed; this only works
2624  * when we read the whole page in one go (matching page sizes)
2626 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2627 TARGET_PAGE_SIZE);
2629 break;
2630 case RAM_SAVE_FLAG_EOS:
2631 /* normal exit */
2632 break;
2633 default:
2634 error_report("Unknown combination of migration flags: %#x"
2635 " (postcopy mode)", flags);
2636 ret = -EINVAL;
2639 if (place_needed) {
2640 /* This gets called at the last target page in the host page */
2641 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2643 if (all_zero) {
2644 ret = postcopy_place_page_zero(mis, place_dest,
2645 block->page_size);
2646 } else {
2647 ret = postcopy_place_page(mis, place_dest,
2648 place_source, block->page_size);
2651 if (!ret) {
2652 ret = qemu_file_get_error(f);
2656 return ret;
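/*
 * Offset arithmetic used above to assemble a (possibly huge) host page from
 * target-page sized pieces, written out explicitly; the local names below
 * are introduced for illustration only:
 *
 *     size_t in_hp_off  = (uintptr_t)host & (block->page_size - 1);
 *     void  *page_buf   = postcopy_host_page + in_hp_off;   // staging slot
 *     bool   first_tp   = (in_hp_off == 0);                 // may be all zero
 *     bool   last_tp    = (in_hp_off + TARGET_PAGE_SIZE) == block->page_size;
 *     void  *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
 *     // last_tp triggers postcopy_place_page{,_zero}() on place_dest
 */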
2659 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2661 int flags = 0, ret = 0;
2662 static uint64_t seq_iter;
2663 int len = 0;
2665  * If the system is running in postcopy mode, page inserts into host
2666  * memory must be atomic
2668 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2669  /* ADVISE is earlier; it shows the source has the postcopy capability on */
2670 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2672 seq_iter++;
2674 if (version_id != 4) {
2675 ret = -EINVAL;
2678 /* This RCU critical section can be very long running.
2679 * When RCU reclaims in the code start to become numerous,
2680 * it will be necessary to reduce the granularity of this
2681 * critical section.
2683 rcu_read_lock();
2685 if (postcopy_running) {
2686 ret = ram_load_postcopy(f);
2689 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2690 ram_addr_t addr, total_ram_bytes;
2691 void *host = NULL;
2692 uint8_t ch;
2694 addr = qemu_get_be64(f);
2695 flags = addr & ~TARGET_PAGE_MASK;
2696 addr &= TARGET_PAGE_MASK;
2698 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2699 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2700 RAMBlock *block = ram_block_from_stream(f, flags);
2702 host = host_from_ram_block_offset(block, addr);
2703 if (!host) {
2704 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2705 ret = -EINVAL;
2706 break;
2710 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2711 case RAM_SAVE_FLAG_MEM_SIZE:
2712 /* Synchronize RAM block list */
2713 total_ram_bytes = addr;
2714 while (!ret && total_ram_bytes) {
2715 RAMBlock *block;
2716 char id[256];
2717 ram_addr_t length;
2719 len = qemu_get_byte(f);
2720 qemu_get_buffer(f, (uint8_t *)id, len);
2721 id[len] = 0;
2722 length = qemu_get_be64(f);
2724 block = qemu_ram_block_by_name(id);
2725 if (block) {
2726 if (length != block->used_length) {
2727 Error *local_err = NULL;
2729 ret = qemu_ram_resize(block, length,
2730 &local_err);
2731 if (local_err) {
2732 error_report_err(local_err);
2735 /* For postcopy we need to check hugepage sizes match */
2736 if (postcopy_advised &&
2737 block->page_size != qemu_host_page_size) {
2738 uint64_t remote_page_size = qemu_get_be64(f);
2739 if (remote_page_size != block->page_size) {
2740 error_report("Mismatched RAM page size %s "
2741 "(local) %zd != %" PRId64,
2742 id, block->page_size,
2743 remote_page_size);
2744 ret = -EINVAL;
2747 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2748 block->idstr);
2749 } else {
2750 error_report("Unknown ramblock \"%s\", cannot "
2751 "accept migration", id);
2752 ret = -EINVAL;
2755 total_ram_bytes -= length;
2757 break;
2759 case RAM_SAVE_FLAG_COMPRESS:
2760 ch = qemu_get_byte(f);
2761 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2762 break;
2764 case RAM_SAVE_FLAG_PAGE:
2765 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2766 break;
2768 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2769 len = qemu_get_be32(f);
2770 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2771 error_report("Invalid compressed data length: %d", len);
2772 ret = -EINVAL;
2773 break;
2775 decompress_data_with_multi_threads(f, host, len);
2776 break;
2778 case RAM_SAVE_FLAG_XBZRLE:
2779 if (load_xbzrle(f, addr, host) < 0) {
2780 error_report("Failed to decompress XBZRLE page at "
2781 RAM_ADDR_FMT, addr);
2782 ret = -EINVAL;
2783 break;
2785 break;
2786 case RAM_SAVE_FLAG_EOS:
2787 /* normal exit */
2788 break;
2789 default:
2790 if (flags & RAM_SAVE_FLAG_HOOK) {
2791 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2792 } else {
2793 error_report("Unknown combination of migration flags: %#x",
2794 flags);
2795 ret = -EINVAL;
2798 if (!ret) {
2799 ret = qemu_file_get_error(f);
2803 wait_for_decompress_done();
2804 rcu_read_unlock();
2805 trace_ram_load_complete(ret, seq_iter);
2806 return ret;
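/*
 * Shape of one record in the precopy stream, as parsed by the loop above
 * (a restatement of the reads in this function):
 *
 *     be64 addr|flags;              // flags live in the sub-page bits
 *     [u8 len, id[len]]             // only for page-carrying flags and only
 *                                   // when RAM_SAVE_FLAG_CONTINUE is clear
 *     payload by flag:
 *       MEM_SIZE       -> per-block idstr/used_length records (see above)
 *       COMPRESS       -> one fill byte
 *       PAGE           -> TARGET_PAGE_SIZE raw bytes
 *       COMPRESS_PAGE  -> be32 len followed by len zlib-compressed bytes
 *       XBZRLE         -> see load_xbzrle()
 *       EOS            -> nothing; ends the section
 */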
2809 static SaveVMHandlers savevm_ram_handlers = {
2810 .save_live_setup = ram_save_setup,
2811 .save_live_iterate = ram_save_iterate,
2812 .save_live_complete_postcopy = ram_save_complete,
2813 .save_live_complete_precopy = ram_save_complete,
2814 .save_live_pending = ram_save_pending,
2815 .load_state = ram_load,
2816 .cleanup = ram_migration_cleanup,
2819 void ram_mig_init(void)
2821 qemu_mutex_init(&XBZRLE.lock);
2822 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);