migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "migration/postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46
/* Debug tracing helper.  When DEBUG_MIGRATION_RAM is defined at build time,
 * DPRINTF() prints its printf-style arguments to stdout, prefixed with
 * "migration_ram: ".  Otherwise it expands to an empty statement, so the
 * arguments are neither evaluated nor type-checked.
 */
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
  54
/* Number of measurement periods in which the guest dirtied memory faster
 * than the auto-converge threshold; bumped by migration_bitmap_sync() and
 * reset to 0 there whenever throttling is (re)triggered.
 */
static int dirty_rate_high_cnt;

/* Number of times migration_bitmap_sync() has run; also passed to the
 * XBZRLE page cache calls in this file.
 */
static uint64_t bitmap_sync_count;
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/* Flags OR'ed into the low bits of the page offset written by
 * save_page_header().
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02 /* used by save_zero_page(): header is
                                       followed by a single byte (0) */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* header is followed by a raw page */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* same block as the previous page: the
                                       block id string is omitted */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* header is followed by XBZRLE data */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* header is followed by the
                                                compressed page data */
  71
/* An all-zero page (static storage is zero-initialised); inserted into the
 * XBZRLE cache by xbzrle_cache_zero_page().
 */
static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
/* Sender-side XBZRLE state: the page cache plus the scratch buffers used
 * while encoding a page.
 */
static struct {
    /* buffer used for XBZRLE encoding (receives the encoded output) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (snapshot of the page being encoded) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  93
  94 static void XBZRLE_cache_lock(void)
  95 {
  96     if (migrate_use_xbzrle())
  97         qemu_mutex_lock(&XBZRLE.lock);
  98 }
  99
 100 static void XBZRLE_cache_unlock(void)
 101 {
 102     if (migrate_use_xbzrle())
 103         qemu_mutex_unlock(&XBZRLE.lock);
 104 }
 105
 106 /*
 107  * called from qmp_migrate_set_cache_size in main thread, possibly while
 108  * a migration is in progress.
 109  * A running migration maybe using the cache and might finish during this
 110  * call, hence changes to the cache are protected by XBZRLE.lock().
 111  */
 112 int64_t xbzrle_cache_resize(int64_t new_size)
 113 {
 114     PageCache *new_cache;
 115     int64_t ret;
 116
 117     if (new_size < TARGET_PAGE_SIZE) {
 118         return -1;
 119     }
 120
 121     XBZRLE_cache_lock();
 122
 123     if (XBZRLE.cache != NULL) {
 124         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 125             goto out_new_size;
 126         }
 127         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 128                                         TARGET_PAGE_SIZE);
 129         if (!new_cache) {
 130             error_report("Error creating cache");
 131             ret = -1;
 132             goto out;
 133         }
 134
 135         cache_fini(XBZRLE.cache);
 136         XBZRLE.cache = new_cache;
 137     }
 138
 139 out_new_size:
 140     ret = pow2floor(new_size);
 141 out:
 142     XBZRLE_cache_unlock();
 143     return ret;
 144 }
 145
/* accounting for migration statistics */
typedef struct AccountingInfo {
    /* pages sent as zero pages (also counted when the control path
     * reports a page handled with 0 bytes transmitted) */
    uint64_t dup_pages;
    uint64_t skipped_pages;
    /* pages sent with their full (raw or compressed) content */
    uint64_t norm_pages;
    /* used as the denominator of the XBZRLE cache miss rate */
    uint64_t iterations;
    /* bytes put on the wire for XBZRLE-encoded pages, headers included */
    uint64_t xbzrle_bytes;
    /* pages sent XBZRLE-encoded */
    uint64_t xbzrle_pages;
    /* lookups that did not find the page in the XBZRLE cache */
    uint64_t xbzrle_cache_miss;
    /* misses per iteration over the last measurement period */
    double xbzrle_cache_miss_rate;
    /* pages whose XBZRLE encoding did not fit and were sent as normal */
    uint64_t xbzrle_overflows;
} AccountingInfo;

/* The single, file-wide statistics instance */
static AccountingInfo acct_info;
 160
 161 static void acct_clear(void)
 162 {
 163     memset(&acct_info, 0, sizeof(acct_info));
 164 }
 165
 166 uint64_t dup_mig_bytes_transferred(void)
 167 {
 168     return acct_info.dup_pages * TARGET_PAGE_SIZE;
 169 }
 170
 171 uint64_t dup_mig_pages_transferred(void)
 172 {
 173     return acct_info.dup_pages;
 174 }
 175
 176 uint64_t skipped_mig_bytes_transferred(void)
 177 {
 178     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 179 }
 180
 181 uint64_t skipped_mig_pages_transferred(void)
 182 {
 183     return acct_info.skipped_pages;
 184 }
 185
 186 uint64_t norm_mig_bytes_transferred(void)
 187 {
 188     return acct_info.norm_pages * TARGET_PAGE_SIZE;
 189 }
 190
 191 uint64_t norm_mig_pages_transferred(void)
 192 {
 193     return acct_info.norm_pages;
 194 }
 195
 196 uint64_t xbzrle_mig_bytes_transferred(void)
 197 {
 198     return acct_info.xbzrle_bytes;
 199 }
 200
 201 uint64_t xbzrle_mig_pages_transferred(void)
 202 {
 203     return acct_info.xbzrle_pages;
 204 }
 205
 206 uint64_t xbzrle_mig_pages_cache_miss(void)
 207 {
 208     return acct_info.xbzrle_cache_miss;
 209 }
 210
 211 double xbzrle_mig_cache_miss_rate(void)
 212 {
 213     return acct_info.xbzrle_cache_miss_rate;
 214 }
 215
 216 uint64_t xbzrle_mig_pages_overflow(void)
 217 {
 218     return acct_info.xbzrle_overflows;
 219 }
 220
/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data; a page from the
 * same block is sent with RAM_SAVE_FLAG_CONTINUE set */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
/* Held while the dirty log is merged into the migration bitmap */
static QemuMutex migration_bitmap_mutex;
/* Number of pages currently marked dirty in the migration bitmap */
static uint64_t migration_dirty_pages;
static uint32_t last_version;
/* While set, the dirty page search skips the bitmap lookup for pages past
 * the first one of a block, and XBZRLE is not used */
static bool ram_bulk_stage;
 231
/* used by the search for pages to send; also handed to the page save
 * functions to identify the page (block + offset) to transmit */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current offset to search from */
    ram_addr_t   offset;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 242
/* The migration bitmaps.  Readers in this file fetch the pointer with
 * atomic_rcu_read(); the embedded rcu_head suggests the structure is
 * reclaimed through RCU (NOTE(review): the freeing side is outside this
 * chunk - confirm).
 */
static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;
 254
/* Per-thread state shared between the migration thread and one compression
 * worker (see do_data_compress()).
 */
struct CompressParam {
    /* true when the worker is idle and its output can be collected;
     * accessed under comp_done_lock */
    bool done;
    /* set to ask the worker to exit; accessed under mutex */
    bool quit;
    /* dummy QEMUFile used as a buffer for the compressed output */
    QEMUFile *file;
    /* protects quit, block and offset */
    QemuMutex mutex;
    /* signalled when a new request is posted or quit is set */
    QemuCond cond;
    /* pending request: block of the page to compress, NULL if none */
    RAMBlock *block;
    /* pending request: offset (with flags) of the page within block */
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 265
/* Per-thread state for one decompression worker.  The code using it is
 * outside this chunk; the field layout mirrors CompressParam.
 * NOTE(review): field roles below are inferred from names - confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;          /* presumably the destination for the decompressed data */
    uint8_t *compbuf;   /* presumably the compressed input buffer */
    int len;            /* presumably the number of valid bytes in compbuf */
};
typedef struct DecompressParam DecompressParam;
 276
/* Arrays with one entry per compression thread; allocated in
 * migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Set to true when the compression threads are created */
static bool compression_switch;
/* Decompression-side counterparts of the variables above */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: needed by do_data_compress() below */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 296
/*
 * do_data_compress: thread body of one compression worker
 *
 * Loops until param->quit is set.  Whenever a request is pending
 * (param->block != NULL) the worker takes it, compresses the page into
 * param->file with do_compress_ram_page(), then sets param->done and
 * signals comp_done_cond.  With no request pending it sleeps on
 * param->cond.
 *
 * Returns: always NULL
 *
 * @opaque: the CompressParam belonging to this worker
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Copy the request and clear the slot while holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            do_compress_ram_page(param->file, block, offset);

            /* done is updated under comp_done_lock, the lock paired with
             * comp_done_cond
             */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 327
 328 static inline void terminate_compression_threads(void)
 329 {
 330     int idx, thread_count;
 331
 332     thread_count = migrate_compress_threads();
 333     for (idx = 0; idx < thread_count; idx++) {
 334         qemu_mutex_lock(&comp_param[idx].mutex);
 335         comp_param[idx].quit = true;
 336         qemu_cond_signal(&comp_param[idx].cond);
 337         qemu_mutex_unlock(&comp_param[idx].mutex);
 338     }
 339 }
 340
 341 void migrate_compress_threads_join(void)
 342 {
 343     int i, thread_count;
 344
 345     if (!migrate_use_compression()) {
 346         return;
 347     }
 348     terminate_compression_threads();
 349     thread_count = migrate_compress_threads();
 350     for (i = 0; i < thread_count; i++) {
 351         qemu_thread_join(compress_threads + i);
 352         qemu_fclose(comp_param[i].file);
 353         qemu_mutex_destroy(&comp_param[i].mutex);
 354         qemu_cond_destroy(&comp_param[i].cond);
 355     }
 356     qemu_mutex_destroy(&comp_done_lock);
 357     qemu_cond_destroy(&comp_done_cond);
 358     g_free(compress_threads);
 359     g_free(comp_param);
 360     compress_threads = NULL;
 361     comp_param = NULL;
 362 }
 363
 364 void migrate_compress_threads_create(void)
 365 {
 366     int i, thread_count;
 367
 368     if (!migrate_use_compression()) {
 369         return;
 370     }
 371     compression_switch = true;
 372     thread_count = migrate_compress_threads();
 373     compress_threads = g_new0(QemuThread, thread_count);
 374     comp_param = g_new0(CompressParam, thread_count);
 375     qemu_cond_init(&comp_done_cond);
 376     qemu_mutex_init(&comp_done_lock);
 377     for (i = 0; i < thread_count; i++) {
 378         /* comp_param[i].file is just used as a dummy buffer to save data,
 379          * set its ops to empty.
 380          */
 381         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 382         comp_param[i].done = true;
 383         comp_param[i].quit = false;
 384         qemu_mutex_init(&comp_param[i].mutex);
 385         qemu_cond_init(&comp_param[i].cond);
 386         qemu_thread_create(compress_threads + i, "compress",
 387                            do_data_compress, comp_param + i,
 388                            QEMU_THREAD_JOINABLE);
 389     }
 390 }
 391
 392 /**
 393  * save_page_header: Write page header to wire
 394  *
 395  * If this is the 1st block, it also writes the block identification
 396  *
 397  * Returns: Number of bytes written
 398  *
 399  * @f: QEMUFile where to send the data
 400  * @block: block that contains the page we want to send
 401  * @offset: offset inside the block for the page
 402  *          in the lower bits, it contains flags
 403  */
 404 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 405 {
 406     size_t size, len;
 407
 408     qemu_put_be64(f, offset);
 409     size = 8;
 410
 411     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 412         len = strlen(block->idstr);
 413         qemu_put_byte(f, len);
 414         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 415         size += 1 + len;
 416     }
 417     return size;
 418 }
 419
 420 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 421  * If guest dirty memory rate is reduced below the rate at which we can
 422  * transfer pages to the destination then we should be able to complete
 423  * migration. Some workloads dirty memory way too fast and will not effectively
 424  * converge, even with auto-converge.
 425  */
 426 static void mig_throttle_guest_down(void)
 427 {
 428     MigrationState *s = migrate_get_current();
 429     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 430     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 431
 432     /* We have not started throttling yet. Let's start it. */
 433     if (!cpu_throttle_active()) {
 434         cpu_throttle_set(pct_initial);
 435     } else {
 436         /* Throttling already on, just increase the rate */
 437         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 438     }
 439 }
 440
 441 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
 442  * The important thing is that a stale (not-yet-0'd) page be replaced
 443  * by the new data.
 444  * As a bonus, if the page wasn't in the cache it gets added so that
 445  * when a small write is made into the 0'd page it gets XBZRLE sent
 446  */
 447 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 448 {
 449     if (ram_bulk_stage || !migrate_use_xbzrle()) {
 450         return;
 451     }
 452
 453     /* We don't care if this fails to allocate a new cache page
 454      * as long as it updated an old one */
 455     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 456                  bitmap_sync_count);
 457 }
 458
/* Encoding byte written right after the page header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *             (also returned on a cache miss; the caller then sends the
 *             page as a normal page)
 *
 * @f: QEMUFile where to send the data
 * @current_data: in/out pointer to the page contents; on return it may have
 *                been redirected to the copy held in the XBZRLE cache
 * @current_addr: address used as the key of the page in the XBZRLE cache
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage; the cache is not
 *              updated in that case
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: nothing to diff against, so the page cannot be encoded */
    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory, so that encoding works on a
     * private snapshot of the page */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache and hand the cached copy back to the
         * caller, which will send it as a normal page */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    /* + 1 for the encoding byte, + 2 for the 16-bit length */
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}
 539
 540 /* Called with rcu_read_lock() to protect migration_bitmap
 541  * rb: The RAMBlock  to search for dirty pages in
 542  * start: Start address (typically so we can continue from previous page)
 543  * ram_addr_abs: Pointer into which to store the address of the dirty page
 544  *               within the global ram_addr space
 545  *
 546  * Returns: byte offset within memory region of the start of a dirty page
 547  */
 548 static inline
 549 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 550                                        ram_addr_t start,
 551                                        ram_addr_t *ram_addr_abs)
 552 {
 553     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 554     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 555     uint64_t rb_size = rb->used_length;
 556     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 557     unsigned long *bitmap;
 558
 559     unsigned long next;
 560
 561     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 562     if (ram_bulk_stage && nr > base) {
 563         next = nr + 1;
 564     } else {
 565         next = find_next_bit(bitmap, size, nr);
 566     }
 567
 568     *ram_addr_abs = next << TARGET_PAGE_BITS;
 569     return (next - base) << TARGET_PAGE_BITS;
 570 }
 571
 572 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 573 {
 574     bool ret;
 575     int nr = addr >> TARGET_PAGE_BITS;
 576     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 577
 578     ret = test_and_clear_bit(nr, bitmap);
 579
 580     if (ret) {
 581         migration_dirty_pages--;
 582     }
 583     return ret;
 584 }
 585
 586 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 587 {
 588     unsigned long *bitmap;
 589     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 590     migration_dirty_pages +=
 591         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 592 }
 593
/* Fix me: there are too many global variables used in migration process. */
/* State of the current measurement period of migration_bitmap_sync(): */
/* start of the period, in ms of QEMU_CLOCK_REALTIME (0 = not started) */
static int64_t start_time;
/* ram_bytes_transferred() at the previous auto-converge check */
static int64_t bytes_xfer_prev;
/* pages found dirty since the period started */
static int64_t num_dirty_pages_period;
/* acct_info.xbzrle_cache_miss at the end of the previous period */
static uint64_t xbzrle_cache_miss_prev;
/* acct_info.iterations at the end of the previous period */
static uint64_t iterations_prev;
 600
 601 static void migration_bitmap_sync_init(void)
 602 {
 603     start_time = 0;
 604     bytes_xfer_prev = 0;
 605     num_dirty_pages_period = 0;
 606     xbzrle_cache_miss_prev = 0;
 607     iterations_prev = 0;
 608 }
 609
 610 static void migration_bitmap_sync(void)
 611 {
 612     RAMBlock *block;
 613     uint64_t num_dirty_pages_init = migration_dirty_pages;
 614     MigrationState *s = migrate_get_current();
 615     int64_t end_time;
 616     int64_t bytes_xfer_now;
 617
 618     bitmap_sync_count++;
 619
 620     if (!bytes_xfer_prev) {
 621         bytes_xfer_prev = ram_bytes_transferred();
 622     }
 623
 624     if (!start_time) {
 625         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 626     }
 627
 628     trace_migration_bitmap_sync_start();
 629     memory_global_dirty_log_sync();
 630
 631     qemu_mutex_lock(&migration_bitmap_mutex);
 632     rcu_read_lock();
 633     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 634         migration_bitmap_sync_range(block->offset, block->used_length);
 635     }
 636     rcu_read_unlock();
 637     qemu_mutex_unlock(&migration_bitmap_mutex);
 638
 639     trace_migration_bitmap_sync_end(migration_dirty_pages
 640                                     - num_dirty_pages_init);
 641     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 642     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 643
 644     /* more than 1 second = 1000 millisecons */
 645     if (end_time > start_time + 1000) {
 646         if (migrate_auto_converge()) {
 647             /* The following detection logic can be refined later. For now:
 648                Check to see if the dirtied bytes is 50% more than the approx.
 649                amount of bytes that just got transferred since the last time we
 650                were in this routine. If that happens twice, start or increase
 651                throttling */
 652             bytes_xfer_now = ram_bytes_transferred();
 653
 654             if (s->dirty_pages_rate &&
 655                (num_dirty_pages_period * TARGET_PAGE_SIZE >
 656                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
 657                (dirty_rate_high_cnt++ >= 2)) {
 658                     trace_migration_throttle();
 659                     dirty_rate_high_cnt = 0;
 660                     mig_throttle_guest_down();
 661              }
 662              bytes_xfer_prev = bytes_xfer_now;
 663         }
 664
 665         if (migrate_use_xbzrle()) {
 666             if (iterations_prev != acct_info.iterations) {
 667                 acct_info.xbzrle_cache_miss_rate =
 668                    (double)(acct_info.xbzrle_cache_miss -
 669                             xbzrle_cache_miss_prev) /
 670                    (acct_info.iterations - iterations_prev);
 671             }
 672             iterations_prev = acct_info.iterations;
 673             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 674         }
 675         s->dirty_pages_rate = num_dirty_pages_period * 1000
 676             / (end_time - start_time);
 677         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 678         start_time = end_time;
 679         num_dirty_pages_period = 0;
 680     }
 681     s->dirty_sync_count = bitmap_sync_count;
 682     if (migrate_use_events()) {
 683         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 684     }
 685 }
 686
 687 /**
 688  * save_zero_page: Send the zero page to the stream
 689  *
 690  * Returns: Number of pages written.
 691  *
 692  * @f: QEMUFile where to send the data
 693  * @block: block that contains the page we want to send
 694  * @offset: offset inside the block for the page
 695  * @p: pointer to the page
 696  * @bytes_transferred: increase it with the number of transferred bytes
 697  */
 698 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 699                           uint8_t *p, uint64_t *bytes_transferred)
 700 {
 701     int pages = -1;
 702
 703     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 704         acct_info.dup_pages++;
 705         *bytes_transferred += save_page_header(f, block,
 706                                                offset | RAM_SAVE_FLAG_COMPRESS);
 707         qemu_put_byte(f, 0);
 708         *bytes_transferred += 1;
 709         pages = 1;
 710     }
 711
 712     return pages;
 713 }
 714
/**
 * ram_save_page: Send the given page to the stream
 *
 * The page is first offered to ram_control_save_page().  Only when that
 * reports RAM_SAVE_CONTROL_NOT_SUPP is the page written here: as a zero
 * page, as an XBZRLE delta, or as a normal full page, in that order of
 * preference.
 *
 * Returns: Number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @f: QEMUFile where to send the data
 * @pss: identifies the page to send: pss->block is the block that contains
 *       the page, pss->offset the offset of the page inside that block
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* When in doubt, the page is sent as a normal page (pages stays -1) */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    /* Address of the page as used for the XBZRLE cache */
    current_addr = block->offset + offset;

    /* Same block as the previous page: the header can omit the block id */
    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* ram_control_save_page() dealt with the page; only account for it */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage &&
                   !migration_in_postcopy(migrate_get_current()) &&
                   migrate_use_xbzrle()) {
            /* May redirect p to the copy stored in the XBZRLE cache */
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
 806
 807 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 808                                 ram_addr_t offset)
 809 {
 810     int bytes_sent, blen;
 811     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 812
 813     bytes_sent = save_page_header(f, block, offset |
 814                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 815     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 816                                      migrate_compress_level());
 817     if (blen < 0) {
 818         bytes_sent = 0;
 819         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 820         error_report("compressed data failed!");
 821     } else {
 822         bytes_sent += blen;
 823     }
 824
 825     return bytes_sent;
 826 }
 827
/* File-wide count of transferred bytes; flush_compressed_data() adds the
 * bytes it flushes to it.
 */
static uint64_t bytes_transferred;
 829
 830 static void flush_compressed_data(QEMUFile *f)
 831 {
 832     int idx, len, thread_count;
 833
 834     if (!migrate_use_compression()) {
 835         return;
 836     }
 837     thread_count = migrate_compress_threads();
 838
 839     qemu_mutex_lock(&comp_done_lock);
 840     for (idx = 0; idx < thread_count; idx++) {
 841         while (!comp_param[idx].done) {
 842             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 843         }
 844     }
 845     qemu_mutex_unlock(&comp_done_lock);
 846
 847     for (idx = 0; idx < thread_count; idx++) {
 848         qemu_mutex_lock(&comp_param[idx].mutex);
 849         if (!comp_param[idx].quit) {
 850             len = qemu_put_qemu_file(f, comp_param[idx].file);
 851             bytes_transferred += len;
 852         }
 853         qemu_mutex_unlock(&comp_param[idx].mutex);
 854     }
 855 }
 856
 857 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 858                                        ram_addr_t offset)
 859 {
 860     param->block = block;
 861     param->offset = offset;
 862 }
 863
/*
 * compress_page_with_multi_thread: hand a page to an idle compression worker
 *
 * Looks for a worker whose done flag is set, drains that worker's buffered
 * output into @f, then posts the new (block, offset) request to it and
 * wakes it up.  If no worker is idle, waits on comp_done_cond until one
 * finishes.
 *
 * Returns: Number of pages accounted; always 1, since the loop only exits
 *          after a worker accepted the page.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @bytes_transferred: increased by the bytes drained from the worker's
 *                     buffer, i.e. the output of the page that worker
 *                     compressed previously
 */
static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                /* Claim the worker while still holding comp_done_lock */
                comp_param[idx].done = false;
                /* Collect what the worker produced for its previous page */
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                /* The request fields are protected by the worker's mutex */
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            /* Every worker is busy: sleep until one signals completion */
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
 897
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written, or -1 if this call wrote no page
 *          (including the case where compressing the page failed).
 *
 * @f: QEMUFile where to send the data
 * @pss: search status; pss->block is the block that contains the page and
 *       pss->offset is the offset of the page inside that block
 * @last_stage: if we are at the completion stage (not used by this function)
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
                                    bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    /* Host address of the page to send */
    p = block->host + offset;

    /* Offer the page to the ram_control hook first */
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /*
         * Any result other than NOT_SUPP skips the compression path;
         * only the page statistics are updated, and not at all for
         * RAM_SAVE_CONTROL_DELAYED.
         */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                /* Compressed synchronously here, not by a worker thread */
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    *bytes_transferred += bytes_xmit + blen;
                    acct_info.norm_pages++;
                    pages = 1;
                } else {
                    /* pages stays -1; the error is recorded on the stream */
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
        } else {
            /* Same block as the last page sent: flag it as a continuation */
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}
 973
 974 /*
 975  * Find the next dirty page and update any state associated with
 976  * the search process.
 977  *
 978  * Returns: True if a page is found
 979  *
 980  * @f: Current migration stream.
 981  * @pss: Data about the state of the current dirty page scan.
 982  * @*again: Set to false if the search has scanned the whole of RAM
 983  * *ram_addr_abs: Pointer into which to store the address of the dirty page
 984  *               within the global ram_addr space
 985  */
 986 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
 987                              bool *again, ram_addr_t *ram_addr_abs)
 988 {
 989     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
 990                                               ram_addr_abs);
 991     if (pss->complete_round && pss->block == last_seen_block &&
 992         pss->offset >= last_offset) {
 993         /*
 994          * We've been once around the RAM and haven't found anything.
 995          * Give up.
 996          */
 997         *again = false;
 998         return false;
 999     }
1000     if (pss->offset >= pss->block->used_length) {
1001         /* Didn't find anything in this RAM Block */
1002         pss->offset = 0;
1003         pss->block = QLIST_NEXT_RCU(pss->block, next);
1004         if (!pss->block) {
1005             /* Hit the end of the list */
1006             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1007             /* Flag that we've looped */
1008             pss->complete_round = true;
1009             ram_bulk_stage = false;
1010             if (migrate_use_xbzrle()) {
1011                 /* If xbzrle is on, stop using the data compression at this
1012                  * point. In theory, xbzrle can do better than compression.
1013                  */
1014                 flush_compressed_data(f);
1015                 compression_switch = false;
1016             }
1017         }
1018         /* Didn't find anything this time, but try again on the new block */
1019         *again = true;
1020         return false;
1021     } else {
1022         /* Can go around again, but... */
1023         *again = true;
1024         /* We've found something so probably don't need to */
1025         return true;
1026     }
1027 }
1028
1029 /*
1030  * Helper for 'get_queued_page' - gets a page off the queue
1031  *      ms:      MigrationState in
1032  * *offset:      Used to return the offset within the RAMBlock
1033  * ram_addr_abs: global offset in the dirty/sent bitmaps
1034  *
1035  * Returns:      block (or NULL if none available)
1036  */
1037 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1038                               ram_addr_t *ram_addr_abs)
1039 {
1040     RAMBlock *block = NULL;
1041
1042     qemu_mutex_lock(&ms->src_page_req_mutex);
1043     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1044         struct MigrationSrcPageRequest *entry =
1045                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1046         block = entry->rb;
1047         *offset = entry->offset;
1048         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1049                         TARGET_PAGE_MASK;
1050
1051         if (entry->len > TARGET_PAGE_SIZE) {
1052             entry->len -= TARGET_PAGE_SIZE;
1053             entry->offset += TARGET_PAGE_SIZE;
1054         } else {
1055             memory_region_unref(block->mr);
1056             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1057             g_free(entry);
1058         }
1059     }
1060     qemu_mutex_unlock(&ms->src_page_req_mutex);
1061
1062     return block;
1063 }
1064
1065 /*
1066  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1067  * that are already sent (!dirty)
1068  *
1069  *      ms:      MigrationState in
1070  *     pss:      PageSearchStatus structure updated with found block/offset
1071  * ram_addr_abs: global offset in the dirty/sent bitmaps
1072  *
1073  * Returns:      true if a queued page is found
1074  */
1075 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1076                             ram_addr_t *ram_addr_abs)
1077 {
1078     RAMBlock  *block;
1079     ram_addr_t offset;
1080     bool dirty;
1081
1082     do {
1083         block = unqueue_page(ms, &offset, ram_addr_abs);
1084         /*
1085          * We're sending this page, and since it's postcopy nothing else
1086          * will dirty it, and we must make sure it doesn't get sent again
1087          * even if this queue request was received after the background
1088          * search already sent it.
1089          */
1090         if (block) {
1091             unsigned long *bitmap;
1092             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1093             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1094             if (!dirty) {
1095                 trace_get_queued_page_not_dirty(
1096                     block->idstr, (uint64_t)offset,
1097                     (uint64_t)*ram_addr_abs,
1098                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1099                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1100             } else {
1101                 trace_get_queued_page(block->idstr,
1102                                       (uint64_t)offset,
1103                                       (uint64_t)*ram_addr_abs);
1104             }
1105         }
1106
1107     } while (block && !dirty);
1108
1109     if (block) {
1110         /*
1111          * As soon as we start servicing pages out of order, then we have
1112          * to kill the bulk stage, since the bulk stage assumes
1113          * in (migration_bitmap_find_and_reset_dirty) that every page is
1114          * dirty, that's no longer true.
1115          */
1116         ram_bulk_stage = false;
1117
1118         /*
1119          * We want the background search to continue from the queued page
1120          * since the guest is likely to want other pages near to the page
1121          * it just requested.
1122          */
1123         pss->block = block;
1124         pss->offset = offset;
1125     }
1126
1127     return !!block;
1128 }
1129
1130 /**
1131  * flush_page_queue: Flush any remaining pages in the ram request queue
1132  *    it should be empty at the end anyway, but in error cases there may be
1133  *    some left.
1134  *
1135  * ms: MigrationState
1136  */
1137 void flush_page_queue(MigrationState *ms)
1138 {
1139     struct MigrationSrcPageRequest *mspr, *next_mspr;
1140     /* This queue generally should be empty - but in the case of a failed
1141      * migration might have some droppings in.
1142      */
1143     rcu_read_lock();
1144     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1145         memory_region_unref(mspr->rb->mr);
1146         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1147         g_free(mspr);
1148     }
1149     rcu_read_unlock();
1150 }
1151
1152 /**
1153  * Queue the pages for transmission, e.g. a request from postcopy destination
1154  *   ms: MigrationStatus in which the queue is held
1155  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1156  *   start: Offset from the start of the RAMBlock
1157  *   len: Length (in bytes) to send
1158  *   Return: 0 on success
1159  */
1160 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1161                          ram_addr_t start, ram_addr_t len)
1162 {
1163     RAMBlock *ramblock;
1164
1165     ms->postcopy_requests++;
1166     rcu_read_lock();
1167     if (!rbname) {
1168         /* Reuse last RAMBlock */
1169         ramblock = ms->last_req_rb;
1170
1171         if (!ramblock) {
1172             /*
1173              * Shouldn't happen, we can't reuse the last RAMBlock if
1174              * it's the 1st request.
1175              */
1176             error_report("ram_save_queue_pages no previous block");
1177             goto err;
1178         }
1179     } else {
1180         ramblock = qemu_ram_block_by_name(rbname);
1181
1182         if (!ramblock) {
1183             /* We shouldn't be asked for a non-existent RAMBlock */
1184             error_report("ram_save_queue_pages no block '%s'", rbname);
1185             goto err;
1186         }
1187         ms->last_req_rb = ramblock;
1188     }
1189     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1190     if (start+len > ramblock->used_length) {
1191         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1192                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1193                      __func__, start, len, ramblock->used_length);
1194         goto err;
1195     }
1196
1197     struct MigrationSrcPageRequest *new_entry =
1198         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1199     new_entry->rb = ramblock;
1200     new_entry->offset = start;
1201     new_entry->len = len;
1202
1203     memory_region_ref(ramblock->mr);
1204     qemu_mutex_lock(&ms->src_page_req_mutex);
1205     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1206     qemu_mutex_unlock(&ms->src_page_req_mutex);
1207     rcu_read_unlock();
1208
1209     return 0;
1210
1211 err:
1212     rcu_read_unlock();
1213     return -1;
1214 }
1215
1216 /**
1217  * ram_save_target_page: Save one target page
1218  *
1219  *
1220  * @f: QEMUFile where to send the data
1221  * @block: pointer to block that contains the page we want to send
1222  * @offset: offset inside the block for the page;
1223  * @last_stage: if we are at the completion stage
1224  * @bytes_transferred: increase it with the number of transferred bytes
1225  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1226  *
1227  * Returns: Number of pages written.
1228  */
1229 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1230                                 PageSearchStatus *pss,
1231                                 bool last_stage,
1232                                 uint64_t *bytes_transferred,
1233                                 ram_addr_t dirty_ram_abs)
1234 {
1235     int res = 0;
1236
1237     /* Check the pages is dirty and if it is send it */
1238     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1239         unsigned long *unsentmap;
1240         if (compression_switch && migrate_use_compression()) {
1241             res = ram_save_compressed_page(f, pss,
1242                                            last_stage,
1243                                            bytes_transferred);
1244         } else {
1245             res = ram_save_page(f, pss, last_stage,
1246                                 bytes_transferred);
1247         }
1248
1249         if (res < 0) {
1250             return res;
1251         }
1252         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1253         if (unsentmap) {
1254             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1255         }
1256         /* Only update last_sent_block if a block was actually sent; xbzrle
1257          * might have decided the page was identical so didn't bother writing
1258          * to the stream.
1259          */
1260         if (res > 0) {
1261             last_sent_block = pss->block;
1262         }
1263     }
1264
1265     return res;
1266 }
1267
1268 /**
1269  * ram_save_host_page: Starting at *offset send pages up to the end
1270  *                     of the current host page.  It's valid for the initial
1271  *                     offset to point into the middle of a host page
1272  *                     in which case the remainder of the hostpage is sent.
1273  *                     Only dirty target pages are sent.
1274  *
1275  * Returns: Number of pages written.
1276  *
1277  * @f: QEMUFile where to send the data
1278  * @block: pointer to block that contains the page we want to send
1279  * @offset: offset inside the block for the page; updated to last target page
1280  *          sent
1281  * @last_stage: if we are at the completion stage
1282  * @bytes_transferred: increase it with the number of transferred bytes
1283  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1284  */
1285 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1286                               PageSearchStatus *pss,
1287                               bool last_stage,
1288                               uint64_t *bytes_transferred,
1289                               ram_addr_t dirty_ram_abs)
1290 {
1291     int tmppages, pages = 0;
1292     do {
1293         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1294                                         bytes_transferred, dirty_ram_abs);
1295         if (tmppages < 0) {
1296             return tmppages;
1297         }
1298
1299         pages += tmppages;
1300         pss->offset += TARGET_PAGE_SIZE;
1301         dirty_ram_abs += TARGET_PAGE_SIZE;
1302     } while (pss->offset & (qemu_host_page_size - 1));
1303
1304     /* The offset we leave with is the last one we looked at */
1305     pss->offset -= TARGET_PAGE_SIZE;
1306     return pages;
1307 }
1308
1309 /**
1310  * ram_find_and_save_block: Finds a dirty page and sends it to f
1311  *
1312  * Called within an RCU critical section.
1313  *
1314  * Returns:  The number of pages written
1315  *           0 means no dirty pages
1316  *
1317  * @f: QEMUFile where to send the data
1318  * @last_stage: if we are at the completion stage
1319  * @bytes_transferred: increase it with the number of transferred bytes
1320  *
1321  * On systems where host-page-size > target-page-size it will send all the
1322  * pages in a host page that are dirty.
1323  */
1324
1325 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1326                                    uint64_t *bytes_transferred)
1327 {
1328     PageSearchStatus pss;
1329     MigrationState *ms = migrate_get_current();
1330     int pages = 0;
1331     bool again, found;
1332     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1333                                  ram_addr_t space */
1334
1335     pss.block = last_seen_block;
1336     pss.offset = last_offset;
1337     pss.complete_round = false;
1338
1339     if (!pss.block) {
1340         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1341     }
1342
1343     do {
1344         again = true;
1345         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1346
1347         if (!found) {
1348             /* priority queue empty, so just search for something dirty */
1349             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1350         }
1351
1352         if (found) {
1353             pages = ram_save_host_page(ms, f, &pss,
1354                                        last_stage, bytes_transferred,
1355                                        dirty_ram_abs);
1356         }
1357     } while (!pages && again);
1358
1359     last_seen_block = pss.block;
1360     last_offset = pss.offset;
1361
1362     return pages;
1363 }
1364
1365 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1366 {
1367     uint64_t pages = size / TARGET_PAGE_SIZE;
1368     if (zero) {
1369         acct_info.dup_pages += pages;
1370     } else {
1371         acct_info.norm_pages += pages;
1372         bytes_transferred += size;
1373         qemu_update_position(f, size);
1374     }
1375 }
1376
/* Returns the current value of the migration_dirty_pages counter. */
static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}
1381
/*
 * Returns the remaining dirty page count (see ram_save_remaining())
 * multiplied by TARGET_PAGE_SIZE, i.e. expressed in bytes.
 */
uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}
1386
/* Returns the file-level bytes_transferred counter. */
uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}
1391
1392 uint64_t ram_bytes_total(void)
1393 {
1394     RAMBlock *block;
1395     uint64_t total = 0;
1396
1397     rcu_read_lock();
1398     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1399         total += block->used_length;
1400     rcu_read_unlock();
1401     return total;
1402 }
1403
/*
 * Release xbzrle_decoded_buf and reset the pointer to NULL so that a
 * later call (or a later check of the pointer) sees no stale buffer.
 */
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}
1409
/*
 * Free a BitmapRcu together with the two bitmaps it owns (bmap and
 * unsentmap).  Within this file it is passed to call_rcu() as the
 * reclaim function for a replaced or retired bitmap.
 */
static void migration_bitmap_free(struct BitmapRcu *bmap)
{
    g_free(bmap->bmap);
    g_free(bmap->unsentmap);
    g_free(bmap);
}
1416
1417 static void ram_migration_cleanup(void *opaque)
1418 {
1419     /* caller have hold iothread lock or is in a bh, so there is
1420      * no writing race against this migration_bitmap
1421      */
1422     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1423     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1424     if (bitmap) {
1425         memory_global_dirty_log_stop();
1426         call_rcu(bitmap, migration_bitmap_free, rcu);
1427     }
1428
1429     XBZRLE_cache_lock();
1430     if (XBZRLE.cache) {
1431         cache_fini(XBZRLE.cache);
1432         g_free(XBZRLE.encoded_buf);
1433         g_free(XBZRLE.current_buf);
1434         XBZRLE.cache = NULL;
1435         XBZRLE.encoded_buf = NULL;
1436         XBZRLE.current_buf = NULL;
1437     }
1438     XBZRLE_cache_unlock();
1439 }
1440
/*
 * Put the file-level scan state back to its starting values: no block
 * seen or sent yet, offset 0, bulk stage enabled, and last_version
 * resynchronised with the current ram_list.version.
 */
static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}
1449
/* Wait limit in milliseconds; half of the buffered_file limit */
#define MAX_WAIT 50
1451
/*
 * Replace the migration bitmap with a larger one.
 *
 * @old: number of bits in the current bitmap
 * @new: number of bits the replacement must hold
 *
 * The first @old bits are copied over and the added bits [@old, @new) are
 * all set, with migration_dirty_pages increased by the same amount.  The
 * replacement has no unsentmap.  Does nothing if no migration bitmap
 * exists.  The previous bitmap is released through call_rcu().
 */
void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
{
    /* called in qemu main thread, so there is
     * no writing race against this migration_bitmap
     */
    if (migration_bitmap_rcu) {
        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
        bitmap = g_new(struct BitmapRcu, 1);
        bitmap->bmap = bitmap_new(new);

        /* prevent migration_bitmap content from being set bit
         * by migration_bitmap_sync_range() at the same time.
         * it is safe to migration if migration_bitmap is cleared bit
         * at the same time.
         */
        qemu_mutex_lock(&migration_bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        /* Everything beyond the old size starts out dirty */
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the sentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        /* Publish the new bitmap while still holding the mutex */
        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
        qemu_mutex_unlock(&migration_bitmap_mutex);
        migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}
1483
1484 /*
1485  * 'expected' is the value you expect the bitmap mostly to be full
1486  * of; it won't bother printing lines that are all this value.
1487  * If 'todump' is null the migration bitmap is dumped.
1488  */
1489 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1490 {
1491     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1492
1493     int64_t cur;
1494     int64_t linelen = 128;
1495     char linebuf[129];
1496
1497     if (!todump) {
1498         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1499     }
1500
1501     for (cur = 0; cur < ram_pages; cur += linelen) {
1502         int64_t curb;
1503         bool found = false;
1504         /*
1505          * Last line; catch the case where the line length
1506          * is longer than remaining ram
1507          */
1508         if (cur + linelen > ram_pages) {
1509             linelen = ram_pages - cur;
1510         }
1511         for (curb = 0; curb < linelen; curb++) {
1512             bool thisbit = test_bit(cur + curb, todump);
1513             linebuf[curb] = thisbit ? '1' : '.';
1514             found = found || (thisbit != expected);
1515         }
1516         if (found) {
1517             linebuf[curb] = '\0';
1518             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1519         }
1520     }
1521 }
1522
1523 /* **** functions for postcopy ***** */
1524
1525 /*
1526  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1527  * Note: At this point the 'unsentmap' is the processed bitmap combined
1528  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1529  * start,length: Indexes into the bitmap for the first bit
1530  *            representing the named block and length in target-pages
1531  */
1532 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1533                                         PostcopyDiscardState *pds,
1534                                         unsigned long start,
1535                                         unsigned long length)
1536 {
1537     unsigned long end = start + length; /* one after the end */
1538     unsigned long current;
1539     unsigned long *unsentmap;
1540
1541     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1542     for (current = start; current < end; ) {
1543         unsigned long one = find_next_bit(unsentmap, end, current);
1544
1545         if (one <= end) {
1546             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1547             unsigned long discard_length;
1548
1549             if (zero >= end) {
1550                 discard_length = end - one;
1551             } else {
1552                 discard_length = zero - one;
1553             }
1554             if (discard_length) {
1555                 postcopy_discard_send_range(ms, pds, one, discard_length);
1556             }
1557             current = one + discard_length;
1558         } else {
1559             current = one;
1560         }
1561     }
1562
1563     return 0;
1564 }
1565
1566 /*
1567  * Utility for the outgoing postcopy code.
1568  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1569  *   passing it bitmap indexes and name.
1570  * Returns: 0 on success
1571  * (qemu_ram_foreach_block ends up passing unscaled lengths
1572  *  which would mean postcopy code would have to deal with target page)
1573  */
1574 static int postcopy_each_ram_send_discard(MigrationState *ms)
1575 {
1576     struct RAMBlock *block;
1577     int ret;
1578
1579     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1580         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1581         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1582                                                                first,
1583                                                                block->idstr);
1584
1585         /*
1586          * Postcopy sends chunks of bitmap over the wire, but it
1587          * just needs indexes at this point, avoids it having
1588          * target page specific code.
1589          */
1590         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1591                                     block->used_length >> TARGET_PAGE_BITS);
1592         postcopy_discard_send_finish(ms, pds);
1593         if (ret) {
1594             return ret;
1595         }
1596     }
1597
1598     return 0;
1599 }
1600
/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 *   the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 * unsent_pass=true: Cleans up partially unsent host pages by searching
 *                 the unsentmap
 * unsent_pass=false: Cleans up partially dirty host pages by searching
 *                 the main migration bitmap
 *
 * ms:    MigrationState, passed on to postcopy_discard_send_range
 * block: RAMBlock whose range of the bitmaps is processed
 * pds:   discard state, passed on to postcopy_discard_send_range
 *
 * Whenever a host page needs fixing up, every target page in it is
 * marked unsent and dirty, and migration_dirty_pages is increased for
 * each page that was not dirty before.
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    /* Number of target pages per host page */
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    /* Bitmap index of the first and last target page of the block */
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    /* The searches are bounded by last + 1, which also ends the loop */
    while (run_start <= last) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}
1723
1724 /*
1725  * Utility for the outgoing postcopy code.
1726  *
1727  * Discard any partially sent host-page size chunks, mark any partially
1728  * dirty host-page size chunks as all dirty.
1729  *
1730  * Returns: 0 on success
1731  */
1732 static int postcopy_chunk_hostpages(MigrationState *ms)
1733 {
1734     struct RAMBlock *block;
1735
1736     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1737         /* Easy case - TPS==HPS - nothing to be done */
1738         return 0;
1739     }
1740
1741     /* Easiest way to make sure we don't resume in the middle of a host-page */
1742     last_seen_block = NULL;
1743     last_sent_block = NULL;
1744     last_offset     = 0;
1745
1746     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1747         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1748
1749         PostcopyDiscardState *pds =
1750                          postcopy_discard_send_init(ms, first, block->idstr);
1751
1752         /* First pass: Discard all partially sent host pages */
1753         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1754         /*
1755          * Second pass: Ensure that all partially dirty host pages are made
1756          * fully dirty.
1757          */
1758         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1759
1760         postcopy_discard_send_finish(ms, pds);
1761     } /* ram_list loop */
1762
1763     return 0;
1764 }
1765
1766 /*
1767  * Transmit the set of pages to be discarded after precopy to the target
1768  * these are pages that:
1769  *     a) Have been previously transmitted but are now dirty again
1770  *     b) Pages that have never been transmitted, this ensures that
1771  *        any pages on the destination that have been mapped by background
1772  *        tasks get discarded (transparent huge pages is the specific concern)
1773  * Hopefully this is pretty sparse
1774  */
1775 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1776 {
1777     int ret;
1778     unsigned long *bitmap, *unsentmap;
1779
1780     rcu_read_lock();
1781
1782     /* This should be our last sync, the src is now paused */
1783     migration_bitmap_sync();
1784
1785     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1786     if (!unsentmap) {
1787         /* We don't have a safe way to resize the sentmap, so
1788          * if the bitmap was resized it will be NULL at this
1789          * point.
1790          */
1791         error_report("migration ram resized during precopy phase");
1792         rcu_read_unlock();
1793         return -EINVAL;
1794     }
1795
1796     /* Deal with TPS != HPS */
1797     ret = postcopy_chunk_hostpages(ms);
1798     if (ret) {
1799         rcu_read_unlock();
1800         return ret;
1801     }
1802
1803     /*
1804      * Update the unsentmap to be unsentmap = unsentmap | dirty
1805      */
1806     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1807     bitmap_or(unsentmap, unsentmap, bitmap,
1808                last_ram_offset() >> TARGET_PAGE_BITS);
1809
1810
1811     trace_ram_postcopy_send_discard_bitmap();
1812 #ifdef DEBUG_POSTCOPY
1813     ram_debug_dump_bitmap(unsentmap, true);
1814 #endif
1815
1816     ret = postcopy_each_ram_send_discard(ms);
1817     rcu_read_unlock();
1818
1819     return ret;
1820 }
1821
1822 /*
1823  * At the start of the postcopy phase of migration, any now-dirty
1824  * precopied pages are discarded.
1825  *
1826  * start, length describe a byte address range within the RAMBlock
1827  *
1828  * Returns 0 on success.
1829  */
1830 int ram_discard_range(MigrationIncomingState *mis,
1831                       const char *block_name,
1832                       uint64_t start, size_t length)
1833 {
1834     int ret = -1;
1835
1836     rcu_read_lock();
1837     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1838
1839     if (!rb) {
1840         error_report("ram_discard_range: Failed to find block '%s'",
1841                      block_name);
1842         goto err;
1843     }
1844
1845     uint8_t *host_startaddr = rb->host + start;
1846
1847     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1848         error_report("ram_discard_range: Unaligned start address: %p",
1849                      host_startaddr);
1850         goto err;
1851     }
1852
1853     if ((start + length) <= rb->used_length) {
1854         uint8_t *host_endaddr = host_startaddr + length;
1855         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1856             error_report("ram_discard_range: Unaligned end address: %p",
1857                          host_endaddr);
1858             goto err;
1859         }
1860         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1861     } else {
1862         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1863                      "/%zx/" RAM_ADDR_FMT")",
1864                      block_name, start, length, rb->used_length);
1865     }
1866
1867 err:
1868     rcu_read_unlock();
1869
1870     return ret;
1871 }
1872
1873
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
1879
/*
 * Prepare the outgoing RAM migration state and write the RAM section
 * header to the stream.
 *
 * Resets the dirty-rate counters, allocates the XBZRLE cache and buffers
 * when XBZRLE is enabled, allocates the migration bitmap with every page
 * marked dirty (plus an all-set unsentmap when postcopy is enabled),
 * starts global dirty logging and performs a first bitmap sync.  It then
 * writes the total RAM size tagged with RAM_SAVE_FLAG_MEM_SIZE, followed
 * by the id string and used_length of each RAMBlock, and finally
 * RAM_SAVE_FLAG_EOS.
 *
 * f: stream to write to
 * opaque: not used by this function
 *
 * Returns 0 on success, -1 if one of the XBZRLE allocations fails.
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();
    qemu_mutex_init(&migration_bitmap_mutex);

    if (migrate_use_xbzrle()) {
        /* The cache pointer is only written while the cache lock is held */
        XBZRLE_cache_lock();
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            /*
             * NOTE(review): XBZRLE.cache stays allocated on this path;
             * presumably the cleanup handler releases it - confirm.
             */
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    /* Start with every page marked dirty so that all of RAM gets sent */
    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);

    if (migrate_postcopy_ram()) {
        /* Nothing has been sent yet, so every page starts out unsent */
        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    /* The RCU read lock stays held for the block walk below */
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    /* Header: total RAM size, with the MEM_SIZE flag in the low bits */
    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    /* One record per block: id length (one byte), id, used length */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
1966
1967 static int ram_save_iterate(QEMUFile *f, void *opaque)
1968 {
1969     int ret;
1970     int i;
1971     int64_t t0;
1972     int pages_sent = 0;
1973
1974     rcu_read_lock();
1975     if (ram_list.version != last_version) {
1976         reset_ram_globals();
1977     }
1978
1979     /* Read version before ram_list.blocks */
1980     smp_rmb();
1981
1982     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1983
1984     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1985     i = 0;
1986     while ((ret = qemu_file_rate_limit(f)) == 0) {
1987         int pages;
1988
1989         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1990         /* no more pages to sent */
1991         if (pages == 0) {
1992             break;
1993         }
1994         pages_sent += pages;
1995         acct_info.iterations++;
1996
1997         /* we want to check in the 1st loop, just in case it was the 1st time
1998            and we had to sync the dirty bitmap.
1999            qemu_get_clock_ns() is a bit expensive, so we only check each some
2000            iterations
2001         */
2002         if ((i & 63) == 0) {
2003             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2004             if (t1 > MAX_WAIT) {
2005                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2006                         t1, i);
2007                 break;
2008             }
2009         }
2010         i++;
2011     }
2012     flush_compressed_data(f);
2013     rcu_read_unlock();
2014
2015     /*
2016      * Must occur before EOS (or any QEMUFile operation)
2017      * because of RDMA protocol.
2018      */
2019     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2020
2021     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2022     bytes_transferred += 8;
2023
2024     ret = qemu_file_get_error(f);
2025     if (ret < 0) {
2026         return ret;
2027     }
2028
2029     return pages_sent;
2030 }
2031
2032 /* Called with iothread lock */
2033 static int ram_save_complete(QEMUFile *f, void *opaque)
2034 {
2035     rcu_read_lock();
2036
2037     if (!migration_in_postcopy(migrate_get_current())) {
2038         migration_bitmap_sync();
2039     }
2040
2041     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2042
2043     /* try transferring iterative blocks of memory */
2044
2045     /* flush all remaining blocks regardless of rate limiting */
2046     while (true) {
2047         int pages;
2048
2049         pages = ram_find_and_save_block(f, true, &bytes_transferred);
2050         /* no more blocks to sent */
2051         if (pages == 0) {
2052             break;
2053         }
2054     }
2055
2056     flush_compressed_data(f);
2057     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2058
2059     rcu_read_unlock();
2060
2061     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2062
2063     return 0;
2064 }
2065
2066 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2067                              uint64_t *non_postcopiable_pending,
2068                              uint64_t *postcopiable_pending)
2069 {
2070     uint64_t remaining_size;
2071
2072     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2073
2074     if (!migration_in_postcopy(migrate_get_current()) &&
2075         remaining_size < max_size) {
2076         qemu_mutex_lock_iothread();
2077         rcu_read_lock();
2078         migration_bitmap_sync();
2079         rcu_read_unlock();
2080         qemu_mutex_unlock_iothread();
2081         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2082     }
2083
2084     /* We can do postcopy, and all the data is postcopiable */
2085     *postcopiable_pending += remaining_size;
2086 }
2087
2088 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2089 {
2090     unsigned int xh_len;
2091     int xh_flags;
2092     uint8_t *loaded_data;
2093
2094     if (!xbzrle_decoded_buf) {
2095         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2096     }
2097     loaded_data = xbzrle_decoded_buf;
2098
2099     /* extract RLE header */
2100     xh_flags = qemu_get_byte(f);
2101     xh_len = qemu_get_be16(f);
2102
2103     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2104         error_report("Failed to load XBZRLE page - wrong compression!");
2105         return -1;
2106     }
2107
2108     if (xh_len > TARGET_PAGE_SIZE) {
2109         error_report("Failed to load XBZRLE page - len overflow!");
2110         return -1;
2111     }
2112     /* load data and decode */
2113     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2114
2115     /* decode RLE */
2116     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2117                              TARGET_PAGE_SIZE) == -1) {
2118         error_report("Failed to load XBZRLE page - decode error!");
2119         return -1;
2120     }
2121
2122     return 0;
2123 }
2124
2125 /* Must be called from within a rcu critical section.
2126  * Returns a pointer from within the RCU-protected ram_list.
2127  */
2128 /*
2129  * Read a RAMBlock ID from the stream f.
2130  *
2131  * f: Stream to read from
2132  * flags: Page flags (mostly to see if it's a continuation of previous block)
2133  */
2134 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2135                                               int flags)
2136 {
2137     static RAMBlock *block = NULL;
2138     char id[256];
2139     uint8_t len;
2140
2141     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2142         if (!block) {
2143             error_report("Ack, bad migration stream!");
2144             return NULL;
2145         }
2146         return block;
2147     }
2148
2149     len = qemu_get_byte(f);
2150     qemu_get_buffer(f, (uint8_t *)id, len);
2151     id[len] = 0;
2152
2153     block = qemu_ram_block_by_name(id);
2154     if (!block) {
2155         error_report("Can't find block %s", id);
2156         return NULL;
2157     }
2158
2159     return block;
2160 }
2161
2162 static inline void *host_from_ram_block_offset(RAMBlock *block,
2163                                                ram_addr_t offset)
2164 {
2165     if (!offset_in_ramblock(block, offset)) {
2166         return NULL;
2167     }
2168
2169     return block->host + offset;
2170 }
2171
/*
 * Fill a page (or a whole RDMA chunk) with the single byte value ch.
 *
 * When ch is zero and the range already reads as zero, the memory is left
 * untouched so that it is not written needlessly.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2182
2183 static void *do_data_decompress(void *opaque)
2184 {
2185     DecompressParam *param = opaque;
2186     unsigned long pagesize;
2187     uint8_t *des;
2188     int len;
2189
2190     qemu_mutex_lock(&param->mutex);
2191     while (!param->quit) {
2192         if (param->des) {
2193             des = param->des;
2194             len = param->len;
2195             param->des = 0;
2196             qemu_mutex_unlock(&param->mutex);
2197
2198             pagesize = TARGET_PAGE_SIZE;
2199             /* uncompress() will return failed in some case, especially
2200              * when the page is dirted when doing the compression, it's
2201              * not a problem because the dirty page will be retransferred
2202              * and uncompress() won't break the data in other pages.
2203              */
2204             uncompress((Bytef *)des, &pagesize,
2205                        (const Bytef *)param->compbuf, len);
2206
2207             qemu_mutex_lock(&decomp_done_lock);
2208             param->done = true;
2209             qemu_cond_signal(&decomp_done_cond);
2210             qemu_mutex_unlock(&decomp_done_lock);
2211
2212             qemu_mutex_lock(&param->mutex);
2213         } else {
2214             qemu_cond_wait(&param->cond, &param->mutex);
2215         }
2216     }
2217     qemu_mutex_unlock(&param->mutex);
2218
2219     return NULL;
2220 }
2221
2222 static void wait_for_decompress_done(void)
2223 {
2224     int idx, thread_count;
2225
2226     if (!migrate_use_compression()) {
2227         return;
2228     }
2229
2230     thread_count = migrate_decompress_threads();
2231     qemu_mutex_lock(&decomp_done_lock);
2232     for (idx = 0; idx < thread_count; idx++) {
2233         while (!decomp_param[idx].done) {
2234             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2235         }
2236     }
2237     qemu_mutex_unlock(&decomp_done_lock);
2238 }
2239
2240 void migrate_decompress_threads_create(void)
2241 {
2242     int i, thread_count;
2243
2244     thread_count = migrate_decompress_threads();
2245     decompress_threads = g_new0(QemuThread, thread_count);
2246     decomp_param = g_new0(DecompressParam, thread_count);
2247     qemu_mutex_init(&decomp_done_lock);
2248     qemu_cond_init(&decomp_done_cond);
2249     for (i = 0; i < thread_count; i++) {
2250         qemu_mutex_init(&decomp_param[i].mutex);
2251         qemu_cond_init(&decomp_param[i].cond);
2252         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2253         decomp_param[i].done = true;
2254         decomp_param[i].quit = false;
2255         qemu_thread_create(decompress_threads + i, "decompress",
2256                            do_data_decompress, decomp_param + i,
2257                            QEMU_THREAD_JOINABLE);
2258     }
2259 }
2260
2261 void migrate_decompress_threads_join(void)
2262 {
2263     int i, thread_count;
2264
2265     thread_count = migrate_decompress_threads();
2266     for (i = 0; i < thread_count; i++) {
2267         qemu_mutex_lock(&decomp_param[i].mutex);
2268         decomp_param[i].quit = true;
2269         qemu_cond_signal(&decomp_param[i].cond);
2270         qemu_mutex_unlock(&decomp_param[i].mutex);
2271     }
2272     for (i = 0; i < thread_count; i++) {
2273         qemu_thread_join(decompress_threads + i);
2274         qemu_mutex_destroy(&decomp_param[i].mutex);
2275         qemu_cond_destroy(&decomp_param[i].cond);
2276         g_free(decomp_param[i].compbuf);
2277     }
2278     g_free(decompress_threads);
2279     g_free(decomp_param);
2280     decompress_threads = NULL;
2281     decomp_param = NULL;
2282 }
2283
2284 static void decompress_data_with_multi_threads(QEMUFile *f,
2285                                                void *host, int len)
2286 {
2287     int idx, thread_count;
2288
2289     thread_count = migrate_decompress_threads();
2290     qemu_mutex_lock(&decomp_done_lock);
2291     while (true) {
2292         for (idx = 0; idx < thread_count; idx++) {
2293             if (decomp_param[idx].done) {
2294                 decomp_param[idx].done = false;
2295                 qemu_mutex_lock(&decomp_param[idx].mutex);
2296                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2297                 decomp_param[idx].des = host;
2298                 decomp_param[idx].len = len;
2299                 qemu_cond_signal(&decomp_param[idx].cond);
2300                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2301                 break;
2302             }
2303         }
2304         if (idx < thread_count) {
2305             break;
2306         } else {
2307             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2308         }
2309     }
2310     qemu_mutex_unlock(&decomp_done_lock);
2311 }
2312
2313 /*
2314  * Allocate data structures etc needed by incoming migration with postcopy-ram
2315  * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
2316  */
2317 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2318 {
2319     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2320
2321     return postcopy_ram_incoming_init(mis, ram_pages);
2322 }
2323
2324 /*
2325  * Called in postcopy mode by ram_load().
2326  * rcu_read_lock is taken prior to this being called.
2327  */
2328 static int ram_load_postcopy(QEMUFile *f)
2329 {
2330     int flags = 0, ret = 0;
2331     bool place_needed = false;
2332     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2333     MigrationIncomingState *mis = migration_incoming_get_current();
2334     /* Temporary page that is later 'placed' */
2335     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2336     void *last_host = NULL;
2337     bool all_zero = false;
2338
2339     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2340         ram_addr_t addr;
2341         void *host = NULL;
2342         void *page_buffer = NULL;
2343         void *place_source = NULL;
2344         uint8_t ch;
2345
2346         addr = qemu_get_be64(f);
2347         flags = addr & ~TARGET_PAGE_MASK;
2348         addr &= TARGET_PAGE_MASK;
2349
2350         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2351         place_needed = false;
2352         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2353             RAMBlock *block = ram_block_from_stream(f, flags);
2354
2355             host = host_from_ram_block_offset(block, addr);
2356             if (!host) {
2357                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2358                 ret = -EINVAL;
2359                 break;
2360             }
2361             /*
2362              * Postcopy requires that we place whole host pages atomically.
2363              * To make it atomic, the data is read into a temporary page
2364              * that's moved into place later.
2365              * The migration protocol uses,  possibly smaller, target-pages
2366              * however the source ensures it always sends all the components
2367              * of a host page in order.
2368              */
2369             page_buffer = postcopy_host_page +
2370                           ((uintptr_t)host & ~qemu_host_page_mask);
2371             /* If all TP are zero then we can optimise the place */
2372             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2373                 all_zero = true;
2374             } else {
2375                 /* not the 1st TP within the HP */
2376                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2377                     error_report("Non-sequential target page %p/%p",
2378                                   host, last_host);
2379                     ret = -EINVAL;
2380                     break;
2381                 }
2382             }
2383
2384
2385             /*
2386              * If it's the last part of a host page then we place the host
2387              * page
2388              */
2389             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2390                                      ~qemu_host_page_mask) == 0;
2391             place_source = postcopy_host_page;
2392         }
2393         last_host = host;
2394
2395         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2396         case RAM_SAVE_FLAG_COMPRESS:
2397             ch = qemu_get_byte(f);
2398             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2399             if (ch) {
2400                 all_zero = false;
2401             }
2402             break;
2403
2404         case RAM_SAVE_FLAG_PAGE:
2405             all_zero = false;
2406             if (!place_needed || !matching_page_sizes) {
2407                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2408             } else {
2409                 /* Avoids the qemu_file copy during postcopy, which is
2410                  * going to do a copy later; can only do it when we
2411                  * do this read in one go (matching page sizes)
2412                  */
2413                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2414                                          TARGET_PAGE_SIZE);
2415             }
2416             break;
2417         case RAM_SAVE_FLAG_EOS:
2418             /* normal exit */
2419             break;
2420         default:
2421             error_report("Unknown combination of migration flags: %#x"
2422                          " (postcopy mode)", flags);
2423             ret = -EINVAL;
2424         }
2425
2426         if (place_needed) {
2427             /* This gets called at the last target page in the host page */
2428             if (all_zero) {
2429                 ret = postcopy_place_page_zero(mis,
2430                                                host + TARGET_PAGE_SIZE -
2431                                                qemu_host_page_size);
2432             } else {
2433                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2434                                                qemu_host_page_size,
2435                                                place_source);
2436             }
2437         }
2438         if (!ret) {
2439             ret = qemu_file_get_error(f);
2440         }
2441     }
2442
2443     return ret;
2444 }
2445
2446 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2447 {
2448     int flags = 0, ret = 0;
2449     static uint64_t seq_iter;
2450     int len = 0;
2451     /*
2452      * If system is running in postcopy mode, page inserts to host memory must
2453      * be atomic
2454      */
2455     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2456
2457     seq_iter++;
2458
2459     if (version_id != 4) {
2460         ret = -EINVAL;
2461     }
2462
2463     /* This RCU critical section can be very long running.
2464      * When RCU reclaims in the code start to become numerous,
2465      * it will be necessary to reduce the granularity of this
2466      * critical section.
2467      */
2468     rcu_read_lock();
2469
2470     if (postcopy_running) {
2471         ret = ram_load_postcopy(f);
2472     }
2473
2474     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2475         ram_addr_t addr, total_ram_bytes;
2476         void *host = NULL;
2477         uint8_t ch;
2478
2479         addr = qemu_get_be64(f);
2480         flags = addr & ~TARGET_PAGE_MASK;
2481         addr &= TARGET_PAGE_MASK;
2482
2483         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2484                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2485             RAMBlock *block = ram_block_from_stream(f, flags);
2486
2487             host = host_from_ram_block_offset(block, addr);
2488             if (!host) {
2489                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2490                 ret = -EINVAL;
2491                 break;
2492             }
2493         }
2494
2495         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2496         case RAM_SAVE_FLAG_MEM_SIZE:
2497             /* Synchronize RAM block list */
2498             total_ram_bytes = addr;
2499             while (!ret && total_ram_bytes) {
2500                 RAMBlock *block;
2501                 char id[256];
2502                 ram_addr_t length;
2503
2504                 len = qemu_get_byte(f);
2505                 qemu_get_buffer(f, (uint8_t *)id, len);
2506                 id[len] = 0;
2507                 length = qemu_get_be64(f);
2508
2509                 block = qemu_ram_block_by_name(id);
2510                 if (block) {
2511                     if (length != block->used_length) {
2512                         Error *local_err = NULL;
2513
2514                         ret = qemu_ram_resize(block, length,
2515                                               &local_err);
2516                         if (local_err) {
2517                             error_report_err(local_err);
2518                         }
2519                     }
2520                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2521                                           block->idstr);
2522                 } else {
2523                     error_report("Unknown ramblock \"%s\", cannot "
2524                                  "accept migration", id);
2525                     ret = -EINVAL;
2526                 }
2527
2528                 total_ram_bytes -= length;
2529             }
2530             break;
2531
2532         case RAM_SAVE_FLAG_COMPRESS:
2533             ch = qemu_get_byte(f);
2534             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2535             break;
2536
2537         case RAM_SAVE_FLAG_PAGE:
2538             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2539             break;
2540
2541         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2542             len = qemu_get_be32(f);
2543             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2544                 error_report("Invalid compressed data length: %d", len);
2545                 ret = -EINVAL;
2546                 break;
2547             }
2548             decompress_data_with_multi_threads(f, host, len);
2549             break;
2550
2551         case RAM_SAVE_FLAG_XBZRLE:
2552             if (load_xbzrle(f, addr, host) < 0) {
2553                 error_report("Failed to decompress XBZRLE page at "
2554                              RAM_ADDR_FMT, addr);
2555                 ret = -EINVAL;
2556                 break;
2557             }
2558             break;
2559         case RAM_SAVE_FLAG_EOS:
2560             /* normal exit */
2561             break;
2562         default:
2563             if (flags & RAM_SAVE_FLAG_HOOK) {
2564                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2565             } else {
2566                 error_report("Unknown combination of migration flags: %#x",
2567                              flags);
2568                 ret = -EINVAL;
2569             }
2570         }
2571         if (!ret) {
2572             ret = qemu_file_get_error(f);
2573         }
2574     }
2575
2576     wait_for_decompress_done();
2577     rcu_read_unlock();
2578     DPRINTF("Completed load of VM with exit code %d seq iteration "
2579             "%" PRIu64 "\n", ret, seq_iter);
2580     return ret;
2581 }
2582
2583 static SaveVMHandlers savevm_ram_handlers = {
2584     .save_live_setup = ram_save_setup,
2585     .save_live_iterate = ram_save_iterate,
2586     .save_live_complete_postcopy = ram_save_complete,
2587     .save_live_complete_precopy = ram_save_complete,
2588     .save_live_pending = ram_save_pending,
2589     .load_state = ram_load,
2590     .cleanup = ram_migration_cleanup,
2591 };
2592
/*
 * Initialise the XBZRLE lock and register savevm_ram_handlers as the
 * live-migration handlers for the section named "ram".
 *
 * NOTE(review): the literal 4 is presumably the section version, matching
 * the "version_id != 4" check in ram_load() - confirm against the
 * register_savevm_live() prototype.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}