migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "migration/postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46
/* Debug tracing helper: when DEBUG_MIGRATION_RAM is defined, DPRINTF()
 * prints to stdout with a "migration_ram: " prefix; otherwise it expands
 * to an empty statement so call sites cost nothing.
 */
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
  54
/* Counts the auto-converge checks in migration_bitmap_sync() that found
 * the dirty rate too high; reset to 0 each time throttling is triggered.
 */
static int dirty_rate_high_cnt;

/* Incremented on every migration_bitmap_sync() call; also passed to the
 * XBZRLE page cache calls in this file.
 */
static uint64_t bitmap_sync_count;
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/* Flags OR-ed into the low bits of the 64-bit page offset that
 * save_page_header() writes to the stream.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02 /* zero page: header + a single 0 byte */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* header + raw TARGET_PAGE_SIZE bytes */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* same block as last sent: no block id */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* header + XBZRLE-encoded delta */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* header + compressed page data */
  71
/* One all-zero target page (static storage is zero-initialized); inserted
 * into the XBZRLE cache by xbzrle_cache_zero_page().
 */
static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy of the page being
     * encoded, taken in save_xbzrle_page()) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  93
  94 static void XBZRLE_cache_lock(void)
  95 {
  96     if (migrate_use_xbzrle())
  97         qemu_mutex_lock(&XBZRLE.lock);
  98 }
  99
 100 static void XBZRLE_cache_unlock(void)
 101 {
 102     if (migrate_use_xbzrle())
 103         qemu_mutex_unlock(&XBZRLE.lock);
 104 }
 105
 106 /*
 107  * called from qmp_migrate_set_cache_size in main thread, possibly while
 108  * a migration is in progress.
 109  * A running migration maybe using the cache and might finish during this
 110  * call, hence changes to the cache are protected by XBZRLE.lock().
 111  */
 112 int64_t xbzrle_cache_resize(int64_t new_size)
 113 {
 114     PageCache *new_cache;
 115     int64_t ret;
 116
 117     if (new_size < TARGET_PAGE_SIZE) {
 118         return -1;
 119     }
 120
 121     XBZRLE_cache_lock();
 122
 123     if (XBZRLE.cache != NULL) {
 124         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 125             goto out_new_size;
 126         }
 127         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 128                                         TARGET_PAGE_SIZE);
 129         if (!new_cache) {
 130             error_report("Error creating cache");
 131             ret = -1;
 132             goto out;
 133         }
 134
 135         cache_fini(XBZRLE.cache);
 136         XBZRLE.cache = new_cache;
 137     }
 138
 139 out_new_size:
 140     ret = pow2floor(new_size);
 141 out:
 142     XBZRLE_cache_unlock();
 143     return ret;
 144 }
 145
/* accounting for migration statistics */
typedef struct AccountingInfo {
    /* zero pages, plus control-path pages that reported 0 bytes sent */
    uint64_t dup_pages;
    uint64_t skipped_pages;
    /* pages sent in full (raw, compressed, or by the control path) */
    uint64_t norm_pages;
    uint64_t iterations;
    /* total bytes (header + payload) of XBZRLE-encoded pages */
    uint64_t xbzrle_bytes;
    /* number of pages sent XBZRLE-encoded */
    uint64_t xbzrle_pages;
    /* lookups in save_xbzrle_page() that did not find the page cached */
    uint64_t xbzrle_cache_miss;
    /* misses per iteration, recomputed in migration_bitmap_sync() */
    double xbzrle_cache_miss_rate;
    /* pages whose XBZRLE encoding did not fit and were sent in full */
    uint64_t xbzrle_overflows;
} AccountingInfo;

/* Statistics shared by all functions in this file */
static AccountingInfo acct_info;
 160
 161 static void acct_clear(void)
 162 {
 163     memset(&acct_info, 0, sizeof(acct_info));
 164 }
 165
 166 uint64_t dup_mig_bytes_transferred(void)
 167 {
 168     return acct_info.dup_pages * TARGET_PAGE_SIZE;
 169 }
 170
 171 uint64_t dup_mig_pages_transferred(void)
 172 {
 173     return acct_info.dup_pages;
 174 }
 175
 176 uint64_t skipped_mig_bytes_transferred(void)
 177 {
 178     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 179 }
 180
 181 uint64_t skipped_mig_pages_transferred(void)
 182 {
 183     return acct_info.skipped_pages;
 184 }
 185
 186 uint64_t norm_mig_bytes_transferred(void)
 187 {
 188     return acct_info.norm_pages * TARGET_PAGE_SIZE;
 189 }
 190
 191 uint64_t norm_mig_pages_transferred(void)
 192 {
 193     return acct_info.norm_pages;
 194 }
 195
 196 uint64_t xbzrle_mig_bytes_transferred(void)
 197 {
 198     return acct_info.xbzrle_bytes;
 199 }
 200
 201 uint64_t xbzrle_mig_pages_transferred(void)
 202 {
 203     return acct_info.xbzrle_pages;
 204 }
 205
 206 uint64_t xbzrle_mig_pages_cache_miss(void)
 207 {
 208     return acct_info.xbzrle_cache_miss;
 209 }
 210
 211 double xbzrle_mig_cache_miss_rate(void)
 212 {
 213     return acct_info.xbzrle_cache_miss_rate;
 214 }
 215
 216 uint64_t xbzrle_mig_pages_overflow(void)
 217 {
 218     return acct_info.xbzrle_overflows;
 219 }
 220
/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data; a page from this
 * block is sent with RAM_SAVE_FLAG_CONTINUE and no block id */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
/* Held by migration_bitmap_sync() while it updates the migration bitmap */
static QemuMutex migration_bitmap_mutex;
/* Number of bits currently set in the migration bitmap: increased by
 * migration_bitmap_sync_range(), decreased by migration_bitmap_clear_dirty() */
static uint64_t migration_dirty_pages;
static uint32_t last_version;
/* While set, migration_bitmap_find_dirty() skips the bitmap search past the
 * first page of a block and XBZRLE is not used */
static bool ram_bulk_stage;
 231
 232 /* used by the search for pages to send */
 233 struct PageSearchStatus {
 234     /* Current block being searched */
 235     RAMBlock    *block;
 236     /* Current offset to search from */
 237     ram_addr_t   offset;
 238     /* Set once we wrap around */
 239     bool         complete_round;
 240 };
 241 typedef struct PageSearchStatus PageSearchStatus;
 242
/* Migration bitmaps; the pointer is read with atomic_rcu_read() by the
 * bitmap helpers in this file.
 */
static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;
 254
 255 struct CompressParam {
 256     bool done;
 257     bool quit;
 258     QEMUFile *file;
 259     QemuMutex mutex;
 260     QemuCond cond;
 261     RAMBlock *block;
 262     ram_addr_t offset;
 263 };
 264 typedef struct CompressParam CompressParam;
 265
 266 struct DecompressParam {
 267     bool done;
 268     bool quit;
 269     QemuMutex mutex;
 270     QemuCond cond;
 271     void *des;
 272     uint8_t *compbuf;
 273     int len;
 274 };
 275 typedef struct DecompressParam DecompressParam;
 276
/* Arrays with one entry per compression thread, allocated in
 * migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Set to true by migrate_compress_threads_create() */
static bool compression_switch;
/* Decompression-side counterparts; their users are not in this part of
 * the file. */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
 293
/* Forward declaration: do_data_compress() below calls this before its
 * definition appears. */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 296
 297 static void *do_data_compress(void *opaque)
 298 {
 299     CompressParam *param = opaque;
 300     RAMBlock *block;
 301     ram_addr_t offset;
 302
 303     qemu_mutex_lock(&param->mutex);
 304     while (!param->quit) {
 305         if (param->block) {
 306             block = param->block;
 307             offset = param->offset;
 308             param->block = NULL;
 309             qemu_mutex_unlock(&param->mutex);
 310
 311             do_compress_ram_page(param->file, block, offset);
 312
 313             qemu_mutex_lock(&comp_done_lock);
 314             param->done = true;
 315             qemu_cond_signal(&comp_done_cond);
 316             qemu_mutex_unlock(&comp_done_lock);
 317
 318             qemu_mutex_lock(&param->mutex);
 319         } else {
 320             qemu_cond_wait(&param->cond, &param->mutex);
 321         }
 322     }
 323     qemu_mutex_unlock(&param->mutex);
 324
 325     return NULL;
 326 }
 327
 328 static inline void terminate_compression_threads(void)
 329 {
 330     int idx, thread_count;
 331
 332     thread_count = migrate_compress_threads();
 333     for (idx = 0; idx < thread_count; idx++) {
 334         qemu_mutex_lock(&comp_param[idx].mutex);
 335         comp_param[idx].quit = true;
 336         qemu_cond_signal(&comp_param[idx].cond);
 337         qemu_mutex_unlock(&comp_param[idx].mutex);
 338     }
 339 }
 340
 341 void migrate_compress_threads_join(void)
 342 {
 343     int i, thread_count;
 344
 345     if (!migrate_use_compression()) {
 346         return;
 347     }
 348     terminate_compression_threads();
 349     thread_count = migrate_compress_threads();
 350     for (i = 0; i < thread_count; i++) {
 351         qemu_thread_join(compress_threads + i);
 352         qemu_fclose(comp_param[i].file);
 353         qemu_mutex_destroy(&comp_param[i].mutex);
 354         qemu_cond_destroy(&comp_param[i].cond);
 355     }
 356     qemu_mutex_destroy(&comp_done_lock);
 357     qemu_cond_destroy(&comp_done_cond);
 358     g_free(compress_threads);
 359     g_free(comp_param);
 360     compress_threads = NULL;
 361     comp_param = NULL;
 362 }
 363
 364 void migrate_compress_threads_create(void)
 365 {
 366     int i, thread_count;
 367
 368     if (!migrate_use_compression()) {
 369         return;
 370     }
 371     compression_switch = true;
 372     thread_count = migrate_compress_threads();
 373     compress_threads = g_new0(QemuThread, thread_count);
 374     comp_param = g_new0(CompressParam, thread_count);
 375     qemu_cond_init(&comp_done_cond);
 376     qemu_mutex_init(&comp_done_lock);
 377     for (i = 0; i < thread_count; i++) {
 378         /* comp_param[i].file is just used as a dummy buffer to save data,
 379          * set its ops to empty.
 380          */
 381         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 382         comp_param[i].done = true;
 383         comp_param[i].quit = false;
 384         qemu_mutex_init(&comp_param[i].mutex);
 385         qemu_cond_init(&comp_param[i].cond);
 386         qemu_thread_create(compress_threads + i, "compress",
 387                            do_data_compress, comp_param + i,
 388                            QEMU_THREAD_JOINABLE);
 389     }
 390 }
 391
 392 /**
 393  * save_page_header: Write page header to wire
 394  *
 395  * If this is the 1st block, it also writes the block identification
 396  *
 397  * Returns: Number of bytes written
 398  *
 399  * @f: QEMUFile where to send the data
 400  * @block: block that contains the page we want to send
 401  * @offset: offset inside the block for the page
 402  *          in the lower bits, it contains flags
 403  */
 404 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 405 {
 406     size_t size, len;
 407
 408     qemu_put_be64(f, offset);
 409     size = 8;
 410
 411     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 412         len = strlen(block->idstr);
 413         qemu_put_byte(f, len);
 414         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 415         size += 1 + len;
 416     }
 417     return size;
 418 }
 419
 420 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 421  * If guest dirty memory rate is reduced below the rate at which we can
 422  * transfer pages to the destination then we should be able to complete
 423  * migration. Some workloads dirty memory way too fast and will not effectively
 424  * converge, even with auto-converge.
 425  */
 426 static void mig_throttle_guest_down(void)
 427 {
 428     MigrationState *s = migrate_get_current();
 429     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 430     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 431
 432     /* We have not started throttling yet. Let's start it. */
 433     if (!cpu_throttle_active()) {
 434         cpu_throttle_set(pct_initial);
 435     } else {
 436         /* Throttling already on, just increase the rate */
 437         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 438     }
 439 }
 440
 441 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
 442  * The important thing is that a stale (not-yet-0'd) page be replaced
 443  * by the new data.
 444  * As a bonus, if the page wasn't in the cache it gets added so that
 445  * when a small write is made into the 0'd page it gets XBZRLE sent
 446  */
 447 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 448 {
 449     if (ram_bulk_stage || !migrate_use_xbzrle()) {
 450         return;
 451     }
 452
 453     /* We don't care if this fails to allocate a new cache page
 454      * as long as it updated an old one */
 455     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 456                  bitmap_sync_count);
 457 }
 458
 459 #define ENCODING_FLAG_XBZRLE 0x1
 460
 461 /**
 462  * save_xbzrle_page: compress and send current page
 463  *
 464  * Returns: 1 means that we wrote the page
 465  *          0 means that page is identical to the one already sent
 466  *          -1 means that xbzrle would be longer than normal
 467  *
 468  * @f: QEMUFile where to send the data
 469  * @current_data:
 470  * @current_addr:
 471  * @block: block that contains the page we want to send
 472  * @offset: offset inside the block for the page
 473  * @last_stage: if we are at the completion stage
 474  * @bytes_transferred: increase it with the number of transferred bytes
 475  */
 476 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
 477                             ram_addr_t current_addr, RAMBlock *block,
 478                             ram_addr_t offset, bool last_stage,
 479                             uint64_t *bytes_transferred)
 480 {
 481     int encoded_len = 0, bytes_xbzrle;
 482     uint8_t *prev_cached_page;
 483
 484     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
 485         acct_info.xbzrle_cache_miss++;
 486         if (!last_stage) {
 487             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 488                              bitmap_sync_count) == -1) {
 489                 return -1;
 490             } else {
 491                 /* update *current_data when the page has been
 492                    inserted into cache */
 493                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 494             }
 495         }
 496         return -1;
 497     }
 498
 499     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 500
 501     /* save current buffer into memory */
 502     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 503
 504     /* XBZRLE encoding (if there is no overflow) */
 505     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 506                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 507                                        TARGET_PAGE_SIZE);
 508     if (encoded_len == 0) {
 509         DPRINTF("Skipping unmodified page\n");
 510         return 0;
 511     } else if (encoded_len == -1) {
 512         DPRINTF("Overflow\n");
 513         acct_info.xbzrle_overflows++;
 514         /* update data in the cache */
 515         if (!last_stage) {
 516             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 517             *current_data = prev_cached_page;
 518         }
 519         return -1;
 520     }
 521
 522     /* we need to update the data in the cache, in order to get the same data */
 523     if (!last_stage) {
 524         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 525     }
 526
 527     /* Send XBZRLE based compressed page */
 528     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 529     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 530     qemu_put_be16(f, encoded_len);
 531     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 532     bytes_xbzrle += encoded_len + 1 + 2;
 533     acct_info.xbzrle_pages++;
 534     acct_info.xbzrle_bytes += bytes_xbzrle;
 535     *bytes_transferred += bytes_xbzrle;
 536
 537     return 1;
 538 }
 539
 540 /* Called with rcu_read_lock() to protect migration_bitmap
 541  * rb: The RAMBlock  to search for dirty pages in
 542  * start: Start address (typically so we can continue from previous page)
 543  * ram_addr_abs: Pointer into which to store the address of the dirty page
 544  *               within the global ram_addr space
 545  *
 546  * Returns: byte offset within memory region of the start of a dirty page
 547  */
 548 static inline
 549 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 550                                        ram_addr_t start,
 551                                        ram_addr_t *ram_addr_abs)
 552 {
 553     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 554     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 555     uint64_t rb_size = rb->used_length;
 556     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 557     unsigned long *bitmap;
 558
 559     unsigned long next;
 560
 561     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 562     if (ram_bulk_stage && nr > base) {
 563         next = nr + 1;
 564     } else {
 565         next = find_next_bit(bitmap, size, nr);
 566     }
 567
 568     *ram_addr_abs = next << TARGET_PAGE_BITS;
 569     return (next - base) << TARGET_PAGE_BITS;
 570 }
 571
 572 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 573 {
 574     bool ret;
 575     int nr = addr >> TARGET_PAGE_BITS;
 576     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 577
 578     ret = test_and_clear_bit(nr, bitmap);
 579
 580     if (ret) {
 581         migration_dirty_pages--;
 582     }
 583     return ret;
 584 }
 585
 586 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 587 {
 588     unsigned long *bitmap;
 589     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 590     migration_dirty_pages +=
 591         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 592 }
 593
/* Fix me: there are too many global variables used in migration process. */
/* Bookkeeping for migration_bitmap_sync(), reset by
 * migration_bitmap_sync_init(). */
/* Start of the current measurement period, ms (QEMU_CLOCK_REALTIME) */
static int64_t start_time;
/* ram_bytes_transferred() as sampled at the previous auto-converge check */
static int64_t bytes_xfer_prev;
/* Pages found dirty since the current measurement period began */
static int64_t num_dirty_pages_period;
/* acct_info.xbzrle_cache_miss at the previous rate computation */
static uint64_t xbzrle_cache_miss_prev;
/* acct_info.iterations at the previous rate computation */
static uint64_t iterations_prev;
 600
 601 static void migration_bitmap_sync_init(void)
 602 {
 603     start_time = 0;
 604     bytes_xfer_prev = 0;
 605     num_dirty_pages_period = 0;
 606     xbzrle_cache_miss_prev = 0;
 607     iterations_prev = 0;
 608 }
 609
 610 static void migration_bitmap_sync(void)
 611 {
 612     RAMBlock *block;
 613     uint64_t num_dirty_pages_init = migration_dirty_pages;
 614     MigrationState *s = migrate_get_current();
 615     int64_t end_time;
 616     int64_t bytes_xfer_now;
 617
 618     bitmap_sync_count++;
 619
 620     if (!bytes_xfer_prev) {
 621         bytes_xfer_prev = ram_bytes_transferred();
 622     }
 623
 624     if (!start_time) {
 625         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 626     }
 627
 628     trace_migration_bitmap_sync_start();
 629     memory_global_dirty_log_sync();
 630
 631     qemu_mutex_lock(&migration_bitmap_mutex);
 632     rcu_read_lock();
 633     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 634         migration_bitmap_sync_range(block->offset, block->used_length);
 635     }
 636     rcu_read_unlock();
 637     qemu_mutex_unlock(&migration_bitmap_mutex);
 638
 639     trace_migration_bitmap_sync_end(migration_dirty_pages
 640                                     - num_dirty_pages_init);
 641     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 642     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 643
 644     /* more than 1 second = 1000 millisecons */
 645     if (end_time > start_time + 1000) {
 646         if (migrate_auto_converge()) {
 647             /* The following detection logic can be refined later. For now:
 648                Check to see if the dirtied bytes is 50% more than the approx.
 649                amount of bytes that just got transferred since the last time we
 650                were in this routine. If that happens twice, start or increase
 651                throttling */
 652             bytes_xfer_now = ram_bytes_transferred();
 653
 654             if (s->dirty_pages_rate &&
 655                (num_dirty_pages_period * TARGET_PAGE_SIZE >
 656                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
 657                (dirty_rate_high_cnt++ >= 2)) {
 658                     trace_migration_throttle();
 659                     dirty_rate_high_cnt = 0;
 660                     mig_throttle_guest_down();
 661              }
 662              bytes_xfer_prev = bytes_xfer_now;
 663         }
 664
 665         if (migrate_use_xbzrle()) {
 666             if (iterations_prev != acct_info.iterations) {
 667                 acct_info.xbzrle_cache_miss_rate =
 668                    (double)(acct_info.xbzrle_cache_miss -
 669                             xbzrle_cache_miss_prev) /
 670                    (acct_info.iterations - iterations_prev);
 671             }
 672             iterations_prev = acct_info.iterations;
 673             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 674         }
 675         s->dirty_pages_rate = num_dirty_pages_period * 1000
 676             / (end_time - start_time);
 677         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 678         start_time = end_time;
 679         num_dirty_pages_period = 0;
 680     }
 681     s->dirty_sync_count = bitmap_sync_count;
 682     if (migrate_use_events()) {
 683         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 684     }
 685 }
 686
 687 /**
 688  * save_zero_page: Send the zero page to the stream
 689  *
 690  * Returns: Number of pages written.
 691  *
 692  * @f: QEMUFile where to send the data
 693  * @block: block that contains the page we want to send
 694  * @offset: offset inside the block for the page
 695  * @p: pointer to the page
 696  * @bytes_transferred: increase it with the number of transferred bytes
 697  */
 698 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 699                           uint8_t *p, uint64_t *bytes_transferred)
 700 {
 701     int pages = -1;
 702
 703     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 704         acct_info.dup_pages++;
 705         *bytes_transferred += save_page_header(f, block,
 706                                                offset | RAM_SAVE_FLAG_COMPRESS);
 707         qemu_put_byte(f, 0);
 708         *bytes_transferred += 1;
 709         pages = 1;
 710     }
 711
 712     return pages;
 713 }
 714
 715 /**
 716  * ram_save_page: Send the given page to the stream
 717  *
 718  * Returns: Number of pages written.
 719  *          < 0 - error
 720  *          >=0 - Number of pages written - this might legally be 0
 721  *                if xbzrle noticed the page was the same.
 722  *
 723  * @f: QEMUFile where to send the data
 724  * @block: block that contains the page we want to send
 725  * @offset: offset inside the block for the page
 726  * @last_stage: if we are at the completion stage
 727  * @bytes_transferred: increase it with the number of transferred bytes
 728  */
 729 static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
 730                          bool last_stage, uint64_t *bytes_transferred)
 731 {
 732     int pages = -1;
 733     uint64_t bytes_xmit;
 734     ram_addr_t current_addr;
 735     uint8_t *p;
 736     int ret;
 737     bool send_async = true;
 738     RAMBlock *block = pss->block;
 739     ram_addr_t offset = pss->offset;
 740
 741     p = block->host + offset;
 742
 743     /* In doubt sent page as normal */
 744     bytes_xmit = 0;
 745     ret = ram_control_save_page(f, block->offset,
 746                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 747     if (bytes_xmit) {
 748         *bytes_transferred += bytes_xmit;
 749         pages = 1;
 750     }
 751
 752     XBZRLE_cache_lock();
 753
 754     current_addr = block->offset + offset;
 755
 756     if (block == last_sent_block) {
 757         offset |= RAM_SAVE_FLAG_CONTINUE;
 758     }
 759     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 760         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 761             if (bytes_xmit > 0) {
 762                 acct_info.norm_pages++;
 763             } else if (bytes_xmit == 0) {
 764                 acct_info.dup_pages++;
 765             }
 766         }
 767     } else {
 768         pages = save_zero_page(f, block, offset, p, bytes_transferred);
 769         if (pages > 0) {
 770             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 771              * page would be stale
 772              */
 773             xbzrle_cache_zero_page(current_addr);
 774         } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
 775             pages = save_xbzrle_page(f, &p, current_addr, block,
 776                                      offset, last_stage, bytes_transferred);
 777             if (!last_stage) {
 778                 /* Can't send this cached data async, since the cache page
 779                  * might get updated before it gets to the wire
 780                  */
 781                 send_async = false;
 782             }
 783         }
 784     }
 785
 786     /* XBZRLE overflow or normal page */
 787     if (pages == -1) {
 788         *bytes_transferred += save_page_header(f, block,
 789                                                offset | RAM_SAVE_FLAG_PAGE);
 790         if (send_async) {
 791             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
 792         } else {
 793             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 794         }
 795         *bytes_transferred += TARGET_PAGE_SIZE;
 796         pages = 1;
 797         acct_info.norm_pages++;
 798     }
 799
 800     XBZRLE_cache_unlock();
 801
 802     return pages;
 803 }
 804
 805 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 806                                 ram_addr_t offset)
 807 {
 808     int bytes_sent, blen;
 809     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 810
 811     bytes_sent = save_page_header(f, block, offset |
 812                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 813     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 814                                      migrate_compress_level());
 815     if (blen < 0) {
 816         bytes_sent = 0;
 817         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 818         error_report("compressed data failed!");
 819     } else {
 820         bytes_sent += blen;
 821     }
 822
 823     return bytes_sent;
 824 }
 825
/* File-wide byte counter; flush_compressed_data() adds the bytes it
 * flushes here.  Note that several functions take a pointer parameter
 * of the same name. */
static uint64_t bytes_transferred;
 827
 828 static void flush_compressed_data(QEMUFile *f)
 829 {
 830     int idx, len, thread_count;
 831
 832     if (!migrate_use_compression()) {
 833         return;
 834     }
 835     thread_count = migrate_compress_threads();
 836
 837     qemu_mutex_lock(&comp_done_lock);
 838     for (idx = 0; idx < thread_count; idx++) {
 839         while (!comp_param[idx].done) {
 840             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 841         }
 842     }
 843     qemu_mutex_unlock(&comp_done_lock);
 844
 845     for (idx = 0; idx < thread_count; idx++) {
 846         qemu_mutex_lock(&comp_param[idx].mutex);
 847         if (!comp_param[idx].quit) {
 848             len = qemu_put_qemu_file(f, comp_param[idx].file);
 849             bytes_transferred += len;
 850         }
 851         qemu_mutex_unlock(&comp_param[idx].mutex);
 852     }
 853 }
 854
 855 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 856                                        ram_addr_t offset)
 857 {
 858     param->block = block;
 859     param->offset = offset;
 860 }
 861
 862 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 863                                            ram_addr_t offset,
 864                                            uint64_t *bytes_transferred)
 865 {
 866     int idx, thread_count, bytes_xmit = -1, pages = -1;
 867
 868     thread_count = migrate_compress_threads();
 869     qemu_mutex_lock(&comp_done_lock);
 870     while (true) {
 871         for (idx = 0; idx < thread_count; idx++) {
 872             if (comp_param[idx].done) {
 873                 comp_param[idx].done = false;
 874                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 875                 qemu_mutex_lock(&comp_param[idx].mutex);
 876                 set_compress_params(&comp_param[idx], block, offset);
 877                 qemu_cond_signal(&comp_param[idx].cond);
 878                 qemu_mutex_unlock(&comp_param[idx].mutex);
 879                 pages = 1;
 880                 acct_info.norm_pages++;
 881                 *bytes_transferred += bytes_xmit;
 882                 break;
 883             }
 884         }
 885         if (pages > 0) {
 886             break;
 887         } else {
 888             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 889         }
 890     }
 891     qemu_mutex_unlock(&comp_done_lock);
 892
 893     return pages;
 894 }
 895
 896 /**
 897  * ram_save_compressed_page: compress the given page and send it to the stream
 898  *
 899  * Returns: Number of pages written.
 900  *
 901  * @f: QEMUFile where to send the data
 902  * @block: block that contains the page we want to send
 903  * @offset: offset inside the block for the page
 904  * @last_stage: if we are at the completion stage
 905  * @bytes_transferred: increase it with the number of transferred bytes
 906  */
 907 static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
 908                                     bool last_stage,
 909                                     uint64_t *bytes_transferred)
 910 {
 911     int pages = -1;
 912     uint64_t bytes_xmit = 0;
 913     uint8_t *p;
 914     int ret, blen;
 915     RAMBlock *block = pss->block;
 916     ram_addr_t offset = pss->offset;
 917
 918     p = block->host + offset;
 919
 920     ret = ram_control_save_page(f, block->offset,
 921                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 922     if (bytes_xmit) {
 923         *bytes_transferred += bytes_xmit;
 924         pages = 1;
 925     }
 926     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 927         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 928             if (bytes_xmit > 0) {
 929                 acct_info.norm_pages++;
 930             } else if (bytes_xmit == 0) {
 931                 acct_info.dup_pages++;
 932             }
 933         }
 934     } else {
 935         /* When starting the process of a new block, the first page of
 936          * the block should be sent out before other pages in the same
 937          * block, and all the pages in last block should have been sent
 938          * out, keeping this order is important, because the 'cont' flag
 939          * is used to avoid resending the block name.
 940          */
 941         if (block != last_sent_block) {
 942             flush_compressed_data(f);
 943             pages = save_zero_page(f, block, offset, p, bytes_transferred);
 944             if (pages == -1) {
 945                 /* Make sure the first page is sent out before other pages */
 946                 bytes_xmit = save_page_header(f, block, offset |
 947                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
 948                 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 949                                                  migrate_compress_level());
 950                 if (blen > 0) {
 951                     *bytes_transferred += bytes_xmit + blen;
 952                     acct_info.norm_pages++;
 953                     pages = 1;
 954                 } else {
 955                     qemu_file_set_error(f, blen);
 956                     error_report("compressed data failed!");
 957                 }
 958             }
 959         } else {
 960             offset |= RAM_SAVE_FLAG_CONTINUE;
 961             pages = save_zero_page(f, block, offset, p, bytes_transferred);
 962             if (pages == -1) {
 963                 pages = compress_page_with_multi_thread(f, block, offset,
 964                                                         bytes_transferred);
 965             }
 966         }
 967     }
 968
 969     return pages;
 970 }
 971
 972 /*
 973  * Find the next dirty page and update any state associated with
 974  * the search process.
 975  *
 976  * Returns: True if a page is found
 977  *
 978  * @f: Current migration stream.
 979  * @pss: Data about the state of the current dirty page scan.
 980  * @*again: Set to false if the search has scanned the whole of RAM
 981  * *ram_addr_abs: Pointer into which to store the address of the dirty page
 982  *               within the global ram_addr space
 983  */
 984 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
 985                              bool *again, ram_addr_t *ram_addr_abs)
 986 {
 987     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
 988                                               ram_addr_abs);
 989     if (pss->complete_round && pss->block == last_seen_block &&
 990         pss->offset >= last_offset) {
 991         /*
 992          * We've been once around the RAM and haven't found anything.
 993          * Give up.
 994          */
 995         *again = false;
 996         return false;
 997     }
 998     if (pss->offset >= pss->block->used_length) {
 999         /* Didn't find anything in this RAM Block */
1000         pss->offset = 0;
1001         pss->block = QLIST_NEXT_RCU(pss->block, next);
1002         if (!pss->block) {
1003             /* Hit the end of the list */
1004             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1005             /* Flag that we've looped */
1006             pss->complete_round = true;
1007             ram_bulk_stage = false;
1008             if (migrate_use_xbzrle()) {
1009                 /* If xbzrle is on, stop using the data compression at this
1010                  * point. In theory, xbzrle can do better than compression.
1011                  */
1012                 flush_compressed_data(f);
1013                 compression_switch = false;
1014             }
1015         }
1016         /* Didn't find anything this time, but try again on the new block */
1017         *again = true;
1018         return false;
1019     } else {
1020         /* Can go around again, but... */
1021         *again = true;
1022         /* We've found something so probably don't need to */
1023         return true;
1024     }
1025 }
1026
1027 /*
1028  * Helper for 'get_queued_page' - gets a page off the queue
1029  *      ms:      MigrationState in
1030  * *offset:      Used to return the offset within the RAMBlock
1031  * ram_addr_abs: global offset in the dirty/sent bitmaps
1032  *
1033  * Returns:      block (or NULL if none available)
1034  */
1035 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1036                               ram_addr_t *ram_addr_abs)
1037 {
1038     RAMBlock *block = NULL;
1039
1040     qemu_mutex_lock(&ms->src_page_req_mutex);
1041     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1042         struct MigrationSrcPageRequest *entry =
1043                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1044         block = entry->rb;
1045         *offset = entry->offset;
1046         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1047                         TARGET_PAGE_MASK;
1048
1049         if (entry->len > TARGET_PAGE_SIZE) {
1050             entry->len -= TARGET_PAGE_SIZE;
1051             entry->offset += TARGET_PAGE_SIZE;
1052         } else {
1053             memory_region_unref(block->mr);
1054             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1055             g_free(entry);
1056         }
1057     }
1058     qemu_mutex_unlock(&ms->src_page_req_mutex);
1059
1060     return block;
1061 }
1062
1063 /*
1064  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1065  * that are already sent (!dirty)
1066  *
1067  *      ms:      MigrationState in
1068  *     pss:      PageSearchStatus structure updated with found block/offset
1069  * ram_addr_abs: global offset in the dirty/sent bitmaps
1070  *
1071  * Returns:      true if a queued page is found
1072  */
1073 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1074                             ram_addr_t *ram_addr_abs)
1075 {
1076     RAMBlock  *block;
1077     ram_addr_t offset;
1078     bool dirty;
1079
1080     do {
1081         block = unqueue_page(ms, &offset, ram_addr_abs);
1082         /*
1083          * We're sending this page, and since it's postcopy nothing else
1084          * will dirty it, and we must make sure it doesn't get sent again
1085          * even if this queue request was received after the background
1086          * search already sent it.
1087          */
1088         if (block) {
1089             unsigned long *bitmap;
1090             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1091             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1092             if (!dirty) {
1093                 trace_get_queued_page_not_dirty(
1094                     block->idstr, (uint64_t)offset,
1095                     (uint64_t)*ram_addr_abs,
1096                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1097                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1098             } else {
1099                 trace_get_queued_page(block->idstr,
1100                                       (uint64_t)offset,
1101                                       (uint64_t)*ram_addr_abs);
1102             }
1103         }
1104
1105     } while (block && !dirty);
1106
1107     if (block) {
1108         /*
1109          * As soon as we start servicing pages out of order, then we have
1110          * to kill the bulk stage, since the bulk stage assumes
1111          * in (migration_bitmap_find_and_reset_dirty) that every page is
1112          * dirty, that's no longer true.
1113          */
1114         ram_bulk_stage = false;
1115
1116         /*
1117          * We want the background search to continue from the queued page
1118          * since the guest is likely to want other pages near to the page
1119          * it just requested.
1120          */
1121         pss->block = block;
1122         pss->offset = offset;
1123     }
1124
1125     return !!block;
1126 }
1127
1128 /**
1129  * flush_page_queue: Flush any remaining pages in the ram request queue
1130  *    it should be empty at the end anyway, but in error cases there may be
1131  *    some left.
1132  *
1133  * ms: MigrationState
1134  */
1135 void flush_page_queue(MigrationState *ms)
1136 {
1137     struct MigrationSrcPageRequest *mspr, *next_mspr;
1138     /* This queue generally should be empty - but in the case of a failed
1139      * migration might have some droppings in.
1140      */
1141     rcu_read_lock();
1142     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1143         memory_region_unref(mspr->rb->mr);
1144         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1145         g_free(mspr);
1146     }
1147     rcu_read_unlock();
1148 }
1149
1150 /**
1151  * Queue the pages for transmission, e.g. a request from postcopy destination
1152  *   ms: MigrationStatus in which the queue is held
1153  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1154  *   start: Offset from the start of the RAMBlock
1155  *   len: Length (in bytes) to send
1156  *   Return: 0 on success
1157  */
1158 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1159                          ram_addr_t start, ram_addr_t len)
1160 {
1161     RAMBlock *ramblock;
1162
1163     ms->postcopy_requests++;
1164     rcu_read_lock();
1165     if (!rbname) {
1166         /* Reuse last RAMBlock */
1167         ramblock = ms->last_req_rb;
1168
1169         if (!ramblock) {
1170             /*
1171              * Shouldn't happen, we can't reuse the last RAMBlock if
1172              * it's the 1st request.
1173              */
1174             error_report("ram_save_queue_pages no previous block");
1175             goto err;
1176         }
1177     } else {
1178         ramblock = qemu_ram_block_by_name(rbname);
1179
1180         if (!ramblock) {
1181             /* We shouldn't be asked for a non-existent RAMBlock */
1182             error_report("ram_save_queue_pages no block '%s'", rbname);
1183             goto err;
1184         }
1185         ms->last_req_rb = ramblock;
1186     }
1187     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1188     if (start+len > ramblock->used_length) {
1189         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1190                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1191                      __func__, start, len, ramblock->used_length);
1192         goto err;
1193     }
1194
1195     struct MigrationSrcPageRequest *new_entry =
1196         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1197     new_entry->rb = ramblock;
1198     new_entry->offset = start;
1199     new_entry->len = len;
1200
1201     memory_region_ref(ramblock->mr);
1202     qemu_mutex_lock(&ms->src_page_req_mutex);
1203     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1204     qemu_mutex_unlock(&ms->src_page_req_mutex);
1205     rcu_read_unlock();
1206
1207     return 0;
1208
1209 err:
1210     rcu_read_unlock();
1211     return -1;
1212 }
1213
1214 /**
1215  * ram_save_target_page: Save one target page
1216  *
1217  *
1218  * @f: QEMUFile where to send the data
1219  * @block: pointer to block that contains the page we want to send
1220  * @offset: offset inside the block for the page;
1221  * @last_stage: if we are at the completion stage
1222  * @bytes_transferred: increase it with the number of transferred bytes
1223  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1224  *
1225  * Returns: Number of pages written.
1226  */
1227 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1228                                 PageSearchStatus *pss,
1229                                 bool last_stage,
1230                                 uint64_t *bytes_transferred,
1231                                 ram_addr_t dirty_ram_abs)
1232 {
1233     int res = 0;
1234
1235     /* Check the pages is dirty and if it is send it */
1236     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1237         unsigned long *unsentmap;
1238         if (compression_switch && migrate_use_compression()) {
1239             res = ram_save_compressed_page(f, pss,
1240                                            last_stage,
1241                                            bytes_transferred);
1242         } else {
1243             res = ram_save_page(f, pss, last_stage,
1244                                 bytes_transferred);
1245         }
1246
1247         if (res < 0) {
1248             return res;
1249         }
1250         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1251         if (unsentmap) {
1252             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1253         }
1254         /* Only update last_sent_block if a block was actually sent; xbzrle
1255          * might have decided the page was identical so didn't bother writing
1256          * to the stream.
1257          */
1258         if (res > 0) {
1259             last_sent_block = pss->block;
1260         }
1261     }
1262
1263     return res;
1264 }
1265
1266 /**
1267  * ram_save_host_page: Starting at *offset send pages up to the end
1268  *                     of the current host page.  It's valid for the initial
1269  *                     offset to point into the middle of a host page
1270  *                     in which case the remainder of the hostpage is sent.
1271  *                     Only dirty target pages are sent.
1272  *
1273  * Returns: Number of pages written.
1274  *
1275  * @f: QEMUFile where to send the data
1276  * @block: pointer to block that contains the page we want to send
1277  * @offset: offset inside the block for the page; updated to last target page
1278  *          sent
1279  * @last_stage: if we are at the completion stage
1280  * @bytes_transferred: increase it with the number of transferred bytes
1281  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1282  */
1283 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1284                               PageSearchStatus *pss,
1285                               bool last_stage,
1286                               uint64_t *bytes_transferred,
1287                               ram_addr_t dirty_ram_abs)
1288 {
1289     int tmppages, pages = 0;
1290     do {
1291         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1292                                         bytes_transferred, dirty_ram_abs);
1293         if (tmppages < 0) {
1294             return tmppages;
1295         }
1296
1297         pages += tmppages;
1298         pss->offset += TARGET_PAGE_SIZE;
1299         dirty_ram_abs += TARGET_PAGE_SIZE;
1300     } while (pss->offset & (qemu_host_page_size - 1));
1301
1302     /* The offset we leave with is the last one we looked at */
1303     pss->offset -= TARGET_PAGE_SIZE;
1304     return pages;
1305 }
1306
1307 /**
1308  * ram_find_and_save_block: Finds a dirty page and sends it to f
1309  *
1310  * Called within an RCU critical section.
1311  *
1312  * Returns:  The number of pages written
1313  *           0 means no dirty pages
1314  *
1315  * @f: QEMUFile where to send the data
1316  * @last_stage: if we are at the completion stage
1317  * @bytes_transferred: increase it with the number of transferred bytes
1318  *
1319  * On systems where host-page-size > target-page-size it will send all the
1320  * pages in a host page that are dirty.
1321  */
1322
1323 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1324                                    uint64_t *bytes_transferred)
1325 {
1326     PageSearchStatus pss;
1327     MigrationState *ms = migrate_get_current();
1328     int pages = 0;
1329     bool again, found;
1330     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1331                                  ram_addr_t space */
1332
1333     pss.block = last_seen_block;
1334     pss.offset = last_offset;
1335     pss.complete_round = false;
1336
1337     if (!pss.block) {
1338         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1339     }
1340
1341     do {
1342         again = true;
1343         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1344
1345         if (!found) {
1346             /* priority queue empty, so just search for something dirty */
1347             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1348         }
1349
1350         if (found) {
1351             pages = ram_save_host_page(ms, f, &pss,
1352                                        last_stage, bytes_transferred,
1353                                        dirty_ram_abs);
1354         }
1355     } while (!pages && again);
1356
1357     last_seen_block = pss.block;
1358     last_offset = pss.offset;
1359
1360     return pages;
1361 }
1362
1363 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1364 {
1365     uint64_t pages = size / TARGET_PAGE_SIZE;
1366     if (zero) {
1367         acct_info.dup_pages += pages;
1368     } else {
1369         acct_info.norm_pages += pages;
1370         bytes_transferred += size;
1371         qemu_update_position(f, size);
1372     }
1373 }
1374
1375 static ram_addr_t ram_save_remaining(void)
1376 {
1377     return migration_dirty_pages;
1378 }
1379
1380 uint64_t ram_bytes_remaining(void)
1381 {
1382     return ram_save_remaining() * TARGET_PAGE_SIZE;
1383 }
1384
1385 uint64_t ram_bytes_transferred(void)
1386 {
1387     return bytes_transferred;
1388 }
1389
1390 uint64_t ram_bytes_total(void)
1391 {
1392     RAMBlock *block;
1393     uint64_t total = 0;
1394
1395     rcu_read_lock();
1396     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1397         total += block->used_length;
1398     rcu_read_unlock();
1399     return total;
1400 }
1401
1402 void free_xbzrle_decoded_buf(void)
1403 {
1404     g_free(xbzrle_decoded_buf);
1405     xbzrle_decoded_buf = NULL;
1406 }
1407
1408 static void migration_bitmap_free(struct BitmapRcu *bmap)
1409 {
1410     g_free(bmap->bmap);
1411     g_free(bmap->unsentmap);
1412     g_free(bmap);
1413 }
1414
1415 static void ram_migration_cleanup(void *opaque)
1416 {
1417     /* caller have hold iothread lock or is in a bh, so there is
1418      * no writing race against this migration_bitmap
1419      */
1420     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1421     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1422     if (bitmap) {
1423         memory_global_dirty_log_stop();
1424         call_rcu(bitmap, migration_bitmap_free, rcu);
1425     }
1426
1427     XBZRLE_cache_lock();
1428     if (XBZRLE.cache) {
1429         cache_fini(XBZRLE.cache);
1430         g_free(XBZRLE.encoded_buf);
1431         g_free(XBZRLE.current_buf);
1432         XBZRLE.cache = NULL;
1433         XBZRLE.encoded_buf = NULL;
1434         XBZRLE.current_buf = NULL;
1435     }
1436     XBZRLE_cache_unlock();
1437 }
1438
1439 static void reset_ram_globals(void)
1440 {
1441     last_seen_block = NULL;
1442     last_sent_block = NULL;
1443     last_offset = 0;
1444     last_version = ram_list.version;
1445     ram_bulk_stage = true;
1446 }
1447
1448 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1449
1450 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1451 {
1452     /* called in qemu main thread, so there is
1453      * no writing race against this migration_bitmap
1454      */
1455     if (migration_bitmap_rcu) {
1456         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1457         bitmap = g_new(struct BitmapRcu, 1);
1458         bitmap->bmap = bitmap_new(new);
1459
1460         /* prevent migration_bitmap content from being set bit
1461          * by migration_bitmap_sync_range() at the same time.
1462          * it is safe to migration if migration_bitmap is cleared bit
1463          * at the same time.
1464          */
1465         qemu_mutex_lock(&migration_bitmap_mutex);
1466         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1467         bitmap_set(bitmap->bmap, old, new - old);
1468
1469         /* We don't have a way to safely extend the sentmap
1470          * with RCU; so mark it as missing, entry to postcopy
1471          * will fail.
1472          */
1473         bitmap->unsentmap = NULL;
1474
1475         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1476         qemu_mutex_unlock(&migration_bitmap_mutex);
1477         migration_dirty_pages += new - old;
1478         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1479     }
1480 }
1481
1482 /*
1483  * 'expected' is the value you expect the bitmap mostly to be full
1484  * of; it won't bother printing lines that are all this value.
1485  * If 'todump' is null the migration bitmap is dumped.
1486  */
1487 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1488 {
1489     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1490
1491     int64_t cur;
1492     int64_t linelen = 128;
1493     char linebuf[129];
1494
1495     if (!todump) {
1496         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1497     }
1498
1499     for (cur = 0; cur < ram_pages; cur += linelen) {
1500         int64_t curb;
1501         bool found = false;
1502         /*
1503          * Last line; catch the case where the line length
1504          * is longer than remaining ram
1505          */
1506         if (cur + linelen > ram_pages) {
1507             linelen = ram_pages - cur;
1508         }
1509         for (curb = 0; curb < linelen; curb++) {
1510             bool thisbit = test_bit(cur + curb, todump);
1511             linebuf[curb] = thisbit ? '1' : '.';
1512             found = found || (thisbit != expected);
1513         }
1514         if (found) {
1515             linebuf[curb] = '\0';
1516             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1517         }
1518     }
1519 }
1520
1521 /* **** functions for postcopy ***** */
1522
1523 /*
1524  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1525  * Note: At this point the 'unsentmap' is the processed bitmap combined
1526  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1527  * start,length: Indexes into the bitmap for the first bit
1528  *            representing the named block and length in target-pages
1529  */
1530 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1531                                         PostcopyDiscardState *pds,
1532                                         unsigned long start,
1533                                         unsigned long length)
1534 {
1535     unsigned long end = start + length; /* one after the end */
1536     unsigned long current;
1537     unsigned long *unsentmap;
1538
1539     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1540     for (current = start; current < end; ) {
1541         unsigned long one = find_next_bit(unsentmap, end, current);
1542
1543         if (one <= end) {
1544             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1545             unsigned long discard_length;
1546
1547             if (zero >= end) {
1548                 discard_length = end - one;
1549             } else {
1550                 discard_length = zero - one;
1551             }
1552             if (discard_length) {
1553                 postcopy_discard_send_range(ms, pds, one, discard_length);
1554             }
1555             current = one + discard_length;
1556         } else {
1557             current = one;
1558         }
1559     }
1560
1561     return 0;
1562 }
1563
1564 /*
1565  * Utility for the outgoing postcopy code.
1566  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1567  *   passing it bitmap indexes and name.
1568  * Returns: 0 on success
1569  * (qemu_ram_foreach_block ends up passing unscaled lengths
1570  *  which would mean postcopy code would have to deal with target page)
1571  */
1572 static int postcopy_each_ram_send_discard(MigrationState *ms)
1573 {
1574     struct RAMBlock *block;
1575     int ret;
1576
1577     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1578         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1579         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1580                                                                first,
1581                                                                block->idstr);
1582
1583         /*
1584          * Postcopy sends chunks of bitmap over the wire, but it
1585          * just needs indexes at this point, avoids it having
1586          * target page specific code.
1587          */
1588         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1589                                     block->used_length >> TARGET_PAGE_BITS);
1590         postcopy_discard_send_finish(ms, pds);
1591         if (ret) {
1592             return ret;
1593         }
1594     }
1595
1596     return 0;
1597 }
1598
/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 *   the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 * unsent_pass=true: Cleans up partially unsent host pages by searching
 *                 the unsentmap
 * unsent_pass=false: Cleans up partially dirty host pages by searching
 *                 the main migration bitmap
 *
 * ms, pds: passed through to postcopy_discard_send_range
 * block:   RAMBlock whose bit range [offset, offset + used_length), in
 *          target pages, is scanned
 *
 * For every host page that needs fixing up, all of its target pages are
 * marked unsent and dirty, and migration_dirty_pages is increased for
 * each page that was not dirty before.
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    /* Number of target pages per host page */
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    /* Bit index of the block's first target page */
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    /*
     * Bit index of the block's last target page.
     * NOTE(review): if len could be 0 this would wrap - confirm that
     * blocks always have a non-zero used_length here.
     */
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    /* run_start moves past 'last' once no further run is found */
    while (run_start <= last) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the start of the containing host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        /* fixup_start_addr is only valid when do_fixup was set above */
        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}
1721
1722 /*
1723  * Utility for the outgoing postcopy code.
1724  *
1725  * Discard any partially sent host-page size chunks, mark any partially
1726  * dirty host-page size chunks as all dirty.
1727  *
1728  * Returns: 0 on success
1729  */
1730 static int postcopy_chunk_hostpages(MigrationState *ms)
1731 {
1732     struct RAMBlock *block;
1733
1734     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1735         /* Easy case - TPS==HPS - nothing to be done */
1736         return 0;
1737     }
1738
1739     /* Easiest way to make sure we don't resume in the middle of a host-page */
1740     last_seen_block = NULL;
1741     last_sent_block = NULL;
1742     last_offset     = 0;
1743
1744     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1745         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1746
1747         PostcopyDiscardState *pds =
1748                          postcopy_discard_send_init(ms, first, block->idstr);
1749
1750         /* First pass: Discard all partially sent host pages */
1751         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1752         /*
1753          * Second pass: Ensure that all partially dirty host pages are made
1754          * fully dirty.
1755          */
1756         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1757
1758         postcopy_discard_send_finish(ms, pds);
1759     } /* ram_list loop */
1760
1761     return 0;
1762 }
1763
1764 /*
1765  * Transmit the set of pages to be discarded after precopy to the target
1766  * these are pages that:
1767  *     a) Have been previously transmitted but are now dirty again
1768  *     b) Pages that have never been transmitted, this ensures that
1769  *        any pages on the destination that have been mapped by background
1770  *        tasks get discarded (transparent huge pages is the specific concern)
1771  * Hopefully this is pretty sparse
1772  */
1773 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1774 {
1775     int ret;
1776     unsigned long *bitmap, *unsentmap;
1777
1778     rcu_read_lock();
1779
1780     /* This should be our last sync, the src is now paused */
1781     migration_bitmap_sync();
1782
1783     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1784     if (!unsentmap) {
1785         /* We don't have a safe way to resize the sentmap, so
1786          * if the bitmap was resized it will be NULL at this
1787          * point.
1788          */
1789         error_report("migration ram resized during precopy phase");
1790         rcu_read_unlock();
1791         return -EINVAL;
1792     }
1793
1794     /* Deal with TPS != HPS */
1795     ret = postcopy_chunk_hostpages(ms);
1796     if (ret) {
1797         rcu_read_unlock();
1798         return ret;
1799     }
1800
1801     /*
1802      * Update the unsentmap to be unsentmap = unsentmap | dirty
1803      */
1804     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1805     bitmap_or(unsentmap, unsentmap, bitmap,
1806                last_ram_offset() >> TARGET_PAGE_BITS);
1807
1808
1809     trace_ram_postcopy_send_discard_bitmap();
1810 #ifdef DEBUG_POSTCOPY
1811     ram_debug_dump_bitmap(unsentmap, true);
1812 #endif
1813
1814     ret = postcopy_each_ram_send_discard(ms);
1815     rcu_read_unlock();
1816
1817     return ret;
1818 }
1819
1820 /*
1821  * At the start of the postcopy phase of migration, any now-dirty
1822  * precopied pages are discarded.
1823  *
1824  * start, length describe a byte address range within the RAMBlock
1825  *
1826  * Returns 0 on success.
1827  */
1828 int ram_discard_range(MigrationIncomingState *mis,
1829                       const char *block_name,
1830                       uint64_t start, size_t length)
1831 {
1832     int ret = -1;
1833
1834     rcu_read_lock();
1835     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1836
1837     if (!rb) {
1838         error_report("ram_discard_range: Failed to find block '%s'",
1839                      block_name);
1840         goto err;
1841     }
1842
1843     uint8_t *host_startaddr = rb->host + start;
1844
1845     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1846         error_report("ram_discard_range: Unaligned start address: %p",
1847                      host_startaddr);
1848         goto err;
1849     }
1850
1851     if ((start + length) <= rb->used_length) {
1852         uint8_t *host_endaddr = host_startaddr + length;
1853         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1854             error_report("ram_discard_range: Unaligned end address: %p",
1855                          host_endaddr);
1856             goto err;
1857         }
1858         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1859     } else {
1860         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1861                      "/%zx/" RAM_ADDR_FMT")",
1862                      block_name, start, length, rb->used_length);
1863     }
1864
1865 err:
1866     rcu_read_unlock();
1867
1868     return ret;
1869 }
1870
1871
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
1877
1878 static int ram_save_setup(QEMUFile *f, void *opaque)
1879 {
1880     RAMBlock *block;
1881     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1882
1883     dirty_rate_high_cnt = 0;
1884     bitmap_sync_count = 0;
1885     migration_bitmap_sync_init();
1886     qemu_mutex_init(&migration_bitmap_mutex);
1887
1888     if (migrate_use_xbzrle()) {
1889         XBZRLE_cache_lock();
1890         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1891                                   TARGET_PAGE_SIZE,
1892                                   TARGET_PAGE_SIZE);
1893         if (!XBZRLE.cache) {
1894             XBZRLE_cache_unlock();
1895             error_report("Error creating cache");
1896             return -1;
1897         }
1898         XBZRLE_cache_unlock();
1899
1900         /* We prefer not to abort if there is no memory */
1901         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1902         if (!XBZRLE.encoded_buf) {
1903             error_report("Error allocating encoded_buf");
1904             return -1;
1905         }
1906
1907         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1908         if (!XBZRLE.current_buf) {
1909             error_report("Error allocating current_buf");
1910             g_free(XBZRLE.encoded_buf);
1911             XBZRLE.encoded_buf = NULL;
1912             return -1;
1913         }
1914
1915         acct_clear();
1916     }
1917
1918     /* For memory_global_dirty_log_start below.  */
1919     qemu_mutex_lock_iothread();
1920
1921     qemu_mutex_lock_ramlist();
1922     rcu_read_lock();
1923     bytes_transferred = 0;
1924     reset_ram_globals();
1925
1926     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1927     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1928     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1929     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1930
1931     if (migrate_postcopy_ram()) {
1932         migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1933         bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1934     }
1935
1936     /*
1937      * Count the total number of pages used by ram blocks not including any
1938      * gaps due to alignment or unplugs.
1939      */
1940     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1941
1942     memory_global_dirty_log_start();
1943     migration_bitmap_sync();
1944     qemu_mutex_unlock_ramlist();
1945     qemu_mutex_unlock_iothread();
1946
1947     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1948
1949     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1950         qemu_put_byte(f, strlen(block->idstr));
1951         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1952         qemu_put_be64(f, block->used_length);
1953     }
1954
1955     rcu_read_unlock();
1956
1957     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1958     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1959
1960     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1961
1962     return 0;
1963 }
1964
1965 static int ram_save_iterate(QEMUFile *f, void *opaque)
1966 {
1967     int ret;
1968     int i;
1969     int64_t t0;
1970     int pages_sent = 0;
1971
1972     rcu_read_lock();
1973     if (ram_list.version != last_version) {
1974         reset_ram_globals();
1975     }
1976
1977     /* Read version before ram_list.blocks */
1978     smp_rmb();
1979
1980     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1981
1982     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1983     i = 0;
1984     while ((ret = qemu_file_rate_limit(f)) == 0) {
1985         int pages;
1986
1987         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1988         /* no more pages to sent */
1989         if (pages == 0) {
1990             break;
1991         }
1992         pages_sent += pages;
1993         acct_info.iterations++;
1994
1995         /* we want to check in the 1st loop, just in case it was the 1st time
1996            and we had to sync the dirty bitmap.
1997            qemu_get_clock_ns() is a bit expensive, so we only check each some
1998            iterations
1999         */
2000         if ((i & 63) == 0) {
2001             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2002             if (t1 > MAX_WAIT) {
2003                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2004                         t1, i);
2005                 break;
2006             }
2007         }
2008         i++;
2009     }
2010     flush_compressed_data(f);
2011     rcu_read_unlock();
2012
2013     /*
2014      * Must occur before EOS (or any QEMUFile operation)
2015      * because of RDMA protocol.
2016      */
2017     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2018
2019     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2020     bytes_transferred += 8;
2021
2022     ret = qemu_file_get_error(f);
2023     if (ret < 0) {
2024         return ret;
2025     }
2026
2027     return pages_sent;
2028 }
2029
2030 /* Called with iothread lock */
2031 static int ram_save_complete(QEMUFile *f, void *opaque)
2032 {
2033     rcu_read_lock();
2034
2035     if (!migration_in_postcopy(migrate_get_current())) {
2036         migration_bitmap_sync();
2037     }
2038
2039     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2040
2041     /* try transferring iterative blocks of memory */
2042
2043     /* flush all remaining blocks regardless of rate limiting */
2044     while (true) {
2045         int pages;
2046
2047         pages = ram_find_and_save_block(f, true, &bytes_transferred);
2048         /* no more blocks to sent */
2049         if (pages == 0) {
2050             break;
2051         }
2052     }
2053
2054     flush_compressed_data(f);
2055     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2056
2057     rcu_read_unlock();
2058
2059     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2060
2061     return 0;
2062 }
2063
2064 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2065                              uint64_t *non_postcopiable_pending,
2066                              uint64_t *postcopiable_pending)
2067 {
2068     uint64_t remaining_size;
2069
2070     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2071
2072     if (!migration_in_postcopy(migrate_get_current()) &&
2073         remaining_size < max_size) {
2074         qemu_mutex_lock_iothread();
2075         rcu_read_lock();
2076         migration_bitmap_sync();
2077         rcu_read_unlock();
2078         qemu_mutex_unlock_iothread();
2079         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2080     }
2081
2082     /* We can do postcopy, and all the data is postcopiable */
2083     *postcopiable_pending += remaining_size;
2084 }
2085
2086 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2087 {
2088     unsigned int xh_len;
2089     int xh_flags;
2090     uint8_t *loaded_data;
2091
2092     if (!xbzrle_decoded_buf) {
2093         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2094     }
2095     loaded_data = xbzrle_decoded_buf;
2096
2097     /* extract RLE header */
2098     xh_flags = qemu_get_byte(f);
2099     xh_len = qemu_get_be16(f);
2100
2101     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2102         error_report("Failed to load XBZRLE page - wrong compression!");
2103         return -1;
2104     }
2105
2106     if (xh_len > TARGET_PAGE_SIZE) {
2107         error_report("Failed to load XBZRLE page - len overflow!");
2108         return -1;
2109     }
2110     /* load data and decode */
2111     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2112
2113     /* decode RLE */
2114     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2115                              TARGET_PAGE_SIZE) == -1) {
2116         error_report("Failed to load XBZRLE page - decode error!");
2117         return -1;
2118     }
2119
2120     return 0;
2121 }
2122
2123 /* Must be called from within a rcu critical section.
2124  * Returns a pointer from within the RCU-protected ram_list.
2125  */
2126 /*
2127  * Read a RAMBlock ID from the stream f.
2128  *
2129  * f: Stream to read from
2130  * flags: Page flags (mostly to see if it's a continuation of previous block)
2131  */
2132 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2133                                               int flags)
2134 {
2135     static RAMBlock *block = NULL;
2136     char id[256];
2137     uint8_t len;
2138
2139     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2140         if (!block) {
2141             error_report("Ack, bad migration stream!");
2142             return NULL;
2143         }
2144         return block;
2145     }
2146
2147     len = qemu_get_byte(f);
2148     qemu_get_buffer(f, (uint8_t *)id, len);
2149     id[len] = 0;
2150
2151     block = qemu_ram_block_by_name(id);
2152     if (!block) {
2153         error_report("Can't find block %s", id);
2154         return NULL;
2155     }
2156
2157     return block;
2158 }
2159
2160 static inline void *host_from_ram_block_offset(RAMBlock *block,
2161                                                ram_addr_t offset)
2162 {
2163     if (!offset_in_ramblock(block, offset)) {
2164         return NULL;
2165     }
2166
2167     return block->host + offset;
2168 }
2169
/*
 * Fill a page (or a whole RDMA chunk) with the single byte value ch.
 *
 * When ch is zero and the range is already all zero, the memory is left
 * untouched; otherwise size bytes at host are set to ch.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* already zero, so skip the write */
        return;
    }

    memset(host, ch, size);
}
2180
2181 static void *do_data_decompress(void *opaque)
2182 {
2183     DecompressParam *param = opaque;
2184     unsigned long pagesize;
2185     uint8_t *des;
2186     int len;
2187
2188     qemu_mutex_lock(&param->mutex);
2189     while (!param->quit) {
2190         if (param->des) {
2191             des = param->des;
2192             len = param->len;
2193             param->des = 0;
2194             qemu_mutex_unlock(&param->mutex);
2195
2196             pagesize = TARGET_PAGE_SIZE;
2197             /* uncompress() will return failed in some case, especially
2198              * when the page is dirted when doing the compression, it's
2199              * not a problem because the dirty page will be retransferred
2200              * and uncompress() won't break the data in other pages.
2201              */
2202             uncompress((Bytef *)des, &pagesize,
2203                        (const Bytef *)param->compbuf, len);
2204
2205             qemu_mutex_lock(&decomp_done_lock);
2206             param->done = true;
2207             qemu_cond_signal(&decomp_done_cond);
2208             qemu_mutex_unlock(&decomp_done_lock);
2209
2210             qemu_mutex_lock(&param->mutex);
2211         } else {
2212             qemu_cond_wait(&param->cond, &param->mutex);
2213         }
2214     }
2215     qemu_mutex_unlock(&param->mutex);
2216
2217     return NULL;
2218 }
2219
2220 static void wait_for_decompress_done(void)
2221 {
2222     int idx, thread_count;
2223
2224     if (!migrate_use_compression()) {
2225         return;
2226     }
2227
2228     thread_count = migrate_decompress_threads();
2229     qemu_mutex_lock(&decomp_done_lock);
2230     for (idx = 0; idx < thread_count; idx++) {
2231         while (!decomp_param[idx].done) {
2232             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2233         }
2234     }
2235     qemu_mutex_unlock(&decomp_done_lock);
2236 }
2237
2238 void migrate_decompress_threads_create(void)
2239 {
2240     int i, thread_count;
2241
2242     thread_count = migrate_decompress_threads();
2243     decompress_threads = g_new0(QemuThread, thread_count);
2244     decomp_param = g_new0(DecompressParam, thread_count);
2245     qemu_mutex_init(&decomp_done_lock);
2246     qemu_cond_init(&decomp_done_cond);
2247     for (i = 0; i < thread_count; i++) {
2248         qemu_mutex_init(&decomp_param[i].mutex);
2249         qemu_cond_init(&decomp_param[i].cond);
2250         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2251         decomp_param[i].done = true;
2252         decomp_param[i].quit = false;
2253         qemu_thread_create(decompress_threads + i, "decompress",
2254                            do_data_decompress, decomp_param + i,
2255                            QEMU_THREAD_JOINABLE);
2256     }
2257 }
2258
2259 void migrate_decompress_threads_join(void)
2260 {
2261     int i, thread_count;
2262
2263     thread_count = migrate_decompress_threads();
2264     for (i = 0; i < thread_count; i++) {
2265         qemu_mutex_lock(&decomp_param[i].mutex);
2266         decomp_param[i].quit = true;
2267         qemu_cond_signal(&decomp_param[i].cond);
2268         qemu_mutex_unlock(&decomp_param[i].mutex);
2269     }
2270     for (i = 0; i < thread_count; i++) {
2271         qemu_thread_join(decompress_threads + i);
2272         qemu_mutex_destroy(&decomp_param[i].mutex);
2273         qemu_cond_destroy(&decomp_param[i].cond);
2274         g_free(decomp_param[i].compbuf);
2275     }
2276     g_free(decompress_threads);
2277     g_free(decomp_param);
2278     decompress_threads = NULL;
2279     decomp_param = NULL;
2280 }
2281
2282 static void decompress_data_with_multi_threads(QEMUFile *f,
2283                                                void *host, int len)
2284 {
2285     int idx, thread_count;
2286
2287     thread_count = migrate_decompress_threads();
2288     qemu_mutex_lock(&decomp_done_lock);
2289     while (true) {
2290         for (idx = 0; idx < thread_count; idx++) {
2291             if (decomp_param[idx].done) {
2292                 decomp_param[idx].done = false;
2293                 qemu_mutex_lock(&decomp_param[idx].mutex);
2294                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2295                 decomp_param[idx].des = host;
2296                 decomp_param[idx].len = len;
2297                 qemu_cond_signal(&decomp_param[idx].cond);
2298                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2299                 break;
2300             }
2301         }
2302         if (idx < thread_count) {
2303             break;
2304         } else {
2305             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2306         }
2307     }
2308     qemu_mutex_unlock(&decomp_done_lock);
2309 }
2310
2311 /*
2312  * Allocate data structures etc needed by incoming migration with postcopy-ram
2313  * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
2314  */
2315 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2316 {
2317     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2318
2319     return postcopy_ram_incoming_init(mis, ram_pages);
2320 }
2321
2322 /*
2323  * Called in postcopy mode by ram_load().
2324  * rcu_read_lock is taken prior to this being called.
2325  */
2326 static int ram_load_postcopy(QEMUFile *f)
2327 {
2328     int flags = 0, ret = 0;
2329     bool place_needed = false;
2330     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2331     MigrationIncomingState *mis = migration_incoming_get_current();
2332     /* Temporary page that is later 'placed' */
2333     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2334     void *last_host = NULL;
2335     bool all_zero = false;
2336
2337     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2338         ram_addr_t addr;
2339         void *host = NULL;
2340         void *page_buffer = NULL;
2341         void *place_source = NULL;
2342         uint8_t ch;
2343
2344         addr = qemu_get_be64(f);
2345         flags = addr & ~TARGET_PAGE_MASK;
2346         addr &= TARGET_PAGE_MASK;
2347
2348         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2349         place_needed = false;
2350         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2351             RAMBlock *block = ram_block_from_stream(f, flags);
2352
2353             host = host_from_ram_block_offset(block, addr);
2354             if (!host) {
2355                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2356                 ret = -EINVAL;
2357                 break;
2358             }
2359             /*
2360              * Postcopy requires that we place whole host pages atomically.
2361              * To make it atomic, the data is read into a temporary page
2362              * that's moved into place later.
2363              * The migration protocol uses,  possibly smaller, target-pages
2364              * however the source ensures it always sends all the components
2365              * of a host page in order.
2366              */
2367             page_buffer = postcopy_host_page +
2368                           ((uintptr_t)host & ~qemu_host_page_mask);
2369             /* If all TP are zero then we can optimise the place */
2370             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2371                 all_zero = true;
2372             } else {
2373                 /* not the 1st TP within the HP */
2374                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2375                     error_report("Non-sequential target page %p/%p",
2376                                   host, last_host);
2377                     ret = -EINVAL;
2378                     break;
2379                 }
2380             }
2381
2382
2383             /*
2384              * If it's the last part of a host page then we place the host
2385              * page
2386              */
2387             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2388                                      ~qemu_host_page_mask) == 0;
2389             place_source = postcopy_host_page;
2390         }
2391         last_host = host;
2392
2393         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2394         case RAM_SAVE_FLAG_COMPRESS:
2395             ch = qemu_get_byte(f);
2396             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2397             if (ch) {
2398                 all_zero = false;
2399             }
2400             break;
2401
2402         case RAM_SAVE_FLAG_PAGE:
2403             all_zero = false;
2404             if (!place_needed || !matching_page_sizes) {
2405                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2406             } else {
2407                 /* Avoids the qemu_file copy during postcopy, which is
2408                  * going to do a copy later; can only do it when we
2409                  * do this read in one go (matching page sizes)
2410                  */
2411                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2412                                          TARGET_PAGE_SIZE);
2413             }
2414             break;
2415         case RAM_SAVE_FLAG_EOS:
2416             /* normal exit */
2417             break;
2418         default:
2419             error_report("Unknown combination of migration flags: %#x"
2420                          " (postcopy mode)", flags);
2421             ret = -EINVAL;
2422         }
2423
2424         if (place_needed) {
2425             /* This gets called at the last target page in the host page */
2426             if (all_zero) {
2427                 ret = postcopy_place_page_zero(mis,
2428                                                host + TARGET_PAGE_SIZE -
2429                                                qemu_host_page_size);
2430             } else {
2431                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2432                                                qemu_host_page_size,
2433                                                place_source);
2434             }
2435         }
2436         if (!ret) {
2437             ret = qemu_file_get_error(f);
2438         }
2439     }
2440
2441     return ret;
2442 }
2443
2444 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2445 {
2446     int flags = 0, ret = 0;
2447     static uint64_t seq_iter;
2448     int len = 0;
2449     /*
2450      * If system is running in postcopy mode, page inserts to host memory must
2451      * be atomic
2452      */
2453     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2454
2455     seq_iter++;
2456
2457     if (version_id != 4) {
2458         ret = -EINVAL;
2459     }
2460
2461     /* This RCU critical section can be very long running.
2462      * When RCU reclaims in the code start to become numerous,
2463      * it will be necessary to reduce the granularity of this
2464      * critical section.
2465      */
2466     rcu_read_lock();
2467
2468     if (postcopy_running) {
2469         ret = ram_load_postcopy(f);
2470     }
2471
2472     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2473         ram_addr_t addr, total_ram_bytes;
2474         void *host = NULL;
2475         uint8_t ch;
2476
2477         addr = qemu_get_be64(f);
2478         flags = addr & ~TARGET_PAGE_MASK;
2479         addr &= TARGET_PAGE_MASK;
2480
2481         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2482                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2483             RAMBlock *block = ram_block_from_stream(f, flags);
2484
2485             host = host_from_ram_block_offset(block, addr);
2486             if (!host) {
2487                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2488                 ret = -EINVAL;
2489                 break;
2490             }
2491         }
2492
2493         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2494         case RAM_SAVE_FLAG_MEM_SIZE:
2495             /* Synchronize RAM block list */
2496             total_ram_bytes = addr;
2497             while (!ret && total_ram_bytes) {
2498                 RAMBlock *block;
2499                 char id[256];
2500                 ram_addr_t length;
2501
2502                 len = qemu_get_byte(f);
2503                 qemu_get_buffer(f, (uint8_t *)id, len);
2504                 id[len] = 0;
2505                 length = qemu_get_be64(f);
2506
2507                 block = qemu_ram_block_by_name(id);
2508                 if (block) {
2509                     if (length != block->used_length) {
2510                         Error *local_err = NULL;
2511
2512                         ret = qemu_ram_resize(block, length,
2513                                               &local_err);
2514                         if (local_err) {
2515                             error_report_err(local_err);
2516                         }
2517                     }
2518                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2519                                           block->idstr);
2520                 } else {
2521                     error_report("Unknown ramblock \"%s\", cannot "
2522                                  "accept migration", id);
2523                     ret = -EINVAL;
2524                 }
2525
2526                 total_ram_bytes -= length;
2527             }
2528             break;
2529
2530         case RAM_SAVE_FLAG_COMPRESS:
2531             ch = qemu_get_byte(f);
2532             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2533             break;
2534
2535         case RAM_SAVE_FLAG_PAGE:
2536             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2537             break;
2538
2539         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2540             len = qemu_get_be32(f);
2541             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2542                 error_report("Invalid compressed data length: %d", len);
2543                 ret = -EINVAL;
2544                 break;
2545             }
2546             decompress_data_with_multi_threads(f, host, len);
2547             break;
2548
2549         case RAM_SAVE_FLAG_XBZRLE:
2550             if (load_xbzrle(f, addr, host) < 0) {
2551                 error_report("Failed to decompress XBZRLE page at "
2552                              RAM_ADDR_FMT, addr);
2553                 ret = -EINVAL;
2554                 break;
2555             }
2556             break;
2557         case RAM_SAVE_FLAG_EOS:
2558             /* normal exit */
2559             break;
2560         default:
2561             if (flags & RAM_SAVE_FLAG_HOOK) {
2562                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2563             } else {
2564                 error_report("Unknown combination of migration flags: %#x",
2565                              flags);
2566                 ret = -EINVAL;
2567             }
2568         }
2569         if (!ret) {
2570             ret = qemu_file_get_error(f);
2571         }
2572     }
2573
2574     wait_for_decompress_done();
2575     rcu_read_unlock();
2576     DPRINTF("Completed load of VM with exit code %d seq iteration "
2577             "%" PRIu64 "\n", ret, seq_iter);
2578     return ret;
2579 }
2580
2581 static SaveVMHandlers savevm_ram_handlers = {
2582     .save_live_setup = ram_save_setup,
2583     .save_live_iterate = ram_save_iterate,
2584     .save_live_complete_postcopy = ram_save_complete,
2585     .save_live_complete_precopy = ram_save_complete,
2586     .save_live_pending = ram_save_pending,
2587     .load_state = ram_load,
2588     .cleanup = ram_migration_cleanup,
2589 };
2590
/*
 * Initialise XBZRLE.lock and register savevm_ram_handlers as the live
 * savevm handlers for the section named "ram".  The 4 passed here matches
 * the only version_id that ram_load() accepts.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}