Migration: Emit event at start of pass
[qemu/cris-port.git] / migration / ram.c
blob 102d1f2b14f101ea03e68ed3c2e9ca71111678fe
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include <stdint.h>
29 #include <zlib.h>
30 #include "qapi-event.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/timer.h"
34 #include "qemu/main-loop.h"
35 #include "migration/migration.h"
36 #include "migration/postcopy-ram.h"
37 #include "exec/address-spaces.h"
38 #include "migration/page_cache.h"
39 #include "qemu/error-report.h"
40 #include "trace.h"
41 #include "exec/ram_addr.h"
42 #include "qemu/rcu_queue.h"
44 #ifdef DEBUG_MIGRATION_RAM
45 #define DPRINTF(fmt, ...) \
46 do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
47 #else
48 #define DPRINTF(fmt, ...) \
49 do { } while (0)
50 #endif
52 static int dirty_rate_high_cnt;
54 static uint64_t bitmap_sync_count;
56 /***********************************************************/
57 /* ram save/restore */
59 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
60 #define RAM_SAVE_FLAG_COMPRESS 0x02
61 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
62 #define RAM_SAVE_FLAG_PAGE 0x08
63 #define RAM_SAVE_FLAG_EOS 0x10
64 #define RAM_SAVE_FLAG_CONTINUE 0x20
65 #define RAM_SAVE_FLAG_XBZRLE 0x40
66 /* 0x80 is reserved in migration.h start with 0x100 next */
67 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
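/* These flags are OR'ed into the low bits of the 64-bit page offset that
 * save_page_header() writes to the stream, so a normal page is announced
 * roughly as:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);
 *
 * with RAM_SAVE_FLAG_CONTINUE added when the page belongs to the same
 * RAMBlock as the previously sent page, in which case the block idstr is
 * omitted.
 */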
69 static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
71 static inline bool is_zero_range(uint8_t *p, uint64_t size)
73 return buffer_find_nonzero_offset(p, size) == size;
76 /* struct contains XBZRLE cache and a static page
77 used by the compression */
78 static struct {
79 /* buffer used for XBZRLE encoding */
80 uint8_t *encoded_buf;
81 /* buffer for storing page content */
82 uint8_t *current_buf;
83 /* Cache for XBZRLE, Protected by lock. */
84 PageCache *cache;
85 QemuMutex lock;
86 } XBZRLE;
88 /* buffer used for XBZRLE decoding */
89 static uint8_t *xbzrle_decoded_buf;
91 static void XBZRLE_cache_lock(void)
93 if (migrate_use_xbzrle())
94 qemu_mutex_lock(&XBZRLE.lock);
97 static void XBZRLE_cache_unlock(void)
99 if (migrate_use_xbzrle())
100 qemu_mutex_unlock(&XBZRLE.lock);
104 * called from qmp_migrate_set_cache_size in main thread, possibly while
105 * a migration is in progress.
106 * A running migration may be using the cache and might finish during this
107 * call, hence changes to the cache are protected by XBZRLE.lock().
109 int64_t xbzrle_cache_resize(int64_t new_size)
111 PageCache *new_cache;
112 int64_t ret;
114 if (new_size < TARGET_PAGE_SIZE) {
115 return -1;
118 XBZRLE_cache_lock();
120 if (XBZRLE.cache != NULL) {
121 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
122 goto out_new_size;
124 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
125 TARGET_PAGE_SIZE);
126 if (!new_cache) {
127 error_report("Error creating cache");
128 ret = -1;
129 goto out;
132 cache_fini(XBZRLE.cache);
133 XBZRLE.cache = new_cache;
136 out_new_size:
137 ret = pow2floor(new_size);
138 out:
139 XBZRLE_cache_unlock();
140 return ret;
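/* Usage sketch: qmp_migrate_set_cache_size() ends up calling e.g.
 * xbzrle_cache_resize(64 * 1024 * 1024); unless pow2floor() of the request
 * matches the current size, the cache is rebuilt with
 * new_size / TARGET_PAGE_SIZE entries, and the size actually in effect,
 * pow2floor(new_size), is returned (-1 on error).
 */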
143 /* accounting for migration statistics */
144 typedef struct AccountingInfo {
145 uint64_t dup_pages;
146 uint64_t skipped_pages;
147 uint64_t norm_pages;
148 uint64_t iterations;
149 uint64_t xbzrle_bytes;
150 uint64_t xbzrle_pages;
151 uint64_t xbzrle_cache_miss;
152 double xbzrle_cache_miss_rate;
153 uint64_t xbzrle_overflows;
154 } AccountingInfo;
156 static AccountingInfo acct_info;
158 static void acct_clear(void)
160 memset(&acct_info, 0, sizeof(acct_info));
163 uint64_t dup_mig_bytes_transferred(void)
165 return acct_info.dup_pages * TARGET_PAGE_SIZE;
168 uint64_t dup_mig_pages_transferred(void)
170 return acct_info.dup_pages;
173 uint64_t skipped_mig_bytes_transferred(void)
175 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
178 uint64_t skipped_mig_pages_transferred(void)
180 return acct_info.skipped_pages;
183 uint64_t norm_mig_bytes_transferred(void)
185 return acct_info.norm_pages * TARGET_PAGE_SIZE;
188 uint64_t norm_mig_pages_transferred(void)
190 return acct_info.norm_pages;
193 uint64_t xbzrle_mig_bytes_transferred(void)
195 return acct_info.xbzrle_bytes;
198 uint64_t xbzrle_mig_pages_transferred(void)
200 return acct_info.xbzrle_pages;
203 uint64_t xbzrle_mig_pages_cache_miss(void)
205 return acct_info.xbzrle_cache_miss;
208 double xbzrle_mig_cache_miss_rate(void)
210 return acct_info.xbzrle_cache_miss_rate;
213 uint64_t xbzrle_mig_pages_overflow(void)
215 return acct_info.xbzrle_overflows;
218 /* This is the last block that we have visited searching for dirty pages
220 static RAMBlock *last_seen_block;
221 /* This is the last block from where we have sent data */
222 static RAMBlock *last_sent_block;
223 static ram_addr_t last_offset;
224 static QemuMutex migration_bitmap_mutex;
225 static uint64_t migration_dirty_pages;
226 static uint32_t last_version;
227 static bool ram_bulk_stage;
229 /* used by the search for pages to send */
230 struct PageSearchStatus {
231 /* Current block being searched */
232 RAMBlock *block;
233 /* Current offset to search from */
234 ram_addr_t offset;
235 /* Set once we wrap around */
236 bool complete_round;
238 typedef struct PageSearchStatus PageSearchStatus;
240 static struct BitmapRcu {
241 struct rcu_head rcu;
242 /* Main migration bitmap */
243 unsigned long *bmap;
244 /* bitmap of pages that haven't been sent even once
245 * only maintained and used in postcopy at the moment
246 * where it's used to send the dirtymap at the start
247 * of the postcopy phase
249 unsigned long *unsentmap;
250 } *migration_bitmap_rcu;
252 struct CompressParam {
253 bool start;
254 bool done;
255 QEMUFile *file;
256 QemuMutex mutex;
257 QemuCond cond;
258 RAMBlock *block;
259 ram_addr_t offset;
261 typedef struct CompressParam CompressParam;
263 struct DecompressParam {
264 bool start;
265 QemuMutex mutex;
266 QemuCond cond;
267 void *des;
268 uint8_t *compbuf;
269 int len;
271 typedef struct DecompressParam DecompressParam;
273 static CompressParam *comp_param;
274 static QemuThread *compress_threads;
275 /* comp_done_cond is used to wake up the migration thread when
276 * one of the compression threads has finished the compression.
277 * comp_done_lock is used to co-work with comp_done_cond.
279 static QemuMutex *comp_done_lock;
280 static QemuCond *comp_done_cond;
281 /* The empty QEMUFileOps will be used by file in CompressParam */
282 static const QEMUFileOps empty_ops = { };
284 static bool compression_switch;
285 static bool quit_comp_thread;
286 static bool quit_decomp_thread;
287 static DecompressParam *decomp_param;
288 static QemuThread *decompress_threads;
289 static uint8_t *compressed_data_buf;
291 static int do_compress_ram_page(CompressParam *param);
293 static void *do_data_compress(void *opaque)
295 CompressParam *param = opaque;
297 while (!quit_comp_thread) {
298 qemu_mutex_lock(&param->mutex);
299 /* Re-check quit_comp_thread in case
300 * terminate_compression_threads() was called just before
301 * qemu_mutex_lock(&param->mutex) and after
302 * while (!quit_comp_thread); re-checking it here makes
303 * sure the compression thread terminates as expected.
305 while (!param->start && !quit_comp_thread) {
306 qemu_cond_wait(&param->cond, &param->mutex);
308 if (!quit_comp_thread) {
309 do_compress_ram_page(param);
311 param->start = false;
312 qemu_mutex_unlock(&param->mutex);
314 qemu_mutex_lock(comp_done_lock);
315 param->done = true;
316 qemu_cond_signal(comp_done_cond);
317 qemu_mutex_unlock(comp_done_lock);
320 return NULL;
323 static inline void terminate_compression_threads(void)
325 int idx, thread_count;
327 thread_count = migrate_compress_threads();
328 quit_comp_thread = true;
329 for (idx = 0; idx < thread_count; idx++) {
330 qemu_mutex_lock(&comp_param[idx].mutex);
331 qemu_cond_signal(&comp_param[idx].cond);
332 qemu_mutex_unlock(&comp_param[idx].mutex);
336 void migrate_compress_threads_join(void)
338 int i, thread_count;
340 if (!migrate_use_compression()) {
341 return;
343 terminate_compression_threads();
344 thread_count = migrate_compress_threads();
345 for (i = 0; i < thread_count; i++) {
346 qemu_thread_join(compress_threads + i);
347 qemu_fclose(comp_param[i].file);
348 qemu_mutex_destroy(&comp_param[i].mutex);
349 qemu_cond_destroy(&comp_param[i].cond);
351 qemu_mutex_destroy(comp_done_lock);
352 qemu_cond_destroy(comp_done_cond);
353 g_free(compress_threads);
354 g_free(comp_param);
355 g_free(comp_done_cond);
356 g_free(comp_done_lock);
357 compress_threads = NULL;
358 comp_param = NULL;
359 comp_done_cond = NULL;
360 comp_done_lock = NULL;
363 void migrate_compress_threads_create(void)
365 int i, thread_count;
367 if (!migrate_use_compression()) {
368 return;
370 quit_comp_thread = false;
371 compression_switch = true;
372 thread_count = migrate_compress_threads();
373 compress_threads = g_new0(QemuThread, thread_count);
374 comp_param = g_new0(CompressParam, thread_count);
375 comp_done_cond = g_new0(QemuCond, 1);
376 comp_done_lock = g_new0(QemuMutex, 1);
377 qemu_cond_init(comp_done_cond);
378 qemu_mutex_init(comp_done_lock);
379 for (i = 0; i < thread_count; i++) {
380 /* comp_param[i].file is just used as a dummy buffer to save data,
381 * set its ops to empty.
383 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
384 comp_param[i].done = true;
385 qemu_mutex_init(&comp_param[i].mutex);
386 qemu_cond_init(&comp_param[i].cond);
387 qemu_thread_create(compress_threads + i, "compress",
388 do_data_compress, comp_param + i,
389 QEMU_THREAD_JOINABLE);
394 * save_page_header: Write page header to wire
396 * If this is the 1st block, it also writes the block identification
398 * Returns: Number of bytes written
400 * @f: QEMUFile where to send the data
401 * @block: block that contains the page we want to send
402 * @offset: offset inside the block for the page
403 * in the lower bits, it contains flags
405 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
407 size_t size, len;
409 qemu_put_be64(f, offset);
410 size = 8;
412 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
413 len = strlen(block->idstr);
414 qemu_put_byte(f, len);
415 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
416 size += 1 + len;
418 return size;
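/* Resulting wire layout (sketch): a be64 word carrying the page offset plus
 * the RAM_SAVE_FLAG_* bits, then, only when RAM_SAVE_FLAG_CONTINUE is clear,
 * one byte with strlen(block->idstr) followed by the idstr itself; the
 * returned size is therefore 8, or 9 + strlen(block->idstr).
 */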
421 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
422 * If guest dirty memory rate is reduced below the rate at which we can
423 * transfer pages to the destination then we should be able to complete
424 * migration. Some workloads dirty memory way too fast and will not effectively
425 * converge, even with auto-converge.
427 static void mig_throttle_guest_down(void)
429 MigrationState *s = migrate_get_current();
430 uint64_t pct_initial =
431 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
432 uint64_t pct_increment =
433 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
435 /* We have not started throttling yet. Let's start it. */
436 if (!cpu_throttle_active()) {
437 cpu_throttle_set(pct_initial);
438 } else {
439 /* Throttling already on, just increase the rate */
440 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
444 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
445 * The important thing is that a stale (not-yet-0'd) page be replaced
446 * by the new data.
447 * As a bonus, if the page wasn't in the cache it gets added so that
448 * when a small write is made into the 0'd page it gets XBZRLE sent
450 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
452 if (ram_bulk_stage || !migrate_use_xbzrle()) {
453 return;
456 /* We don't care if this fails to allocate a new cache page
457 * as long as it updated an old one */
458 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
459 bitmap_sync_count);
462 #define ENCODING_FLAG_XBZRLE 0x1
465 * save_xbzrle_page: compress and send current page
467 * Returns: 1 means that we wrote the page
468 * 0 means that page is identical to the one already sent
469 * -1 means that xbzrle would be longer than normal
471 * @f: QEMUFile where to send the data
472 * @current_data:
473 * @current_addr:
474 * @block: block that contains the page we want to send
475 * @offset: offset inside the block for the page
476 * @last_stage: if we are at the completion stage
477 * @bytes_transferred: increase it with the number of transferred bytes
479 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
480 ram_addr_t current_addr, RAMBlock *block,
481 ram_addr_t offset, bool last_stage,
482 uint64_t *bytes_transferred)
484 int encoded_len = 0, bytes_xbzrle;
485 uint8_t *prev_cached_page;
487 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
488 acct_info.xbzrle_cache_miss++;
489 if (!last_stage) {
490 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
491 bitmap_sync_count) == -1) {
492 return -1;
493 } else {
494 /* update *current_data when the page has been
495 inserted into cache */
496 *current_data = get_cached_data(XBZRLE.cache, current_addr);
499 return -1;
502 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
504 /* save current buffer into memory */
505 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
507 /* XBZRLE encoding (if there is no overflow) */
508 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
509 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
510 TARGET_PAGE_SIZE);
511 if (encoded_len == 0) {
512 DPRINTF("Skipping unmodified page\n");
513 return 0;
514 } else if (encoded_len == -1) {
515 DPRINTF("Overflow\n");
516 acct_info.xbzrle_overflows++;
517 /* update data in the cache */
518 if (!last_stage) {
519 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
520 *current_data = prev_cached_page;
522 return -1;
525 /* we need to update the data in the cache, in order to get the same data */
526 if (!last_stage) {
527 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
530 /* Send XBZRLE based compressed page */
531 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
532 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
533 qemu_put_be16(f, encoded_len);
534 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
535 bytes_xbzrle += encoded_len + 1 + 2;
536 acct_info.xbzrle_pages++;
537 acct_info.xbzrle_bytes += bytes_xbzrle;
538 *bytes_transferred += bytes_xbzrle;
540 return 1;
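/* On the wire an XBZRLE page therefore looks like: the usual page header
 * with RAM_SAVE_FLAG_XBZRLE set, one ENCODING_FLAG_XBZRLE byte, a be16
 * encoded length, and then encoded_len bytes of delta data; load_xbzrle()
 * on the destination parses exactly this layout.
 */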
543 /* Called with rcu_read_lock() to protect migration_bitmap
544 * rb: The RAMBlock to search for dirty pages in
545 * start: Start address (typically so we can continue from previous page)
546 * ram_addr_abs: Pointer into which to store the address of the dirty page
547 * within the global ram_addr space
549 * Returns: byte offset within memory region of the start of a dirty page
551 static inline
552 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
553 ram_addr_t start,
554 ram_addr_t *ram_addr_abs)
556 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
557 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
558 uint64_t rb_size = rb->used_length;
559 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
560 unsigned long *bitmap;
562 unsigned long next;
564 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
565 if (ram_bulk_stage && nr > base) {
566 next = nr + 1;
567 } else {
568 next = find_next_bit(bitmap, size, nr);
571 *ram_addr_abs = next << TARGET_PAGE_BITS;
572 return (next - base) << TARGET_PAGE_BITS;
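/* Worked example, assuming 4K target pages: for a RAMBlock at offset
 * 0x100000 (base page 0x100), if the first dirty bit found is global page
 * 0x110 then *ram_addr_abs is set to 0x110000 and the function returns
 * 0x10000, i.e. the byte offset of that page within the block.
 */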
575 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
577 bool ret;
578 int nr = addr >> TARGET_PAGE_BITS;
579 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
581 ret = test_and_clear_bit(nr, bitmap);
583 if (ret) {
584 migration_dirty_pages--;
586 return ret;
589 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
591 unsigned long *bitmap;
592 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
593 migration_dirty_pages +=
594 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
597 /* Fix me: there are too many global variables used in migration process. */
598 static int64_t start_time;
599 static int64_t bytes_xfer_prev;
600 static int64_t num_dirty_pages_period;
601 static uint64_t xbzrle_cache_miss_prev;
602 static uint64_t iterations_prev;
604 static void migration_bitmap_sync_init(void)
606 start_time = 0;
607 bytes_xfer_prev = 0;
608 num_dirty_pages_period = 0;
609 xbzrle_cache_miss_prev = 0;
610 iterations_prev = 0;
613 /* Called with iothread lock held, to protect ram_list.dirty_memory[] */
614 static void migration_bitmap_sync(void)
616 RAMBlock *block;
617 uint64_t num_dirty_pages_init = migration_dirty_pages;
618 MigrationState *s = migrate_get_current();
619 int64_t end_time;
620 int64_t bytes_xfer_now;
622 bitmap_sync_count++;
624 if (!bytes_xfer_prev) {
625 bytes_xfer_prev = ram_bytes_transferred();
628 if (!start_time) {
629 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
632 trace_migration_bitmap_sync_start();
633 address_space_sync_dirty_bitmap(&address_space_memory);
635 qemu_mutex_lock(&migration_bitmap_mutex);
636 rcu_read_lock();
637 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
638 migration_bitmap_sync_range(block->offset, block->used_length);
640 rcu_read_unlock();
641 qemu_mutex_unlock(&migration_bitmap_mutex);
643 trace_migration_bitmap_sync_end(migration_dirty_pages
644 - num_dirty_pages_init);
645 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
646 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
648 /* more than 1 second = 1000 milliseconds */
649 if (end_time > start_time + 1000) {
650 if (migrate_auto_converge()) {
651 /* The following detection logic can be refined later. For now:
652 Check to see if the dirtied bytes are 50% more than the approx.
653 amount of bytes that just got transferred since the last time we
654 were in this routine. If that happens twice, start or increase
655 throttling */
656 bytes_xfer_now = ram_bytes_transferred();
658 if (s->dirty_pages_rate &&
659 (num_dirty_pages_period * TARGET_PAGE_SIZE >
660 (bytes_xfer_now - bytes_xfer_prev)/2) &&
661 (dirty_rate_high_cnt++ >= 2)) {
662 trace_migration_throttle();
663 dirty_rate_high_cnt = 0;
664 mig_throttle_guest_down();
666 bytes_xfer_prev = bytes_xfer_now;
669 if (migrate_use_xbzrle()) {
670 if (iterations_prev != acct_info.iterations) {
671 acct_info.xbzrle_cache_miss_rate =
672 (double)(acct_info.xbzrle_cache_miss -
673 xbzrle_cache_miss_prev) /
674 (acct_info.iterations - iterations_prev);
676 iterations_prev = acct_info.iterations;
677 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
679 s->dirty_pages_rate = num_dirty_pages_period * 1000
680 / (end_time - start_time);
681 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
682 start_time = end_time;
683 num_dirty_pages_period = 0;
685 s->dirty_sync_count = bitmap_sync_count;
686 if (migrate_use_events()) {
687 qapi_event_send_migration_pass(bitmap_sync_count, NULL);
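/* The event above is what the commit subject refers to: when events are
 * enabled, each new pass over RAM (i.e. each bitmap sync) emits a QAPI
 * event carrying bitmap_sync_count, roughly
 *     { "event": "MIGRATION_PASS", "data": { "pass": 2 } }
 * so management tools can watch how many passes the migration has made.
 */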
692 * save_zero_page: Send the zero page to the stream
694 * Returns: Number of pages written.
696 * @f: QEMUFile where to send the data
697 * @block: block that contains the page we want to send
698 * @offset: offset inside the block for the page
699 * @p: pointer to the page
700 * @bytes_transferred: increase it with the number of transferred bytes
702 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
703 uint8_t *p, uint64_t *bytes_transferred)
705 int pages = -1;
707 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
708 acct_info.dup_pages++;
709 *bytes_transferred += save_page_header(f, block,
710 offset | RAM_SAVE_FLAG_COMPRESS);
711 qemu_put_byte(f, 0);
712 *bytes_transferred += 1;
713 pages = 1;
716 return pages;
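/* A zero page is thus transmitted as just the page header with
 * RAM_SAVE_FLAG_COMPRESS set plus a single fill byte of 0 (a few bytes
 * instead of TARGET_PAGE_SIZE), which is why dup_pages are accounted
 * separately from norm_pages.
 */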
720 * ram_save_page: Send the given page to the stream
722 * Returns: Number of pages written.
723 * < 0 - error
724 * >=0 - Number of pages written - this might legally be 0
725 * if xbzrle noticed the page was the same.
727 * @f: QEMUFile where to send the data
728 * @block: block that contains the page we want to send
729 * @offset: offset inside the block for the page
730 * @last_stage: if we are at the completion stage
731 * @bytes_transferred: increase it with the number of transferred bytes
733 static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
734 bool last_stage, uint64_t *bytes_transferred)
736 int pages = -1;
737 uint64_t bytes_xmit;
738 ram_addr_t current_addr;
739 uint8_t *p;
740 int ret;
741 bool send_async = true;
743 p = block->host + offset;
745 /* When in doubt, send the page as normal */
746 bytes_xmit = 0;
747 ret = ram_control_save_page(f, block->offset,
748 offset, TARGET_PAGE_SIZE, &bytes_xmit);
749 if (bytes_xmit) {
750 *bytes_transferred += bytes_xmit;
751 pages = 1;
754 XBZRLE_cache_lock();
756 current_addr = block->offset + offset;
758 if (block == last_sent_block) {
759 offset |= RAM_SAVE_FLAG_CONTINUE;
761 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
762 if (ret != RAM_SAVE_CONTROL_DELAYED) {
763 if (bytes_xmit > 0) {
764 acct_info.norm_pages++;
765 } else if (bytes_xmit == 0) {
766 acct_info.dup_pages++;
769 } else {
770 pages = save_zero_page(f, block, offset, p, bytes_transferred);
771 if (pages > 0) {
772 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
773 * page would be stale
775 xbzrle_cache_zero_page(current_addr);
776 } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
777 pages = save_xbzrle_page(f, &p, current_addr, block,
778 offset, last_stage, bytes_transferred);
779 if (!last_stage) {
780 /* Can't send this cached data async, since the cache page
781 * might get updated before it gets to the wire
783 send_async = false;
788 /* XBZRLE overflow or normal page */
789 if (pages == -1) {
790 *bytes_transferred += save_page_header(f, block,
791 offset | RAM_SAVE_FLAG_PAGE);
792 if (send_async) {
793 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
794 } else {
795 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
797 *bytes_transferred += TARGET_PAGE_SIZE;
798 pages = 1;
799 acct_info.norm_pages++;
802 XBZRLE_cache_unlock();
804 return pages;
807 static int do_compress_ram_page(CompressParam *param)
809 int bytes_sent, blen;
810 uint8_t *p;
811 RAMBlock *block = param->block;
812 ram_addr_t offset = param->offset;
814 p = block->host + (offset & TARGET_PAGE_MASK);
816 bytes_sent = save_page_header(param->file, block, offset |
817 RAM_SAVE_FLAG_COMPRESS_PAGE);
818 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
819 migrate_compress_level());
820 bytes_sent += blen;
822 return bytes_sent;
825 static inline void start_compression(CompressParam *param)
827 param->done = false;
828 qemu_mutex_lock(&param->mutex);
829 param->start = true;
830 qemu_cond_signal(&param->cond);
831 qemu_mutex_unlock(&param->mutex);
834 static inline void start_decompression(DecompressParam *param)
836 qemu_mutex_lock(&param->mutex);
837 param->start = true;
838 qemu_cond_signal(&param->cond);
839 qemu_mutex_unlock(&param->mutex);
842 static uint64_t bytes_transferred;
844 static void flush_compressed_data(QEMUFile *f)
846 int idx, len, thread_count;
848 if (!migrate_use_compression()) {
849 return;
851 thread_count = migrate_compress_threads();
852 for (idx = 0; idx < thread_count; idx++) {
853 if (!comp_param[idx].done) {
854 qemu_mutex_lock(comp_done_lock);
855 while (!comp_param[idx].done && !quit_comp_thread) {
856 qemu_cond_wait(comp_done_cond, comp_done_lock);
858 qemu_mutex_unlock(comp_done_lock);
860 if (!quit_comp_thread) {
861 len = qemu_put_qemu_file(f, comp_param[idx].file);
862 bytes_transferred += len;
867 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
868 ram_addr_t offset)
870 param->block = block;
871 param->offset = offset;
874 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
875 ram_addr_t offset,
876 uint64_t *bytes_transferred)
878 int idx, thread_count, bytes_xmit = -1, pages = -1;
880 thread_count = migrate_compress_threads();
881 qemu_mutex_lock(comp_done_lock);
882 while (true) {
883 for (idx = 0; idx < thread_count; idx++) {
884 if (comp_param[idx].done) {
885 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
886 set_compress_params(&comp_param[idx], block, offset);
887 start_compression(&comp_param[idx]);
888 pages = 1;
889 acct_info.norm_pages++;
890 *bytes_transferred += bytes_xmit;
891 break;
894 if (pages > 0) {
895 break;
896 } else {
897 qemu_cond_wait(comp_done_cond, comp_done_lock);
900 qemu_mutex_unlock(comp_done_lock);
902 return pages;
906 * ram_save_compressed_page: compress the given page and send it to the stream
908 * Returns: Number of pages written.
910 * @f: QEMUFile where to send the data
911 * @block: block that contains the page we want to send
912 * @offset: offset inside the block for the page
913 * @last_stage: if we are at the completion stage
914 * @bytes_transferred: increase it with the number of transferred bytes
916 static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
917 ram_addr_t offset, bool last_stage,
918 uint64_t *bytes_transferred)
920 int pages = -1;
921 uint64_t bytes_xmit;
922 uint8_t *p;
923 int ret;
925 p = block->host + offset;
927 bytes_xmit = 0;
928 ret = ram_control_save_page(f, block->offset,
929 offset, TARGET_PAGE_SIZE, &bytes_xmit);
930 if (bytes_xmit) {
931 *bytes_transferred += bytes_xmit;
932 pages = 1;
934 if (block == last_sent_block) {
935 offset |= RAM_SAVE_FLAG_CONTINUE;
937 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
938 if (ret != RAM_SAVE_CONTROL_DELAYED) {
939 if (bytes_xmit > 0) {
940 acct_info.norm_pages++;
941 } else if (bytes_xmit == 0) {
942 acct_info.dup_pages++;
945 } else {
946 /* When starting the process of a new block, the first page of
947 * the block should be sent out before other pages in the same
948 * block, and all the pages in the last block should have been sent
949 * out; keeping this order is important, because the 'cont' flag
950 * is used to avoid resending the block name.
952 if (block != last_sent_block) {
953 flush_compressed_data(f);
954 pages = save_zero_page(f, block, offset, p, bytes_transferred);
955 if (pages == -1) {
956 set_compress_params(&comp_param[0], block, offset);
957 /* Use the qemu thread to compress the data to make sure the
958 * first page is sent out before other pages
960 bytes_xmit = do_compress_ram_page(&comp_param[0]);
961 acct_info.norm_pages++;
962 qemu_put_qemu_file(f, comp_param[0].file);
963 *bytes_transferred += bytes_xmit;
964 pages = 1;
966 } else {
967 pages = save_zero_page(f, block, offset, p, bytes_transferred);
968 if (pages == -1) {
969 pages = compress_page_with_multi_thread(f, block, offset,
970 bytes_transferred);
975 return pages;
979 * Find the next dirty page and update any state associated with
980 * the search process.
982 * Returns: True if a page is found
984 * @f: Current migration stream.
985 * @pss: Data about the state of the current dirty page scan.
986 * @*again: Set to false if the search has scanned the whole of RAM
987 * *ram_addr_abs: Pointer into which to store the address of the dirty page
988 * within the global ram_addr space
990 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
991 bool *again, ram_addr_t *ram_addr_abs)
993 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
994 ram_addr_abs);
995 if (pss->complete_round && pss->block == last_seen_block &&
996 pss->offset >= last_offset) {
998 * We've been once around the RAM and haven't found anything.
999 * Give up.
1001 *again = false;
1002 return false;
1004 if (pss->offset >= pss->block->used_length) {
1005 /* Didn't find anything in this RAM Block */
1006 pss->offset = 0;
1007 pss->block = QLIST_NEXT_RCU(pss->block, next);
1008 if (!pss->block) {
1009 /* Hit the end of the list */
1010 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1011 /* Flag that we've looped */
1012 pss->complete_round = true;
1013 ram_bulk_stage = false;
1014 if (migrate_use_xbzrle()) {
1015 /* If xbzrle is on, stop using the data compression at this
1016 * point. In theory, xbzrle can do better than compression.
1018 flush_compressed_data(f);
1019 compression_switch = false;
1022 /* Didn't find anything this time, but try again on the new block */
1023 *again = true;
1024 return false;
1025 } else {
1026 /* Can go around again, but... */
1027 *again = true;
1028 /* We've found something so probably don't need to */
1029 return true;
1034 * Helper for 'get_queued_page' - gets a page off the queue
1035 * ms: MigrationState in
1036 * *offset: Used to return the offset within the RAMBlock
1037 * ram_addr_abs: global offset in the dirty/sent bitmaps
1039 * Returns: block (or NULL if none available)
1041 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1042 ram_addr_t *ram_addr_abs)
1044 RAMBlock *block = NULL;
1046 qemu_mutex_lock(&ms->src_page_req_mutex);
1047 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1048 struct MigrationSrcPageRequest *entry =
1049 QSIMPLEQ_FIRST(&ms->src_page_requests);
1050 block = entry->rb;
1051 *offset = entry->offset;
1052 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1053 TARGET_PAGE_MASK;
1055 if (entry->len > TARGET_PAGE_SIZE) {
1056 entry->len -= TARGET_PAGE_SIZE;
1057 entry->offset += TARGET_PAGE_SIZE;
1058 } else {
1059 memory_region_unref(block->mr);
1060 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1061 g_free(entry);
1064 qemu_mutex_unlock(&ms->src_page_req_mutex);
1066 return block;
1070 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1071 * that are already sent (!dirty)
1073 * ms: MigrationState in
1074 * pss: PageSearchStatus structure updated with found block/offset
1075 * ram_addr_abs: global offset in the dirty/sent bitmaps
1077 * Returns: true if a queued page is found
1079 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1080 ram_addr_t *ram_addr_abs)
1082 RAMBlock *block;
1083 ram_addr_t offset;
1084 bool dirty;
1086 do {
1087 block = unqueue_page(ms, &offset, ram_addr_abs);
1089 * We're sending this page, and since it's postcopy nothing else
1090 * will dirty it, and we must make sure it doesn't get sent again
1091 * even if this queue request was received after the background
1092 * search already sent it.
1094 if (block) {
1095 unsigned long *bitmap;
1096 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1097 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1098 if (!dirty) {
1099 trace_get_queued_page_not_dirty(
1100 block->idstr, (uint64_t)offset,
1101 (uint64_t)*ram_addr_abs,
1102 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1103 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1104 } else {
1105 trace_get_queued_page(block->idstr,
1106 (uint64_t)offset,
1107 (uint64_t)*ram_addr_abs);
1111 } while (block && !dirty);
1113 if (block) {
1115 * As soon as we start servicing pages out of order, then we have
1116 * to kill the bulk stage, since the bulk stage assumes
1117 * in (migration_bitmap_find_and_reset_dirty) that every page is
1118 * dirty; that's no longer true.
1120 ram_bulk_stage = false;
1123 * We want the background search to continue from the queued page
1124 * since the guest is likely to want other pages near to the page
1125 * it just requested.
1127 pss->block = block;
1128 pss->offset = offset;
1131 return !!block;
1135 * flush_page_queue: Flush any remaining pages in the ram request queue
1136 * it should be empty at the end anyway, but in error cases there may be
1137 * some left.
1139 * ms: MigrationState
1141 void flush_page_queue(MigrationState *ms)
1143 struct MigrationSrcPageRequest *mspr, *next_mspr;
1144 /* This queue generally should be empty - but in the case of a failed
1145 * migration might have some droppings in.
1147 rcu_read_lock();
1148 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1149 memory_region_unref(mspr->rb->mr);
1150 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1151 g_free(mspr);
1153 rcu_read_unlock();
1157 * Queue the pages for transmission, e.g. a request from postcopy destination
1158 * ms: MigrationStatus in which the queue is held
1159 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1160 * start: Offset from the start of the RAMBlock
1161 * len: Length (in bytes) to send
1162 * Return: 0 on success
1164 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1165 ram_addr_t start, ram_addr_t len)
1167 RAMBlock *ramblock;
1169 rcu_read_lock();
1170 if (!rbname) {
1171 /* Reuse last RAMBlock */
1172 ramblock = ms->last_req_rb;
1174 if (!ramblock) {
1176 * Shouldn't happen, we can't reuse the last RAMBlock if
1177 * it's the 1st request.
1179 error_report("ram_save_queue_pages no previous block");
1180 goto err;
1182 } else {
1183 ramblock = qemu_ram_block_by_name(rbname);
1185 if (!ramblock) {
1186 /* We shouldn't be asked for a non-existent RAMBlock */
1187 error_report("ram_save_queue_pages no block '%s'", rbname);
1188 goto err;
1190 ms->last_req_rb = ramblock;
1192 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1193 if (start + len > ramblock->used_length) {
1194 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1195 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1196 __func__, start, len, ramblock->used_length);
1197 goto err;
1200 struct MigrationSrcPageRequest *new_entry =
1201 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1202 new_entry->rb = ramblock;
1203 new_entry->offset = start;
1204 new_entry->len = len;
1206 memory_region_ref(ramblock->mr);
1207 qemu_mutex_lock(&ms->src_page_req_mutex);
1208 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1209 qemu_mutex_unlock(&ms->src_page_req_mutex);
1210 rcu_read_unlock();
1212 return 0;
1214 err:
1215 rcu_read_unlock();
1216 return -1;
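/* Usage sketch (block name assumed, not taken from this file): when the
 * postcopy destination faults on a missing page it sends a page request
 * back to the source, which ends up calling something like
 *     ram_save_queue_pages(ms, "pc.ram", start, TARGET_PAGE_SIZE);
 * get_queued_page() then services that request ahead of the background
 * dirty-bitmap scan.
 */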
1220 * ram_save_target_page: Save one target page
1223 * @f: QEMUFile where to send the data
1224 * @block: pointer to block that contains the page we want to send
1225 * @offset: offset inside the block for the page;
1226 * @last_stage: if we are at the completion stage
1227 * @bytes_transferred: increase it with the number of transferred bytes
1228 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1230 * Returns: Number of pages written.
1232 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1233 RAMBlock *block, ram_addr_t offset,
1234 bool last_stage,
1235 uint64_t *bytes_transferred,
1236 ram_addr_t dirty_ram_abs)
1238 int res = 0;
1240 /* Check if the page is dirty and, if it is, send it */
1241 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1242 unsigned long *unsentmap;
1243 if (compression_switch && migrate_use_compression()) {
1244 res = ram_save_compressed_page(f, block, offset,
1245 last_stage,
1246 bytes_transferred);
1247 } else {
1248 res = ram_save_page(f, block, offset, last_stage,
1249 bytes_transferred);
1252 if (res < 0) {
1253 return res;
1255 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1256 if (unsentmap) {
1257 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1259 /* Only update last_sent_block if a block was actually sent; xbzrle
1260 * might have decided the page was identical so didn't bother writing
1261 * to the stream.
1263 if (res > 0) {
1264 last_sent_block = block;
1268 return res;
1272 * ram_save_host_page: Starting at *offset send pages up to the end
1273 * of the current host page. It's valid for the initial
1274 * offset to point into the middle of a host page
1275 * in which case the remainder of the hostpage is sent.
1276 * Only dirty target pages are sent.
1278 * Returns: Number of pages written.
1280 * @f: QEMUFile where to send the data
1281 * @block: pointer to block that contains the page we want to send
1282 * @offset: offset inside the block for the page; updated to last target page
1283 * sent
1284 * @last_stage: if we are at the completion stage
1285 * @bytes_transferred: increase it with the number of transferred bytes
1286 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1288 static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block,
1289 ram_addr_t *offset, bool last_stage,
1290 uint64_t *bytes_transferred,
1291 ram_addr_t dirty_ram_abs)
1293 int tmppages, pages = 0;
1294 do {
1295 tmppages = ram_save_target_page(ms, f, block, *offset, last_stage,
1296 bytes_transferred, dirty_ram_abs);
1297 if (tmppages < 0) {
1298 return tmppages;
1301 pages += tmppages;
1302 *offset += TARGET_PAGE_SIZE;
1303 dirty_ram_abs += TARGET_PAGE_SIZE;
1304 } while (*offset & (qemu_host_page_size - 1));
1306 /* The offset we leave with is the last one we looked at */
1307 *offset -= TARGET_PAGE_SIZE;
1308 return pages;
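/* Example, assuming a 64K host page size with 4K target pages: one call can
 * send up to 16 target pages, stopping when *offset reaches the next host
 * page boundary; *offset is then stepped back one target page so the caller
 * records the last page actually examined.
 */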
1312 * ram_find_and_save_block: Finds a dirty page and sends it to f
1314 * Called within an RCU critical section.
1316 * Returns: The number of pages written
1317 * 0 means no dirty pages
1319 * @f: QEMUFile where to send the data
1320 * @last_stage: if we are at the completion stage
1321 * @bytes_transferred: increase it with the number of transferred bytes
1323 * On systems where host-page-size > target-page-size it will send all the
1324 * pages in a host page that are dirty.
1327 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1328 uint64_t *bytes_transferred)
1330 PageSearchStatus pss;
1331 MigrationState *ms = migrate_get_current();
1332 int pages = 0;
1333 bool again, found;
1334 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1335 ram_addr_t space */
1337 pss.block = last_seen_block;
1338 pss.offset = last_offset;
1339 pss.complete_round = false;
1341 if (!pss.block) {
1342 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1345 do {
1346 again = true;
1347 found = get_queued_page(ms, &pss, &dirty_ram_abs);
1349 if (!found) {
1350 /* priority queue empty, so just search for something dirty */
1351 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1354 if (found) {
1355 pages = ram_save_host_page(ms, f, pss.block, &pss.offset,
1356 last_stage, bytes_transferred,
1357 dirty_ram_abs);
1359 } while (!pages && again);
1361 last_seen_block = pss.block;
1362 last_offset = pss.offset;
1364 return pages;
1367 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1369 uint64_t pages = size / TARGET_PAGE_SIZE;
1370 if (zero) {
1371 acct_info.dup_pages += pages;
1372 } else {
1373 acct_info.norm_pages += pages;
1374 bytes_transferred += size;
1375 qemu_update_position(f, size);
1379 static ram_addr_t ram_save_remaining(void)
1381 return migration_dirty_pages;
1384 uint64_t ram_bytes_remaining(void)
1386 return ram_save_remaining() * TARGET_PAGE_SIZE;
1389 uint64_t ram_bytes_transferred(void)
1391 return bytes_transferred;
1394 uint64_t ram_bytes_total(void)
1396 RAMBlock *block;
1397 uint64_t total = 0;
1399 rcu_read_lock();
1400 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1401 total += block->used_length;
1402 rcu_read_unlock();
1403 return total;
1406 void free_xbzrle_decoded_buf(void)
1408 g_free(xbzrle_decoded_buf);
1409 xbzrle_decoded_buf = NULL;
1412 static void migration_bitmap_free(struct BitmapRcu *bmap)
1414 g_free(bmap->bmap);
1415 g_free(bmap->unsentmap);
1416 g_free(bmap);
1419 static void ram_migration_cleanup(void *opaque)
1421 /* caller has held the iothread lock or is in a bh, so there is
1422 * no writing race against this migration_bitmap
1424 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1425 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1426 if (bitmap) {
1427 memory_global_dirty_log_stop();
1428 call_rcu(bitmap, migration_bitmap_free, rcu);
1431 XBZRLE_cache_lock();
1432 if (XBZRLE.cache) {
1433 cache_fini(XBZRLE.cache);
1434 g_free(XBZRLE.encoded_buf);
1435 g_free(XBZRLE.current_buf);
1436 XBZRLE.cache = NULL;
1437 XBZRLE.encoded_buf = NULL;
1438 XBZRLE.current_buf = NULL;
1440 XBZRLE_cache_unlock();
1443 static void reset_ram_globals(void)
1445 last_seen_block = NULL;
1446 last_sent_block = NULL;
1447 last_offset = 0;
1448 last_version = ram_list.version;
1449 ram_bulk_stage = true;
1452 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1454 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1456 /* called in qemu main thread, so there is
1457 * no writing race against this migration_bitmap
1459 if (migration_bitmap_rcu) {
1460 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1461 bitmap = g_new(struct BitmapRcu, 1);
1462 bitmap->bmap = bitmap_new(new);
1464 /* prevent migration_bitmap content from having bits set
1465 * by migration_bitmap_sync_range() at the same time.
1466 * It is safe for migration if bits in migration_bitmap are only
1467 * cleared at the same time.
1469 qemu_mutex_lock(&migration_bitmap_mutex);
1470 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1471 bitmap_set(bitmap->bmap, old, new - old);
1473 /* We don't have a way to safely extend the sentmap
1474 * with RCU; so mark it as missing, entry to postcopy
1475 * will fail.
1477 bitmap->unsentmap = NULL;
1479 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1480 qemu_mutex_unlock(&migration_bitmap_mutex);
1481 migration_dirty_pages += new - old;
1482 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1487 * 'expected' is the value you expect the bitmap mostly to be full
1488 * of; it won't bother printing lines that are all this value.
1489 * If 'todump' is null the migration bitmap is dumped.
1491 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1493 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1495 int64_t cur;
1496 int64_t linelen = 128;
1497 char linebuf[129];
1499 if (!todump) {
1500 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1503 for (cur = 0; cur < ram_pages; cur += linelen) {
1504 int64_t curb;
1505 bool found = false;
1507 * Last line; catch the case where the line length
1508 * is longer than remaining ram
1510 if (cur + linelen > ram_pages) {
1511 linelen = ram_pages - cur;
1513 for (curb = 0; curb < linelen; curb++) {
1514 bool thisbit = test_bit(cur + curb, todump);
1515 linebuf[curb] = thisbit ? '1' : '.';
1516 found = found || (thisbit != expected);
1518 if (found) {
1519 linebuf[curb] = '\0';
1520 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1525 /* **** functions for postcopy ***** */
1528 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1529 * Note: At this point the 'unsentmap' is the processed bitmap combined
1530 * with the dirtymap; so a '1' means it's either dirty or unsent.
1531 * start,length: Indexes into the bitmap for the first bit
1532 * representing the named block and length in target-pages
1534 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1535 PostcopyDiscardState *pds,
1536 unsigned long start,
1537 unsigned long length)
1539 unsigned long end = start + length; /* one after the end */
1540 unsigned long current;
1541 unsigned long *unsentmap;
1543 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1544 for (current = start; current < end; ) {
1545 unsigned long one = find_next_bit(unsentmap, end, current);
1547 if (one <= end) {
1548 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1549 unsigned long discard_length;
1551 if (zero >= end) {
1552 discard_length = end - one;
1553 } else {
1554 discard_length = zero - one;
1556 postcopy_discard_send_range(ms, pds, one, discard_length);
1557 current = one + discard_length;
1558 } else {
1559 current = one;
1563 return 0;
1567 * Utility for the outgoing postcopy code.
1568 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1569 * passing it bitmap indexes and name.
1570 * Returns: 0 on success
1571 * (qemu_ram_foreach_block ends up passing unscaled lengths
1572 * which would mean postcopy code would have to deal with target page)
1574 static int postcopy_each_ram_send_discard(MigrationState *ms)
1576 struct RAMBlock *block;
1577 int ret;
1579 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1580 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1581 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1582 first,
1583 block->idstr);
1586 * Postcopy sends chunks of bitmap over the wire, but it
1587 * just needs indexes at this point, avoids it having
1588 * target page specific code.
1590 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1591 block->used_length >> TARGET_PAGE_BITS);
1592 postcopy_discard_send_finish(ms, pds);
1593 if (ret) {
1594 return ret;
1598 return 0;
1602 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
1603 * the two bitmaps, that are similar, but one is inverted.
1605 * We search for runs of target-pages that don't start or end on a
1606 * host page boundary;
1607 * unsent_pass=true: Cleans up partially unsent host pages by searching
1608 * the unsentmap
1609 * unsent_pass=false: Cleans up partially dirty host pages by searching
1610 * the main migration bitmap
1613 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1614 RAMBlock *block,
1615 PostcopyDiscardState *pds)
1617 unsigned long *bitmap;
1618 unsigned long *unsentmap;
1619 unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1620 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1621 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1622 unsigned long last = first + (len - 1);
1623 unsigned long run_start;
1625 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1626 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1628 if (unsent_pass) {
1629 /* Find a sent page */
1630 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1631 } else {
1632 /* Find a dirty page */
1633 run_start = find_next_bit(bitmap, last + 1, first);
1636 while (run_start <= last) {
1637 bool do_fixup = false;
1638 unsigned long fixup_start_addr;
1639 unsigned long host_offset;
1642 * If the start of this run of pages is in the middle of a host
1643 * page, then we need to fixup this host page.
1645 host_offset = run_start % host_ratio;
1646 if (host_offset) {
1647 do_fixup = true;
1648 run_start -= host_offset;
1649 fixup_start_addr = run_start;
1650 /* For the next pass */
1651 run_start = run_start + host_ratio;
1652 } else {
1653 /* Find the end of this run */
1654 unsigned long run_end;
1655 if (unsent_pass) {
1656 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1657 } else {
1658 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1661 * If the end isn't at the start of a host page, then the
1662 * run doesn't finish at the end of a host page
1663 * and we need to discard.
1665 host_offset = run_end % host_ratio;
1666 if (host_offset) {
1667 do_fixup = true;
1668 fixup_start_addr = run_end - host_offset;
1670 * This host page has gone, the next loop iteration starts
1671 * from after the fixup
1673 run_start = fixup_start_addr + host_ratio;
1674 } else {
1676 * No discards on this iteration, next loop starts from
1677 * next sent/dirty page
1679 run_start = run_end + 1;
1683 if (do_fixup) {
1684 unsigned long page;
1686 /* Tell the destination to discard this page */
1687 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1688 /* For the unsent_pass we:
1689 * discard partially sent pages
1690 * For the !unsent_pass (dirty) we:
1691 * discard partially dirty pages that were sent
1692 * (any partially sent pages were already discarded
1693 * by the previous unsent_pass)
1695 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1696 host_ratio);
1699 /* Clean up the bitmap */
1700 for (page = fixup_start_addr;
1701 page < fixup_start_addr + host_ratio; page++) {
1702 /* All pages in this host page are now not sent */
1703 set_bit(page, unsentmap);
1706 * Remark them as dirty, updating the count for any pages
1707 * that weren't previously dirty.
1709 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1713 if (unsent_pass) {
1714 /* Find the next sent page for the next iteration */
1715 run_start = find_next_zero_bit(unsentmap, last + 1,
1716 run_start);
1717 } else {
1718 /* Find the next dirty page for the next iteration */
1719 run_start = find_next_bit(bitmap, last + 1, run_start);
1725 * Utility for the outgoing postcopy code.
1727 * Discard any partially sent host-page size chunks, mark any partially
1728 * dirty host-page size chunks as all dirty.
1730 * Returns: 0 on success
1732 static int postcopy_chunk_hostpages(MigrationState *ms)
1734 struct RAMBlock *block;
1736 if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1737 /* Easy case - TPS==HPS - nothing to be done */
1738 return 0;
1741 /* Easiest way to make sure we don't resume in the middle of a host-page */
1742 last_seen_block = NULL;
1743 last_sent_block = NULL;
1744 last_offset = 0;
1746 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1747 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1749 PostcopyDiscardState *pds =
1750 postcopy_discard_send_init(ms, first, block->idstr);
1752 /* First pass: Discard all partially sent host pages */
1753 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1755 * Second pass: Ensure that all partially dirty host pages are made
1756 * fully dirty.
1758 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1760 postcopy_discard_send_finish(ms, pds);
1761 } /* ram_list loop */
1763 return 0;
1767 * Transmit the set of pages to be discarded after precopy to the target
1768 * these are pages that:
1769 * a) Have been previously transmitted but are now dirty again
1770 * b) Pages that have never been transmitted, this ensures that
1771 * any pages on the destination that have been mapped by background
1772 * tasks get discarded (transparent huge pages is the specific concern)
1773 * Hopefully this is pretty sparse
1775 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1777 int ret;
1778 unsigned long *bitmap, *unsentmap;
1780 rcu_read_lock();
1782 /* This should be our last sync, the src is now paused */
1783 migration_bitmap_sync();
1785 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1786 if (!unsentmap) {
1787 /* We don't have a safe way to resize the sentmap, so
1788 * if the bitmap was resized it will be NULL at this
1789 * point.
1791 error_report("migration ram resized during precopy phase");
1792 rcu_read_unlock();
1793 return -EINVAL;
1796 /* Deal with TPS != HPS */
1797 ret = postcopy_chunk_hostpages(ms);
1798 if (ret) {
1799 rcu_read_unlock();
1800 return ret;
1804 * Update the unsentmap to be unsentmap = unsentmap | dirty
1806 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1807 bitmap_or(unsentmap, unsentmap, bitmap,
1808 last_ram_offset() >> TARGET_PAGE_BITS);
1811 trace_ram_postcopy_send_discard_bitmap();
1812 #ifdef DEBUG_POSTCOPY
1813 ram_debug_dump_bitmap(unsentmap, true);
1814 #endif
1816 ret = postcopy_each_ram_send_discard(ms);
1817 rcu_read_unlock();
1819 return ret;
1823 * At the start of the postcopy phase of migration, any now-dirty
1824 * precopied pages are discarded.
1826 * start, length describe a byte address range within the RAMBlock
1828 * Returns 0 on success.
1830 int ram_discard_range(MigrationIncomingState *mis,
1831 const char *block_name,
1832 uint64_t start, size_t length)
1834 int ret = -1;
1836 rcu_read_lock();
1837 RAMBlock *rb = qemu_ram_block_by_name(block_name);
1839 if (!rb) {
1840 error_report("ram_discard_range: Failed to find block '%s'",
1841 block_name);
1842 goto err;
1845 uint8_t *host_startaddr = rb->host + start;
1847 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1848 error_report("ram_discard_range: Unaligned start address: %p",
1849 host_startaddr);
1850 goto err;
1853 if ((start + length) <= rb->used_length) {
1854 uint8_t *host_endaddr = host_startaddr + length;
1855 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1856 error_report("ram_discard_range: Unaligned end address: %p",
1857 host_endaddr);
1858 goto err;
1860 ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1861 } else {
1862 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1863 "/%zx/" RAM_ADDR_FMT")",
1864 block_name, start, length, rb->used_length);
1867 err:
1868 rcu_read_unlock();
1870 return ret;
1874 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1875 * a long-running RCU critical section. When rcu-reclaims in the code
1876 * start to become numerous it will be necessary to reduce the
1877 * granularity of these critical sections.
1880 static int ram_save_setup(QEMUFile *f, void *opaque)
1882 RAMBlock *block;
1883 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1885 dirty_rate_high_cnt = 0;
1886 bitmap_sync_count = 0;
1887 migration_bitmap_sync_init();
1888 qemu_mutex_init(&migration_bitmap_mutex);
1890 if (migrate_use_xbzrle()) {
1891 XBZRLE_cache_lock();
1892 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1893 TARGET_PAGE_SIZE,
1894 TARGET_PAGE_SIZE);
1895 if (!XBZRLE.cache) {
1896 XBZRLE_cache_unlock();
1897 error_report("Error creating cache");
1898 return -1;
1900 XBZRLE_cache_unlock();
1902 /* We prefer not to abort if there is no memory */
1903 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1904 if (!XBZRLE.encoded_buf) {
1905 error_report("Error allocating encoded_buf");
1906 return -1;
1909 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1910 if (!XBZRLE.current_buf) {
1911 error_report("Error allocating current_buf");
1912 g_free(XBZRLE.encoded_buf);
1913 XBZRLE.encoded_buf = NULL;
1914 return -1;
1917 acct_clear();
1920 /* iothread lock needed for ram_list.dirty_memory[] */
1921 qemu_mutex_lock_iothread();
1922 qemu_mutex_lock_ramlist();
1923 rcu_read_lock();
1924 bytes_transferred = 0;
1925 reset_ram_globals();
1927 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1928 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1929 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1930 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1932 if (migrate_postcopy_ram()) {
1933 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1934 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1938 * Count the total number of pages used by ram blocks not including any
1939 * gaps due to alignment or unplugs.
1941 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1943 memory_global_dirty_log_start();
1944 migration_bitmap_sync();
1945 qemu_mutex_unlock_ramlist();
1946 qemu_mutex_unlock_iothread();
1948 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1950 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1951 qemu_put_byte(f, strlen(block->idstr));
1952 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1953 qemu_put_be64(f, block->used_length);
1956 rcu_read_unlock();
1958 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1959 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1961 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1963 return 0;
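/* The setup section written above is therefore: a be64 of total RAM size
 * with RAM_SAVE_FLAG_MEM_SIZE set, then for every RAMBlock a one-byte idstr
 * length, the idstr itself and a be64 used_length, and finally a bare
 * RAM_SAVE_FLAG_EOS marker.
 */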
1966 static int ram_save_iterate(QEMUFile *f, void *opaque)
1968 int ret;
1969 int i;
1970 int64_t t0;
1971 int pages_sent = 0;
1973 rcu_read_lock();
1974 if (ram_list.version != last_version) {
1975 reset_ram_globals();
1978 /* Read version before ram_list.blocks */
1979 smp_rmb();
1981 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1983 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1984 i = 0;
1985 while ((ret = qemu_file_rate_limit(f)) == 0) {
1986 int pages;
1988 pages = ram_find_and_save_block(f, false, &bytes_transferred);
1989 /* no more pages to send */
1990 if (pages == 0) {
1991 break;
1993 pages_sent += pages;
1994 acct_info.iterations++;
1996 /* we want to check in the 1st loop, just in case it was the 1st time
1997 and we had to sync the dirty bitmap.
1998 qemu_clock_get_ns() is a bit expensive, so we only check once every
1999 few iterations
2001 if ((i & 63) == 0) {
2002 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2003 if (t1 > MAX_WAIT) {
2004 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2005 t1, i);
2006 break;
2009 i++;
2011 flush_compressed_data(f);
2012 rcu_read_unlock();
2015 * Must occur before EOS (or any QEMUFile operation)
2016 * because of RDMA protocol.
2018 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2020 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2021 bytes_transferred += 8;
2023 ret = qemu_file_get_error(f);
2024 if (ret < 0) {
2025 return ret;
2028 return pages_sent;
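/*
 * ram_save_complete: the final pass, run with the guest stopped.
 *
 * Syncs the dirty bitmap one last time (unless we are in postcopy, where
 * the remaining pages are served on demand) and flushes every remaining
 * dirty page regardless of any rate limit.
 */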
2031 /* Called with iothread lock */
2032 static int ram_save_complete(QEMUFile *f, void *opaque)
2034 rcu_read_lock();
2036 if (!migration_in_postcopy(migrate_get_current())) {
2037 migration_bitmap_sync();
2040 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2042 /* try transferring iterative blocks of memory */
2044 /* flush all remaining blocks regardless of rate limiting */
2045 while (true) {
2046 int pages;
2048 pages = ram_find_and_save_block(f, true, &bytes_transferred);
2050 /* no more blocks to send */
2050 if (pages == 0) {
2051 break;
2055 flush_compressed_data(f);
2056 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2058 rcu_read_unlock();
2060 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2062 return 0;
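/*
 * ram_save_pending: report how much RAM is still dirty.
 *
 * If the estimate is below max_size (and we are not in postcopy), the
 * dirty bitmap is re-synced under the iothread lock to refresh the
 * count.  All RAM is postcopiable, so the remainder is accounted in
 * *postcopiable_pending.
 */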
2065 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2066 uint64_t *non_postcopiable_pending,
2067 uint64_t *postcopiable_pending)
2069 uint64_t remaining_size;
2071 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2073 if (!migration_in_postcopy(migrate_get_current()) &&
2074 remaining_size < max_size) {
2075 qemu_mutex_lock_iothread();
2076 rcu_read_lock();
2077 migration_bitmap_sync();
2078 rcu_read_unlock();
2079 qemu_mutex_unlock_iothread();
2080 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2083 /* We can do postcopy, and all the data is postcopiable */
2084 *postcopiable_pending += remaining_size;
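/*
 * load_xbzrle: decode one XBZRLE-encoded page from the stream.
 *
 * Reads the one-byte encoding flag and 16-bit encoded length, pulls the
 * encoded data into a scratch buffer and decodes it on top of the
 * existing contents of 'host'.  Returns 0 on success, -1 on a malformed
 * page.
 */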
2087 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2089 unsigned int xh_len;
2090 int xh_flags;
2092 if (!xbzrle_decoded_buf) {
2093 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2096 /* extract RLE header */
2097 xh_flags = qemu_get_byte(f);
2098 xh_len = qemu_get_be16(f);
2100 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2101 error_report("Failed to load XBZRLE page - wrong compression!");
2102 return -1;
2105 if (xh_len > TARGET_PAGE_SIZE) {
2106 error_report("Failed to load XBZRLE page - len overflow!");
2107 return -1;
2109 /* load data and decode */
2110 qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
2112 /* decode RLE */
2113 if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
2114 TARGET_PAGE_SIZE) == -1) {
2115 error_report("Failed to load XBZRLE page - decode error!");
2116 return -1;
2119 return 0;
2122 /* Must be called from within an RCU critical section.
2123 * Returns a pointer from within the RCU-protected ram_list.
2126 * Read a RAMBlock ID from the stream f, find the host address of the
2127 * start of that block and add 'offset' to it.
2129 * f: Stream to read from
2130 * offset: Offset within the block
2131 * flags: Page flags (mostly to see if it's a continuation of the previous block)
2133 static inline void *host_from_stream_offset(QEMUFile *f,
2134 ram_addr_t offset,
2135 int flags)
2137 static RAMBlock *block = NULL;
2138 char id[256];
2139 uint8_t len;
2141 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2142 if (!block || block->max_length <= offset) {
2143 error_report("Ack, bad migration stream!");
2144 return NULL;
2147 return block->host + offset;
2150 len = qemu_get_byte(f);
2151 qemu_get_buffer(f, (uint8_t *)id, len);
2152 id[len] = 0;
2154 block = qemu_ram_block_by_name(id);
2155 if (block && block->max_length > offset) {
2156 return block->host + offset;
2159 error_report("Can't find block %s", id);
2160 return NULL;
2164 * If a page (or a whole RDMA chunk) has been
2165 * determined to be zero, then zap it.
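/* The memset is skipped when the destination range is already zero,
 * which avoids writing to (and typically faulting in) pages that are
 * still untouched on the destination. */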
2167 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2169 if (ch != 0 || !is_zero_range(host, size)) {
2170 memset(host, ch, size);
2174 static void *do_data_decompress(void *opaque)
2176 DecompressParam *param = opaque;
2177 unsigned long pagesize;
2179 while (!quit_decomp_thread) {
2180 qemu_mutex_lock(&param->mutex);
2181 while (!param->start && !quit_decomp_thread) {
2182 qemu_cond_wait(&param->cond, &param->mutex);
2183 pagesize = TARGET_PAGE_SIZE;
2184 if (!quit_decomp_thread) {
2185 /* uncompress() may fail in some cases, especially when the page
2186 * is dirtied while it is being compressed.  That is not a
2187 * problem, because the dirty page will be retransmitted and
2188 * uncompress() won't corrupt the data in other pages.
2190 uncompress((Bytef *)param->des, &pagesize,
2191 (const Bytef *)param->compbuf, param->len);
2193 param->start = false;
2195 qemu_mutex_unlock(&param->mutex);
2198 return NULL;
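/*
 * migrate_decompress_threads_create: spawn the decompression workers.
 *
 * One joinable thread per migrate_decompress_threads(), each with its
 * own mutex/condvar and a compressBound(TARGET_PAGE_SIZE)-sized buffer
 * that decompress_data_with_multi_threads() hands work into.
 */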
2201 void migrate_decompress_threads_create(void)
2203 int i, thread_count;
2205 thread_count = migrate_decompress_threads();
2206 decompress_threads = g_new0(QemuThread, thread_count);
2207 decomp_param = g_new0(DecompressParam, thread_count);
2208 compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2209 quit_decomp_thread = false;
2210 for (i = 0; i < thread_count; i++) {
2211 qemu_mutex_init(&decomp_param[i].mutex);
2212 qemu_cond_init(&decomp_param[i].cond);
2213 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2214 qemu_thread_create(decompress_threads + i, "decompress",
2215 do_data_decompress, decomp_param + i,
2216 QEMU_THREAD_JOINABLE);
2220 void migrate_decompress_threads_join(void)
2222 int i, thread_count;
2224 quit_decomp_thread = true;
2225 thread_count = migrate_decompress_threads();
2226 for (i = 0; i < thread_count; i++) {
2227 qemu_mutex_lock(&decomp_param[i].mutex);
2228 qemu_cond_signal(&decomp_param[i].cond);
2229 qemu_mutex_unlock(&decomp_param[i].mutex);
2231 for (i = 0; i < thread_count; i++) {
2232 qemu_thread_join(decompress_threads + i);
2233 qemu_mutex_destroy(&decomp_param[i].mutex);
2234 qemu_cond_destroy(&decomp_param[i].cond);
2235 g_free(decomp_param[i].compbuf);
2237 g_free(decompress_threads);
2238 g_free(decomp_param);
2239 g_free(compressed_data_buf);
2240 decompress_threads = NULL;
2241 decomp_param = NULL;
2242 compressed_data_buf = NULL;
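/*
 * decompress_data_with_multi_threads: hand one compressed page to an
 * idle worker.  Busy-polls the worker array until a thread with
 * param->start still false is found, copies the compressed data into
 * its buffer and kicks it via start_decompression().
 */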
2245 static void decompress_data_with_multi_threads(uint8_t *compbuf,
2246 void *host, int len)
2248 int idx, thread_count;
2250 thread_count = migrate_decompress_threads();
2251 while (true) {
2252 for (idx = 0; idx < thread_count; idx++) {
2253 if (!decomp_param[idx].start) {
2254 memcpy(decomp_param[idx].compbuf, compbuf, len);
2255 decomp_param[idx].des = host;
2256 decomp_param[idx].len = len;
2257 start_decompression(&decomp_param[idx]);
2258 break;
2261 if (idx < thread_count) {
2262 break;
2268 * Allocate data structures etc. needed by incoming migration with postcopy-ram;
2269 * postcopy-ram's similarly named postcopy_ram_incoming_init() does the work.
2271 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2273 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2275 return postcopy_ram_incoming_init(mis, ram_pages);
2279 * Called in postcopy mode by ram_load().
2280 * rcu_read_lock is taken prior to this being called.
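/* Incoming target pages are accumulated in a temporary host-sized page
 * and only 'placed' into guest memory once the last target page of that
 * host page has been read, so each host page appears atomically on the
 * destination. */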
2282 static int ram_load_postcopy(QEMUFile *f)
2284 int flags = 0, ret = 0;
2285 bool place_needed = false;
2286 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2287 MigrationIncomingState *mis = migration_incoming_get_current();
2288 /* Temporary page that is later 'placed' */
2289 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2290 void *last_host = NULL;
2291 bool all_zero = false;
2293 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2294 ram_addr_t addr;
2295 void *host = NULL;
2296 void *page_buffer = NULL;
2297 void *place_source = NULL;
2298 uint8_t ch;
2300 addr = qemu_get_be64(f);
2301 flags = addr & ~TARGET_PAGE_MASK;
2302 addr &= TARGET_PAGE_MASK;
2304 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2305 place_needed = false;
2306 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2307 host = host_from_stream_offset(f, addr, flags);
2308 if (!host) {
2309 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2310 ret = -EINVAL;
2311 break;
2313 page_buffer = host;
2315 * Postcopy requires that we place whole host pages atomically.
2316 * To make it atomic, the data is read into a temporary page
2317 * that's moved into place later.
2318 * The migration protocol uses, possibly smaller, target pages;
2319 * however, the source ensures it always sends all the components
2320 * of a host page in order.
2322 page_buffer = postcopy_host_page +
2323 ((uintptr_t)host & ~qemu_host_page_mask);
2324 /* If all target pages are zero then we can optimise the placement */
2325 if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2326 all_zero = true;
2327 } else {
2328 /* not the first target page within the host page */
2329 if (host != (last_host + TARGET_PAGE_SIZE)) {
2330 error_report("Non-sequential target page %p/%p\n",
2331 host, last_host);
2332 ret = -EINVAL;
2333 break;
2339 * If it's the last part of a host page then we place the host
2340 * page
2342 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2343 ~qemu_host_page_mask) == 0;
2344 place_source = postcopy_host_page;
2346 last_host = host;
2348 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2349 case RAM_SAVE_FLAG_COMPRESS:
2350 ch = qemu_get_byte(f);
2351 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2352 if (ch) {
2353 all_zero = false;
2355 break;
2357 case RAM_SAVE_FLAG_PAGE:
2358 all_zero = false;
2359 if (!place_needed || !matching_page_sizes) {
2360 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2361 } else {
2362 /* Avoid the intermediate copy out of the QEMUFile buffer: the page
2363 * will be copied again when it is placed, and reading in place is
2364 * only possible when the read is done in one go (matching page sizes).
2366 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2367 TARGET_PAGE_SIZE);
2369 break;
2370 case RAM_SAVE_FLAG_EOS:
2371 /* normal exit */
2372 break;
2373 default:
2374 error_report("Unknown combination of migration flags: %#x"
2375 " (postcopy mode)", flags);
2376 ret = -EINVAL;
2379 if (place_needed) {
2380 /* This gets called at the last target page in the host page */
2381 if (all_zero) {
2382 ret = postcopy_place_page_zero(mis,
2383 host + TARGET_PAGE_SIZE -
2384 qemu_host_page_size);
2385 } else {
2386 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2387 qemu_host_page_size,
2388 place_source);
2391 if (!ret) {
2392 ret = qemu_file_get_error(f);
2396 return ret;
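/*
 * ram_load: entry point for the incoming RAM section.
 *
 * Only stream version 4 is accepted.  Once the destination has entered
 * the postcopy listening phase, the work is delegated to
 * ram_load_postcopy(); otherwise each flag-tagged record (block table,
 * zero page, raw page, compressed page, XBZRLE page, hook, EOS) is
 * handled inline until EOS or an error.
 */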
2399 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2401 int flags = 0, ret = 0;
2402 static uint64_t seq_iter;
2403 int len = 0;
2405 * If the system is running in postcopy mode, page inserts into host memory
2406 * must be atomic.
2408 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2410 seq_iter++;
2412 if (version_id != 4) {
2413 ret = -EINVAL;
2416 /* This RCU critical section can be very long running.
2417 * When RCU reclaims in the code start to become numerous,
2418 * it will be necessary to reduce the granularity of this
2419 * critical section.
2421 rcu_read_lock();
2423 if (postcopy_running) {
2424 ret = ram_load_postcopy(f);
2427 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2428 ram_addr_t addr, total_ram_bytes;
2429 void *host = NULL;
2430 uint8_t ch;
2432 addr = qemu_get_be64(f);
2433 flags = addr & ~TARGET_PAGE_MASK;
2434 addr &= TARGET_PAGE_MASK;
2436 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2437 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2438 host = host_from_stream_offset(f, addr, flags);
2439 if (!host) {
2440 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2441 ret = -EINVAL;
2442 break;
2446 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2447 case RAM_SAVE_FLAG_MEM_SIZE:
2448 /* Synchronize RAM block list */
2449 total_ram_bytes = addr;
2450 while (!ret && total_ram_bytes) {
2451 RAMBlock *block;
2452 char id[256];
2453 ram_addr_t length;
2455 len = qemu_get_byte(f);
2456 qemu_get_buffer(f, (uint8_t *)id, len);
2457 id[len] = 0;
2458 length = qemu_get_be64(f);
2460 block = qemu_ram_block_by_name(id);
2461 if (block) {
2462 if (length != block->used_length) {
2463 Error *local_err = NULL;
2465 ret = qemu_ram_resize(block->offset, length,
2466 &local_err);
2467 if (local_err) {
2468 error_report_err(local_err);
2471 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2472 block->idstr);
2473 } else {
2474 error_report("Unknown ramblock \"%s\", cannot "
2475 "accept migration", id);
2476 ret = -EINVAL;
2479 total_ram_bytes -= length;
2481 break;
2483 case RAM_SAVE_FLAG_COMPRESS:
2484 ch = qemu_get_byte(f);
2485 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2486 break;
2488 case RAM_SAVE_FLAG_PAGE:
2489 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2490 break;
2492 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2493 len = qemu_get_be32(f);
2494 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2495 error_report("Invalid compressed data length: %d", len);
2496 ret = -EINVAL;
2497 break;
2499 qemu_get_buffer(f, compressed_data_buf, len);
2500 decompress_data_with_multi_threads(compressed_data_buf, host, len);
2501 break;
2503 case RAM_SAVE_FLAG_XBZRLE:
2504 if (load_xbzrle(f, addr, host) < 0) {
2505 error_report("Failed to decompress XBZRLE page at "
2506 RAM_ADDR_FMT, addr);
2507 ret = -EINVAL;
2508 break;
2510 break;
2511 case RAM_SAVE_FLAG_EOS:
2512 /* normal exit */
2513 break;
2514 default:
2515 if (flags & RAM_SAVE_FLAG_HOOK) {
2516 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2517 } else {
2518 error_report("Unknown combination of migration flags: %#x",
2519 flags);
2520 ret = -EINVAL;
2523 if (!ret) {
2524 ret = qemu_file_get_error(f);
2528 rcu_read_unlock();
2529 DPRINTF("Completed load of VM with exit code %d seq iteration "
2530 "%" PRIu64 "\n", ret, seq_iter);
2531 return ret;
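/*
 * The RAM migration is registered as a "live" savevm section (version 4)
 * in ram_mig_init(); these handlers implement the setup, iterate,
 * complete (precopy and postcopy), pending and load stages used above.
 */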
2534 static SaveVMHandlers savevm_ram_handlers = {
2535 .save_live_setup = ram_save_setup,
2536 .save_live_iterate = ram_save_iterate,
2537 .save_live_complete_postcopy = ram_save_complete,
2538 .save_live_complete_precopy = ram_save_complete,
2539 .save_live_pending = ram_save_pending,
2540 .load_state = ram_load,
2541 .cleanup = ram_migration_cleanup,
2544 void ram_mig_init(void)
2546 qemu_mutex_init(&XBZRLE.lock);
2547 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);