migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "migration/postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46
/* Debug tracing for this file: when DEBUG_MIGRATION_RAM is defined, DPRINTF
 * prints its arguments to stdout prefixed with "migration_ram: "; otherwise
 * it expands to an empty statement.
 */
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
  54
/* Used by the auto-converge check in migration_bitmap_sync(): bumped each
 * time the dirty rate is found to outpace the transfer rate, and reset to 0
 * whenever throttling is started or increased.
 */
static int dirty_rate_high_cnt;

/* Incremented at the start of every migration_bitmap_sync() call */
static uint64_t bitmap_sync_count;
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/* Flags OR'ed into the low bits of the page offset that save_page_header()
 * writes to the stream.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  71
/* Page content handed to cache_insert() by xbzrle_cache_zero_page() when a
 * page has been sent as all zeroes.  NOTE(review): where this buffer is
 * allocated is not visible in this part of the file.
 */
static uint8_t *ZERO_TARGET_PAGE;
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding; receives the output of
     * xbzrle_encode_buffer() in save_xbzrle_page()
     */
    uint8_t *encoded_buf;
    /* buffer for storing page content; save_xbzrle_page() copies the page
     * here before encoding it
     */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken by XBZRLE_cache_lock() only when XBZRLE is in use */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  93
  94 static void XBZRLE_cache_lock(void)
  95 {
  96     if (migrate_use_xbzrle())
  97         qemu_mutex_lock(&XBZRLE.lock);
  98 }
  99
 100 static void XBZRLE_cache_unlock(void)
 101 {
 102     if (migrate_use_xbzrle())
 103         qemu_mutex_unlock(&XBZRLE.lock);
 104 }
 105
 106 /*
 107  * called from qmp_migrate_set_cache_size in main thread, possibly while
 108  * a migration is in progress.
 109  * A running migration maybe using the cache and might finish during this
 110  * call, hence changes to the cache are protected by XBZRLE.lock().
 111  */
 112 int64_t xbzrle_cache_resize(int64_t new_size)
 113 {
 114     PageCache *new_cache;
 115     int64_t ret;
 116
 117     if (new_size < TARGET_PAGE_SIZE) {
 118         return -1;
 119     }
 120
 121     XBZRLE_cache_lock();
 122
 123     if (XBZRLE.cache != NULL) {
 124         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 125             goto out_new_size;
 126         }
 127         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 128                                         TARGET_PAGE_SIZE);
 129         if (!new_cache) {
 130             error_report("Error creating cache");
 131             ret = -1;
 132             goto out;
 133         }
 134
 135         cache_fini(XBZRLE.cache);
 136         XBZRLE.cache = new_cache;
 137     }
 138
 139 out_new_size:
 140     ret = pow2floor(new_size);
 141 out:
 142     XBZRLE_cache_unlock();
 143     return ret;
 144 }
 145
/* accounting for migration statistics */
typedef struct AccountingInfo {
    /* pages sent as zero pages, plus pages the control path reported as
     * handled with 0 bytes transmitted
     */
    uint64_t dup_pages;
    /* NOTE(review): not updated in the visible part of the file */
    uint64_t skipped_pages;
    /* pages sent in full or handed to a compression thread */
    uint64_t norm_pages;
    /* denominator of the XBZRLE cache miss rate; updated elsewhere */
    uint64_t iterations;
    /* bytes written for XBZRLE-encoded pages, headers included */
    uint64_t xbzrle_bytes;
    /* pages sent XBZRLE-encoded */
    uint64_t xbzrle_pages;
    /* lookups in save_xbzrle_page() that did not find the page cached */
    uint64_t xbzrle_cache_miss;
    /* misses per iteration since the previous rate update, computed in
     * migration_bitmap_sync()
     */
    double xbzrle_cache_miss_rate;
    /* times xbzrle_encode_buffer() returned -1 */
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;
 160
 161 static void acct_clear(void)
 162 {
 163     memset(&acct_info, 0, sizeof(acct_info));
 164 }
 165
 166 uint64_t dup_mig_bytes_transferred(void)
 167 {
 168     return acct_info.dup_pages * TARGET_PAGE_SIZE;
 169 }
 170
 171 uint64_t dup_mig_pages_transferred(void)
 172 {
 173     return acct_info.dup_pages;
 174 }
 175
 176 uint64_t skipped_mig_bytes_transferred(void)
 177 {
 178     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 179 }
 180
 181 uint64_t skipped_mig_pages_transferred(void)
 182 {
 183     return acct_info.skipped_pages;
 184 }
 185
 186 uint64_t norm_mig_bytes_transferred(void)
 187 {
 188     return acct_info.norm_pages * TARGET_PAGE_SIZE;
 189 }
 190
 191 uint64_t norm_mig_pages_transferred(void)
 192 {
 193     return acct_info.norm_pages;
 194 }
 195
 196 uint64_t xbzrle_mig_bytes_transferred(void)
 197 {
 198     return acct_info.xbzrle_bytes;
 199 }
 200
 201 uint64_t xbzrle_mig_pages_transferred(void)
 202 {
 203     return acct_info.xbzrle_pages;
 204 }
 205
 206 uint64_t xbzrle_mig_pages_cache_miss(void)
 207 {
 208     return acct_info.xbzrle_cache_miss;
 209 }
 210
 211 double xbzrle_mig_cache_miss_rate(void)
 212 {
 213     return acct_info.xbzrle_cache_miss_rate;
 214 }
 215
 216 uint64_t xbzrle_mig_pages_overflow(void)
 217 {
 218     return acct_info.xbzrle_overflows;
 219 }
 220
/* This is the last block that we have visited searching for dirty pages */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
/* Held by migration_bitmap_sync() while the dirty log is merged into the
 * migration bitmap
 */
static QemuMutex migration_bitmap_mutex;
/* Running count of dirty pages: increased by migration_bitmap_sync_range(),
 * decreased by migration_bitmap_clear_dirty()
 */
static uint64_t migration_dirty_pages;
static uint32_t last_version;
/* While set, migration_bitmap_find_dirty() walks pages sequentially instead
 * of searching the bitmap, and XBZRLE is not used
 */
static bool ram_bulk_stage;
 231
/* used by the search for pages to send; ram_save_page() reads the block and
 * offset of the page to transmit from here
 */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current offset to search from */
    ram_addr_t   offset;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 242
/* Migration bitmaps, published through the migration_bitmap_rcu pointer and
 * read with atomic_rcu_read() by the bitmap helpers in this file
 */
static struct BitmapRcu {
    /* NOTE(review): presumably used to defer freeing of a replaced bitmap
     * via RCU - the code doing so is not in the visible part of the file
     */
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;
 254
/* State of one compression worker thread (see do_data_compress()) */
struct CompressParam {
    /* Set to true by the worker, under comp_done_lock, after it compressed a
     * page; set to false by the migration thread when it hands out new work
     */
    bool done;
    /* Makes the worker loop exit; set by terminate_compression_threads() */
    bool quit;
    /* QEMUFile the worker writes the compressed page into; its content is
     * later copied to the migration stream with qemu_put_qemu_file()
     */
    QEMUFile *file;
    /* mutex is held when block/offset/quit are accessed; cond wakes the
     * worker up
     */
    QemuMutex mutex;
    QemuCond cond;
    /* Page to compress next; block == NULL means no work is pending */
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;
 265
/* State of one decompression worker thread.
 * NOTE(review): the code using this struct is not in the visible part of
 * the file; the fields presumably mirror CompressParam - confirm there.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
 276
/* Per-worker parameters and thread handles; both arrays are allocated in
 * migrate_compress_threads_create() and freed in
 * migrate_compress_threads_join()
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Set to true by migrate_compress_threads_create() */
static bool compression_switch;
/* Decompression-side state; not used in the visible part of the file */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined further down; declared here because do_data_compress() calls it */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 296
 297 static void *do_data_compress(void *opaque)
 298 {
 299     CompressParam *param = opaque;
 300     RAMBlock *block;
 301     ram_addr_t offset;
 302
 303     qemu_mutex_lock(&param->mutex);
 304     while (!param->quit) {
 305         if (param->block) {
 306             block = param->block;
 307             offset = param->offset;
 308             param->block = NULL;
 309             qemu_mutex_unlock(&param->mutex);
 310
 311             do_compress_ram_page(param->file, block, offset);
 312
 313             qemu_mutex_lock(&comp_done_lock);
 314             param->done = true;
 315             qemu_cond_signal(&comp_done_cond);
 316             qemu_mutex_unlock(&comp_done_lock);
 317
 318             qemu_mutex_lock(&param->mutex);
 319         } else {
 320             qemu_cond_wait(&param->cond, &param->mutex);
 321         }
 322     }
 323     qemu_mutex_unlock(&param->mutex);
 324
 325     return NULL;
 326 }
 327
 328 static inline void terminate_compression_threads(void)
 329 {
 330     int idx, thread_count;
 331
 332     thread_count = migrate_compress_threads();
 333     for (idx = 0; idx < thread_count; idx++) {
 334         qemu_mutex_lock(&comp_param[idx].mutex);
 335         comp_param[idx].quit = true;
 336         qemu_cond_signal(&comp_param[idx].cond);
 337         qemu_mutex_unlock(&comp_param[idx].mutex);
 338     }
 339 }
 340
 341 void migrate_compress_threads_join(void)
 342 {
 343     int i, thread_count;
 344
 345     if (!migrate_use_compression()) {
 346         return;
 347     }
 348     terminate_compression_threads();
 349     thread_count = migrate_compress_threads();
 350     for (i = 0; i < thread_count; i++) {
 351         qemu_thread_join(compress_threads + i);
 352         qemu_fclose(comp_param[i].file);
 353         qemu_mutex_destroy(&comp_param[i].mutex);
 354         qemu_cond_destroy(&comp_param[i].cond);
 355     }
 356     qemu_mutex_destroy(&comp_done_lock);
 357     qemu_cond_destroy(&comp_done_cond);
 358     g_free(compress_threads);
 359     g_free(comp_param);
 360     compress_threads = NULL;
 361     comp_param = NULL;
 362 }
 363
 364 void migrate_compress_threads_create(void)
 365 {
 366     int i, thread_count;
 367
 368     if (!migrate_use_compression()) {
 369         return;
 370     }
 371     compression_switch = true;
 372     thread_count = migrate_compress_threads();
 373     compress_threads = g_new0(QemuThread, thread_count);
 374     comp_param = g_new0(CompressParam, thread_count);
 375     qemu_cond_init(&comp_done_cond);
 376     qemu_mutex_init(&comp_done_lock);
 377     for (i = 0; i < thread_count; i++) {
 378         /* comp_param[i].file is just used as a dummy buffer to save data,
 379          * set its ops to empty.
 380          */
 381         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 382         comp_param[i].done = true;
 383         comp_param[i].quit = false;
 384         qemu_mutex_init(&comp_param[i].mutex);
 385         qemu_cond_init(&comp_param[i].cond);
 386         qemu_thread_create(compress_threads + i, "compress",
 387                            do_data_compress, comp_param + i,
 388                            QEMU_THREAD_JOINABLE);
 389     }
 390 }
 391
 392 /**
 393  * save_page_header: Write page header to wire
 394  *
 395  * If this is the 1st block, it also writes the block identification
 396  *
 397  * Returns: Number of bytes written
 398  *
 399  * @f: QEMUFile where to send the data
 400  * @block: block that contains the page we want to send
 401  * @offset: offset inside the block for the page
 402  *          in the lower bits, it contains flags
 403  */
 404 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 405 {
 406     size_t size, len;
 407
 408     qemu_put_be64(f, offset);
 409     size = 8;
 410
 411     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 412         len = strlen(block->idstr);
 413         qemu_put_byte(f, len);
 414         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 415         size += 1 + len;
 416     }
 417     return size;
 418 }
 419
 420 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 421  * If guest dirty memory rate is reduced below the rate at which we can
 422  * transfer pages to the destination then we should be able to complete
 423  * migration. Some workloads dirty memory way too fast and will not effectively
 424  * converge, even with auto-converge.
 425  */
 426 static void mig_throttle_guest_down(void)
 427 {
 428     MigrationState *s = migrate_get_current();
 429     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 430     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 431
 432     /* We have not started throttling yet. Let's start it. */
 433     if (!cpu_throttle_active()) {
 434         cpu_throttle_set(pct_initial);
 435     } else {
 436         /* Throttling already on, just increase the rate */
 437         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 438     }
 439 }
 440
 441 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
 442  * The important thing is that a stale (not-yet-0'd) page be replaced
 443  * by the new data.
 444  * As a bonus, if the page wasn't in the cache it gets added so that
 445  * when a small write is made into the 0'd page it gets XBZRLE sent
 446  */
 447 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 448 {
 449     if (ram_bulk_stage || !migrate_use_xbzrle()) {
 450         return;
 451     }
 452
 453     /* We don't care if this fails to allocate a new cache page
 454      * as long as it updated an old one */
 455     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 456                  bitmap_sync_count);
 457 }
 458
/* Byte written right after the page header of an XBZRLE-encoded page
 * (see save_xbzrle_page())
 */
#define ENCODING_FLAG_XBZRLE 0x1
 460
 461 /**
 462  * save_xbzrle_page: compress and send current page
 463  *
 464  * Returns: 1 means that we wrote the page
 465  *          0 means that page is identical to the one already sent
 466  *          -1 means that xbzrle would be longer than normal
 467  *
 468  * @f: QEMUFile where to send the data
 469  * @current_data:
 470  * @current_addr:
 471  * @block: block that contains the page we want to send
 472  * @offset: offset inside the block for the page
 473  * @last_stage: if we are at the completion stage
 474  * @bytes_transferred: increase it with the number of transferred bytes
 475  */
 476 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
 477                             ram_addr_t current_addr, RAMBlock *block,
 478                             ram_addr_t offset, bool last_stage,
 479                             uint64_t *bytes_transferred)
 480 {
 481     int encoded_len = 0, bytes_xbzrle;
 482     uint8_t *prev_cached_page;
 483
 484     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
 485         acct_info.xbzrle_cache_miss++;
 486         if (!last_stage) {
 487             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 488                              bitmap_sync_count) == -1) {
 489                 return -1;
 490             } else {
 491                 /* update *current_data when the page has been
 492                    inserted into cache */
 493                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 494             }
 495         }
 496         return -1;
 497     }
 498
 499     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 500
 501     /* save current buffer into memory */
 502     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 503
 504     /* XBZRLE encoding (if there is no overflow) */
 505     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 506                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 507                                        TARGET_PAGE_SIZE);
 508     if (encoded_len == 0) {
 509         DPRINTF("Skipping unmodified page\n");
 510         return 0;
 511     } else if (encoded_len == -1) {
 512         DPRINTF("Overflow\n");
 513         acct_info.xbzrle_overflows++;
 514         /* update data in the cache */
 515         if (!last_stage) {
 516             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 517             *current_data = prev_cached_page;
 518         }
 519         return -1;
 520     }
 521
 522     /* we need to update the data in the cache, in order to get the same data */
 523     if (!last_stage) {
 524         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 525     }
 526
 527     /* Send XBZRLE based compressed page */
 528     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 529     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 530     qemu_put_be16(f, encoded_len);
 531     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 532     bytes_xbzrle += encoded_len + 1 + 2;
 533     acct_info.xbzrle_pages++;
 534     acct_info.xbzrle_bytes += bytes_xbzrle;
 535     *bytes_transferred += bytes_xbzrle;
 536
 537     return 1;
 538 }
 539
 540 /* Called with rcu_read_lock() to protect migration_bitmap
 541  * rb: The RAMBlock  to search for dirty pages in
 542  * start: Start address (typically so we can continue from previous page)
 543  * ram_addr_abs: Pointer into which to store the address of the dirty page
 544  *               within the global ram_addr space
 545  *
 546  * Returns: byte offset within memory region of the start of a dirty page
 547  */
 548 static inline
 549 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 550                                        ram_addr_t start,
 551                                        ram_addr_t *ram_addr_abs)
 552 {
 553     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 554     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 555     uint64_t rb_size = rb->used_length;
 556     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 557     unsigned long *bitmap;
 558
 559     unsigned long next;
 560
 561     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 562     if (ram_bulk_stage && nr > base) {
 563         next = nr + 1;
 564     } else {
 565         next = find_next_bit(bitmap, size, nr);
 566     }
 567
 568     *ram_addr_abs = next << TARGET_PAGE_BITS;
 569     return (next - base) << TARGET_PAGE_BITS;
 570 }
 571
 572 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 573 {
 574     bool ret;
 575     int nr = addr >> TARGET_PAGE_BITS;
 576     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 577
 578     ret = test_and_clear_bit(nr, bitmap);
 579
 580     if (ret) {
 581         migration_dirty_pages--;
 582     }
 583     return ret;
 584 }
 585
 586 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 587 {
 588     unsigned long *bitmap;
 589     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 590     migration_dirty_pages +=
 591         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 592 }
 593
/* Fix me: there are too many global variables used in migration process. */
/* State kept between calls of migration_bitmap_sync(); all of it is reset
 * by migration_bitmap_sync_init()
 */
/* start of the current measurement period, in ms of QEMU_CLOCK_REALTIME */
static int64_t start_time;
/* ram_bytes_transferred() sample the auto-converge check compares against */
static int64_t bytes_xfer_prev;
/* pages dirtied since the current measurement period began */
static int64_t num_dirty_pages_period;
/* acct_info values at the previous XBZRLE miss-rate update */
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;
 600
 601 static void migration_bitmap_sync_init(void)
 602 {
 603     start_time = 0;
 604     bytes_xfer_prev = 0;
 605     num_dirty_pages_period = 0;
 606     xbzrle_cache_miss_prev = 0;
 607     iterations_prev = 0;
 608 }
 609
 610 static void migration_bitmap_sync(void)
 611 {
 612     RAMBlock *block;
 613     uint64_t num_dirty_pages_init = migration_dirty_pages;
 614     MigrationState *s = migrate_get_current();
 615     int64_t end_time;
 616     int64_t bytes_xfer_now;
 617
 618     bitmap_sync_count++;
 619
 620     if (!bytes_xfer_prev) {
 621         bytes_xfer_prev = ram_bytes_transferred();
 622     }
 623
 624     if (!start_time) {
 625         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 626     }
 627
 628     trace_migration_bitmap_sync_start();
 629     memory_global_dirty_log_sync();
 630
 631     qemu_mutex_lock(&migration_bitmap_mutex);
 632     rcu_read_lock();
 633     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 634         migration_bitmap_sync_range(block->offset, block->used_length);
 635     }
 636     rcu_read_unlock();
 637     qemu_mutex_unlock(&migration_bitmap_mutex);
 638
 639     trace_migration_bitmap_sync_end(migration_dirty_pages
 640                                     - num_dirty_pages_init);
 641     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 642     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 643
 644     /* more than 1 second = 1000 millisecons */
 645     if (end_time > start_time + 1000) {
 646         if (migrate_auto_converge()) {
 647             /* The following detection logic can be refined later. For now:
 648                Check to see if the dirtied bytes is 50% more than the approx.
 649                amount of bytes that just got transferred since the last time we
 650                were in this routine. If that happens twice, start or increase
 651                throttling */
 652             bytes_xfer_now = ram_bytes_transferred();
 653
 654             if (s->dirty_pages_rate &&
 655                (num_dirty_pages_period * TARGET_PAGE_SIZE >
 656                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
 657                (dirty_rate_high_cnt++ >= 2)) {
 658                     trace_migration_throttle();
 659                     dirty_rate_high_cnt = 0;
 660                     mig_throttle_guest_down();
 661              }
 662              bytes_xfer_prev = bytes_xfer_now;
 663         }
 664
 665         if (migrate_use_xbzrle()) {
 666             if (iterations_prev != acct_info.iterations) {
 667                 acct_info.xbzrle_cache_miss_rate =
 668                    (double)(acct_info.xbzrle_cache_miss -
 669                             xbzrle_cache_miss_prev) /
 670                    (acct_info.iterations - iterations_prev);
 671             }
 672             iterations_prev = acct_info.iterations;
 673             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 674         }
 675         s->dirty_pages_rate = num_dirty_pages_period * 1000
 676             / (end_time - start_time);
 677         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 678         start_time = end_time;
 679         num_dirty_pages_period = 0;
 680     }
 681     s->dirty_sync_count = bitmap_sync_count;
 682     if (migrate_use_events()) {
 683         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 684     }
 685 }
 686
 687 /**
 688  * save_zero_page: Send the zero page to the stream
 689  *
 690  * Returns: Number of pages written.
 691  *
 692  * @f: QEMUFile where to send the data
 693  * @block: block that contains the page we want to send
 694  * @offset: offset inside the block for the page
 695  * @p: pointer to the page
 696  * @bytes_transferred: increase it with the number of transferred bytes
 697  */
 698 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 699                           uint8_t *p, uint64_t *bytes_transferred)
 700 {
 701     int pages = -1;
 702
 703     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 704         acct_info.dup_pages++;
 705         *bytes_transferred += save_page_header(f, block,
 706                                                offset | RAM_SAVE_FLAG_COMPRESS);
 707         qemu_put_byte(f, 0);
 708         *bytes_transferred += 1;
 709         pages = 1;
 710     }
 711
 712     return pages;
 713 }
 714
 715 /**
 716  * ram_save_page: Send the given page to the stream
 717  *
 718  * Returns: Number of pages written.
 719  *          < 0 - error
 720  *          >=0 - Number of pages written - this might legally be 0
 721  *                if xbzrle noticed the page was the same.
 722  *
 723  * @f: QEMUFile where to send the data
 724  * @block: block that contains the page we want to send
 725  * @offset: offset inside the block for the page
 726  * @last_stage: if we are at the completion stage
 727  * @bytes_transferred: increase it with the number of transferred bytes
 728  */
 729 static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
 730                          bool last_stage, uint64_t *bytes_transferred)
 731 {
 732     int pages = -1;
 733     uint64_t bytes_xmit;
 734     ram_addr_t current_addr;
 735     uint8_t *p;
 736     int ret;
 737     bool send_async = true;
 738     RAMBlock *block = pss->block;
 739     ram_addr_t offset = pss->offset;
 740
 741     p = block->host + offset;
 742
 743     /* In doubt sent page as normal */
 744     bytes_xmit = 0;
 745     ret = ram_control_save_page(f, block->offset,
 746                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 747     if (bytes_xmit) {
 748         *bytes_transferred += bytes_xmit;
 749         pages = 1;
 750     }
 751
 752     XBZRLE_cache_lock();
 753
 754     current_addr = block->offset + offset;
 755
 756     if (block == last_sent_block) {
 757         offset |= RAM_SAVE_FLAG_CONTINUE;
 758     }
 759     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 760         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 761             if (bytes_xmit > 0) {
 762                 acct_info.norm_pages++;
 763             } else if (bytes_xmit == 0) {
 764                 acct_info.dup_pages++;
 765             }
 766         }
 767     } else {
 768         pages = save_zero_page(f, block, offset, p, bytes_transferred);
 769         if (pages > 0) {
 770             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 771              * page would be stale
 772              */
 773             xbzrle_cache_zero_page(current_addr);
 774         } else if (!ram_bulk_stage &&
 775                    !migration_in_postcopy(migrate_get_current()) &&
 776                    migrate_use_xbzrle()) {
 777             pages = save_xbzrle_page(f, &p, current_addr, block,
 778                                      offset, last_stage, bytes_transferred);
 779             if (!last_stage) {
 780                 /* Can't send this cached data async, since the cache page
 781                  * might get updated before it gets to the wire
 782                  */
 783                 send_async = false;
 784             }
 785         }
 786     }
 787
 788     /* XBZRLE overflow or normal page */
 789     if (pages == -1) {
 790         *bytes_transferred += save_page_header(f, block,
 791                                                offset | RAM_SAVE_FLAG_PAGE);
 792         if (send_async) {
 793             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
 794         } else {
 795             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 796         }
 797         *bytes_transferred += TARGET_PAGE_SIZE;
 798         pages = 1;
 799         acct_info.norm_pages++;
 800     }
 801
 802     XBZRLE_cache_unlock();
 803
 804     return pages;
 805 }
 806
 807 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 808                                 ram_addr_t offset)
 809 {
 810     int bytes_sent, blen;
 811     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 812
 813     bytes_sent = save_page_header(f, block, offset |
 814                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 815     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 816                                      migrate_compress_level());
 817     if (blen < 0) {
 818         bytes_sent = 0;
 819         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 820         error_report("compressed data failed!");
 821     } else {
 822         bytes_sent += blen;
 823     }
 824
 825     return bytes_sent;
 826 }
 827
/* File-level byte counter; flush_compressed_data() adds the compressed bytes
 * it copies to the stream.  Functions defined after this point that take a
 * parameter of the same name shadow it.
 */
static uint64_t bytes_transferred;
 829
 830 static void flush_compressed_data(QEMUFile *f)
 831 {
 832     int idx, len, thread_count;
 833
 834     if (!migrate_use_compression()) {
 835         return;
 836     }
 837     thread_count = migrate_compress_threads();
 838
 839     qemu_mutex_lock(&comp_done_lock);
 840     for (idx = 0; idx < thread_count; idx++) {
 841         while (!comp_param[idx].done) {
 842             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 843         }
 844     }
 845     qemu_mutex_unlock(&comp_done_lock);
 846
 847     for (idx = 0; idx < thread_count; idx++) {
 848         qemu_mutex_lock(&comp_param[idx].mutex);
 849         if (!comp_param[idx].quit) {
 850             len = qemu_put_qemu_file(f, comp_param[idx].file);
 851             bytes_transferred += len;
 852         }
 853         qemu_mutex_unlock(&comp_param[idx].mutex);
 854     }
 855 }
 856
 857 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 858                                        ram_addr_t offset)
 859 {
 860     param->block = block;
 861     param->offset = offset;
 862 }
 863
 864 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 865                                            ram_addr_t offset,
 866                                            uint64_t *bytes_transferred)
 867 {
 868     int idx, thread_count, bytes_xmit = -1, pages = -1;
 869
 870     thread_count = migrate_compress_threads();
 871     qemu_mutex_lock(&comp_done_lock);
 872     while (true) {
 873         for (idx = 0; idx < thread_count; idx++) {
 874             if (comp_param[idx].done) {
 875                 comp_param[idx].done = false;
 876                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 877                 qemu_mutex_lock(&comp_param[idx].mutex);
 878                 set_compress_params(&comp_param[idx], block, offset);
 879                 qemu_cond_signal(&comp_param[idx].cond);
 880                 qemu_mutex_unlock(&comp_param[idx].mutex);
 881                 pages = 1;
 882                 acct_info.norm_pages++;
 883                 *bytes_transferred += bytes_xmit;
 884                 break;
 885             }
 886         }
 887         if (pages > 0) {
 888             break;
 889         } else {
 890             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 891         }
 892     }
 893     qemu_mutex_unlock(&comp_done_lock);
 894
 895     return pages;
 896 }
 897
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.  The initial value -1 is returned
 *          unchanged when ram_control_save_page() handled the page
 *          without reporting any bytes, or when the inline compression
 *          of the first page of a block failed.
 *
 * @f: QEMUFile where to send the data
 * @pss: search status; pss->block is the block that contains the page and
 *       pss->offset the offset of the page inside that block
 * @last_stage: if we are at the completion stage (not used in this function)
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
                                    bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    /* Host address of the page */
    p = block->host + offset;

    /* Give the control hook the first chance to handle the page */
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        /* The hook took the page; only the accounting is left to do */
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            /* -1 from save_zero_page() means the page was not sent by it */
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    *bytes_transferred += bytes_xmit + blen;
                    acct_info.norm_pages++;
                    pages = 1;
                } else {
                    /* Record the failure on the stream; pages stays -1 */
                    qemu_file_set_error(f, blen);
                    error_report("compressed data failed!");
                }
            }
        } else {
            /* Same block as the last page sent: mark it as a continuation */
            offset |= RAM_SAVE_FLAG_CONTINUE;
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}
 973
 974 /*
 975  * Find the next dirty page and update any state associated with
 976  * the search process.
 977  *
 978  * Returns: True if a page is found
 979  *
 980  * @f: Current migration stream.
 981  * @pss: Data about the state of the current dirty page scan.
 982  * @*again: Set to false if the search has scanned the whole of RAM
 983  * *ram_addr_abs: Pointer into which to store the address of the dirty page
 984  *               within the global ram_addr space
 985  */
 986 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
 987                              bool *again, ram_addr_t *ram_addr_abs)
 988 {
 989     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
 990                                               ram_addr_abs);
 991     if (pss->complete_round && pss->block == last_seen_block &&
 992         pss->offset >= last_offset) {
 993         /*
 994          * We've been once around the RAM and haven't found anything.
 995          * Give up.
 996          */
 997         *again = false;
 998         return false;
 999     }
1000     if (pss->offset >= pss->block->used_length) {
1001         /* Didn't find anything in this RAM Block */
1002         pss->offset = 0;
1003         pss->block = QLIST_NEXT_RCU(pss->block, next);
1004         if (!pss->block) {
1005             /* Hit the end of the list */
1006             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1007             /* Flag that we've looped */
1008             pss->complete_round = true;
1009             ram_bulk_stage = false;
1010             if (migrate_use_xbzrle()) {
1011                 /* If xbzrle is on, stop using the data compression at this
1012                  * point. In theory, xbzrle can do better than compression.
1013                  */
1014                 flush_compressed_data(f);
1015                 compression_switch = false;
1016             }
1017         }
1018         /* Didn't find anything this time, but try again on the new block */
1019         *again = true;
1020         return false;
1021     } else {
1022         /* Can go around again, but... */
1023         *again = true;
1024         /* We've found something so probably don't need to */
1025         return true;
1026     }
1027 }
1028
1029 /*
1030  * Helper for 'get_queued_page' - gets a page off the queue
1031  *      ms:      MigrationState in
1032  * *offset:      Used to return the offset within the RAMBlock
1033  * ram_addr_abs: global offset in the dirty/sent bitmaps
1034  *
1035  * Returns:      block (or NULL if none available)
1036  */
1037 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1038                               ram_addr_t *ram_addr_abs)
1039 {
1040     RAMBlock *block = NULL;
1041
1042     qemu_mutex_lock(&ms->src_page_req_mutex);
1043     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1044         struct MigrationSrcPageRequest *entry =
1045                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1046         block = entry->rb;
1047         *offset = entry->offset;
1048         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1049                         TARGET_PAGE_MASK;
1050
1051         if (entry->len > TARGET_PAGE_SIZE) {
1052             entry->len -= TARGET_PAGE_SIZE;
1053             entry->offset += TARGET_PAGE_SIZE;
1054         } else {
1055             memory_region_unref(block->mr);
1056             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1057             g_free(entry);
1058         }
1059     }
1060     qemu_mutex_unlock(&ms->src_page_req_mutex);
1061
1062     return block;
1063 }
1064
1065 /*
1066  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1067  * that are already sent (!dirty)
1068  *
1069  *      ms:      MigrationState in
1070  *     pss:      PageSearchStatus structure updated with found block/offset
1071  * ram_addr_abs: global offset in the dirty/sent bitmaps
1072  *
1073  * Returns:      true if a queued page is found
1074  */
1075 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1076                             ram_addr_t *ram_addr_abs)
1077 {
1078     RAMBlock  *block;
1079     ram_addr_t offset;
1080     bool dirty;
1081
1082     do {
1083         block = unqueue_page(ms, &offset, ram_addr_abs);
1084         /*
1085          * We're sending this page, and since it's postcopy nothing else
1086          * will dirty it, and we must make sure it doesn't get sent again
1087          * even if this queue request was received after the background
1088          * search already sent it.
1089          */
1090         if (block) {
1091             unsigned long *bitmap;
1092             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1093             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1094             if (!dirty) {
1095                 trace_get_queued_page_not_dirty(
1096                     block->idstr, (uint64_t)offset,
1097                     (uint64_t)*ram_addr_abs,
1098                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1099                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1100             } else {
1101                 trace_get_queued_page(block->idstr,
1102                                       (uint64_t)offset,
1103                                       (uint64_t)*ram_addr_abs);
1104             }
1105         }
1106
1107     } while (block && !dirty);
1108
1109     if (block) {
1110         /*
1111          * As soon as we start servicing pages out of order, then we have
1112          * to kill the bulk stage, since the bulk stage assumes
1113          * in (migration_bitmap_find_and_reset_dirty) that every page is
1114          * dirty, that's no longer true.
1115          */
1116         ram_bulk_stage = false;
1117
1118         /*
1119          * We want the background search to continue from the queued page
1120          * since the guest is likely to want other pages near to the page
1121          * it just requested.
1122          */
1123         pss->block = block;
1124         pss->offset = offset;
1125     }
1126
1127     return !!block;
1128 }
1129
1130 /**
1131  * flush_page_queue: Flush any remaining pages in the ram request queue
1132  *    it should be empty at the end anyway, but in error cases there may be
1133  *    some left.
1134  *
1135  * ms: MigrationState
1136  */
1137 void flush_page_queue(MigrationState *ms)
1138 {
1139     struct MigrationSrcPageRequest *mspr, *next_mspr;
1140     /* This queue generally should be empty - but in the case of a failed
1141      * migration might have some droppings in.
1142      */
1143     rcu_read_lock();
1144     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1145         memory_region_unref(mspr->rb->mr);
1146         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1147         g_free(mspr);
1148     }
1149     rcu_read_unlock();
1150 }
1151
1152 /**
1153  * Queue the pages for transmission, e.g. a request from postcopy destination
1154  *   ms: MigrationStatus in which the queue is held
1155  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1156  *   start: Offset from the start of the RAMBlock
1157  *   len: Length (in bytes) to send
1158  *   Return: 0 on success
1159  */
1160 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1161                          ram_addr_t start, ram_addr_t len)
1162 {
1163     RAMBlock *ramblock;
1164
1165     ms->postcopy_requests++;
1166     rcu_read_lock();
1167     if (!rbname) {
1168         /* Reuse last RAMBlock */
1169         ramblock = ms->last_req_rb;
1170
1171         if (!ramblock) {
1172             /*
1173              * Shouldn't happen, we can't reuse the last RAMBlock if
1174              * it's the 1st request.
1175              */
1176             error_report("ram_save_queue_pages no previous block");
1177             goto err;
1178         }
1179     } else {
1180         ramblock = qemu_ram_block_by_name(rbname);
1181
1182         if (!ramblock) {
1183             /* We shouldn't be asked for a non-existent RAMBlock */
1184             error_report("ram_save_queue_pages no block '%s'", rbname);
1185             goto err;
1186         }
1187         ms->last_req_rb = ramblock;
1188     }
1189     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1190     if (start+len > ramblock->used_length) {
1191         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1192                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1193                      __func__, start, len, ramblock->used_length);
1194         goto err;
1195     }
1196
1197     struct MigrationSrcPageRequest *new_entry =
1198         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1199     new_entry->rb = ramblock;
1200     new_entry->offset = start;
1201     new_entry->len = len;
1202
1203     memory_region_ref(ramblock->mr);
1204     qemu_mutex_lock(&ms->src_page_req_mutex);
1205     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1206     qemu_mutex_unlock(&ms->src_page_req_mutex);
1207     rcu_read_unlock();
1208
1209     return 0;
1210
1211 err:
1212     rcu_read_unlock();
1213     return -1;
1214 }
1215
1216 /**
1217  * ram_save_target_page: Save one target page
1218  *
1219  *
1220  * @f: QEMUFile where to send the data
1221  * @block: pointer to block that contains the page we want to send
1222  * @offset: offset inside the block for the page;
1223  * @last_stage: if we are at the completion stage
1224  * @bytes_transferred: increase it with the number of transferred bytes
1225  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1226  *
1227  * Returns: Number of pages written.
1228  */
1229 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1230                                 PageSearchStatus *pss,
1231                                 bool last_stage,
1232                                 uint64_t *bytes_transferred,
1233                                 ram_addr_t dirty_ram_abs)
1234 {
1235     int res = 0;
1236
1237     /* Check the pages is dirty and if it is send it */
1238     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1239         unsigned long *unsentmap;
1240         if (compression_switch && migrate_use_compression()) {
1241             res = ram_save_compressed_page(f, pss,
1242                                            last_stage,
1243                                            bytes_transferred);
1244         } else {
1245             res = ram_save_page(f, pss, last_stage,
1246                                 bytes_transferred);
1247         }
1248
1249         if (res < 0) {
1250             return res;
1251         }
1252         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1253         if (unsentmap) {
1254             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1255         }
1256         /* Only update last_sent_block if a block was actually sent; xbzrle
1257          * might have decided the page was identical so didn't bother writing
1258          * to the stream.
1259          */
1260         if (res > 0) {
1261             last_sent_block = pss->block;
1262         }
1263     }
1264
1265     return res;
1266 }
1267
1268 /**
1269  * ram_save_host_page: Starting at *offset send pages up to the end
1270  *                     of the current host page.  It's valid for the initial
1271  *                     offset to point into the middle of a host page
1272  *                     in which case the remainder of the hostpage is sent.
1273  *                     Only dirty target pages are sent.
1274  *
1275  * Returns: Number of pages written.
1276  *
1277  * @f: QEMUFile where to send the data
1278  * @block: pointer to block that contains the page we want to send
1279  * @offset: offset inside the block for the page; updated to last target page
1280  *          sent
1281  * @last_stage: if we are at the completion stage
1282  * @bytes_transferred: increase it with the number of transferred bytes
1283  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1284  */
1285 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1286                               PageSearchStatus *pss,
1287                               bool last_stage,
1288                               uint64_t *bytes_transferred,
1289                               ram_addr_t dirty_ram_abs)
1290 {
1291     int tmppages, pages = 0;
1292     do {
1293         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1294                                         bytes_transferred, dirty_ram_abs);
1295         if (tmppages < 0) {
1296             return tmppages;
1297         }
1298
1299         pages += tmppages;
1300         pss->offset += TARGET_PAGE_SIZE;
1301         dirty_ram_abs += TARGET_PAGE_SIZE;
1302     } while (pss->offset & (qemu_host_page_size - 1));
1303
1304     /* The offset we leave with is the last one we looked at */
1305     pss->offset -= TARGET_PAGE_SIZE;
1306     return pages;
1307 }
1308
1309 /**
1310  * ram_find_and_save_block: Finds a dirty page and sends it to f
1311  *
1312  * Called within an RCU critical section.
1313  *
1314  * Returns:  The number of pages written
1315  *           0 means no dirty pages
1316  *
1317  * @f: QEMUFile where to send the data
1318  * @last_stage: if we are at the completion stage
1319  * @bytes_transferred: increase it with the number of transferred bytes
1320  *
1321  * On systems where host-page-size > target-page-size it will send all the
1322  * pages in a host page that are dirty.
1323  */
1324
1325 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1326                                    uint64_t *bytes_transferred)
1327 {
1328     PageSearchStatus pss;
1329     MigrationState *ms = migrate_get_current();
1330     int pages = 0;
1331     bool again, found;
1332     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1333                                  ram_addr_t space */
1334
1335     pss.block = last_seen_block;
1336     pss.offset = last_offset;
1337     pss.complete_round = false;
1338
1339     if (!pss.block) {
1340         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1341     }
1342
1343     do {
1344         again = true;
1345         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1346
1347         if (!found) {
1348             /* priority queue empty, so just search for something dirty */
1349             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1350         }
1351
1352         if (found) {
1353             pages = ram_save_host_page(ms, f, &pss,
1354                                        last_stage, bytes_transferred,
1355                                        dirty_ram_abs);
1356         }
1357     } while (!pages && again);
1358
1359     last_seen_block = pss.block;
1360     last_offset = pss.offset;
1361
1362     return pages;
1363 }
1364
1365 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1366 {
1367     uint64_t pages = size / TARGET_PAGE_SIZE;
1368     if (zero) {
1369         acct_info.dup_pages += pages;
1370     } else {
1371         acct_info.norm_pages += pages;
1372         bytes_transferred += size;
1373         qemu_update_position(f, size);
1374     }
1375 }
1376
1377 static ram_addr_t ram_save_remaining(void)
1378 {
1379     return migration_dirty_pages;
1380 }
1381
1382 uint64_t ram_bytes_remaining(void)
1383 {
1384     return ram_save_remaining() * TARGET_PAGE_SIZE;
1385 }
1386
1387 uint64_t ram_bytes_transferred(void)
1388 {
1389     return bytes_transferred;
1390 }
1391
1392 uint64_t ram_bytes_total(void)
1393 {
1394     RAMBlock *block;
1395     uint64_t total = 0;
1396
1397     rcu_read_lock();
1398     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1399         total += block->used_length;
1400     rcu_read_unlock();
1401     return total;
1402 }
1403
1404 void free_xbzrle_decoded_buf(void)
1405 {
1406     g_free(xbzrle_decoded_buf);
1407     xbzrle_decoded_buf = NULL;
1408 }
1409
1410 static void migration_bitmap_free(struct BitmapRcu *bmap)
1411 {
1412     g_free(bmap->bmap);
1413     g_free(bmap->unsentmap);
1414     g_free(bmap);
1415 }
1416
1417 static void ram_migration_cleanup(void *opaque)
1418 {
1419     /* caller have hold iothread lock or is in a bh, so there is
1420      * no writing race against this migration_bitmap
1421      */
1422     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1423     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1424     if (bitmap) {
1425         memory_global_dirty_log_stop();
1426         call_rcu(bitmap, migration_bitmap_free, rcu);
1427     }
1428
1429     XBZRLE_cache_lock();
1430     if (XBZRLE.cache) {
1431         cache_fini(XBZRLE.cache);
1432         g_free(XBZRLE.encoded_buf);
1433         g_free(XBZRLE.current_buf);
1434         g_free(ZERO_TARGET_PAGE);
1435         XBZRLE.cache = NULL;
1436         XBZRLE.encoded_buf = NULL;
1437         XBZRLE.current_buf = NULL;
1438     }
1439     XBZRLE_cache_unlock();
1440 }
1441
1442 static void reset_ram_globals(void)
1443 {
1444     last_seen_block = NULL;
1445     last_sent_block = NULL;
1446     last_offset = 0;
1447     last_version = ram_list.version;
1448     ram_bulk_stage = true;
1449 }
1450
1451 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1452
1453 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1454 {
1455     /* called in qemu main thread, so there is
1456      * no writing race against this migration_bitmap
1457      */
1458     if (migration_bitmap_rcu) {
1459         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1460         bitmap = g_new(struct BitmapRcu, 1);
1461         bitmap->bmap = bitmap_new(new);
1462
1463         /* prevent migration_bitmap content from being set bit
1464          * by migration_bitmap_sync_range() at the same time.
1465          * it is safe to migration if migration_bitmap is cleared bit
1466          * at the same time.
1467          */
1468         qemu_mutex_lock(&migration_bitmap_mutex);
1469         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1470         bitmap_set(bitmap->bmap, old, new - old);
1471
1472         /* We don't have a way to safely extend the sentmap
1473          * with RCU; so mark it as missing, entry to postcopy
1474          * will fail.
1475          */
1476         bitmap->unsentmap = NULL;
1477
1478         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1479         qemu_mutex_unlock(&migration_bitmap_mutex);
1480         migration_dirty_pages += new - old;
1481         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1482     }
1483 }
1484
1485 /*
1486  * 'expected' is the value you expect the bitmap mostly to be full
1487  * of; it won't bother printing lines that are all this value.
1488  * If 'todump' is null the migration bitmap is dumped.
1489  */
1490 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1491 {
1492     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1493
1494     int64_t cur;
1495     int64_t linelen = 128;
1496     char linebuf[129];
1497
1498     if (!todump) {
1499         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1500     }
1501
1502     for (cur = 0; cur < ram_pages; cur += linelen) {
1503         int64_t curb;
1504         bool found = false;
1505         /*
1506          * Last line; catch the case where the line length
1507          * is longer than remaining ram
1508          */
1509         if (cur + linelen > ram_pages) {
1510             linelen = ram_pages - cur;
1511         }
1512         for (curb = 0; curb < linelen; curb++) {
1513             bool thisbit = test_bit(cur + curb, todump);
1514             linebuf[curb] = thisbit ? '1' : '.';
1515             found = found || (thisbit != expected);
1516         }
1517         if (found) {
1518             linebuf[curb] = '\0';
1519             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1520         }
1521     }
1522 }
1523
1524 /* **** functions for postcopy ***** */
1525
1526 /*
1527  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1528  * Note: At this point the 'unsentmap' is the processed bitmap combined
1529  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1530  * start,length: Indexes into the bitmap for the first bit
1531  *            representing the named block and length in target-pages
1532  */
1533 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1534                                         PostcopyDiscardState *pds,
1535                                         unsigned long start,
1536                                         unsigned long length)
1537 {
1538     unsigned long end = start + length; /* one after the end */
1539     unsigned long current;
1540     unsigned long *unsentmap;
1541
1542     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1543     for (current = start; current < end; ) {
1544         unsigned long one = find_next_bit(unsentmap, end, current);
1545
1546         if (one <= end) {
1547             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1548             unsigned long discard_length;
1549
1550             if (zero >= end) {
1551                 discard_length = end - one;
1552             } else {
1553                 discard_length = zero - one;
1554             }
1555             if (discard_length) {
1556                 postcopy_discard_send_range(ms, pds, one, discard_length);
1557             }
1558             current = one + discard_length;
1559         } else {
1560             current = one;
1561         }
1562     }
1563
1564     return 0;
1565 }
1566
1567 /*
1568  * Utility for the outgoing postcopy code.
1569  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1570  *   passing it bitmap indexes and name.
1571  * Returns: 0 on success
1572  * (qemu_ram_foreach_block ends up passing unscaled lengths
1573  *  which would mean postcopy code would have to deal with target page)
1574  */
1575 static int postcopy_each_ram_send_discard(MigrationState *ms)
1576 {
1577     struct RAMBlock *block;
1578     int ret;
1579
1580     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1581         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1582         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1583                                                                first,
1584                                                                block->idstr);
1585
1586         /*
1587          * Postcopy sends chunks of bitmap over the wire, but it
1588          * just needs indexes at this point, avoids it having
1589          * target page specific code.
1590          */
1591         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1592                                     block->used_length >> TARGET_PAGE_BITS);
1593         postcopy_discard_send_finish(ms, pds);
1594         if (ret) {
1595             return ret;
1596         }
1597     }
1598
1599     return 0;
1600 }
1601
/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 *   the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 * unsent_pass=true: Cleans up partially unsent host pages by searching
 *                 the unsentmap
 * unsent_pass=false: Cleans up partially dirty host pages by searching
 *                 the main migration bitmap
 *
 * Every host page that needs fixing up is marked entirely unsent and
 * entirely dirty in the bitmaps (migration_dirty_pages is raised for each
 * page that was not dirty before), and the destination is told to discard
 * it unless, in the dirty pass, its first target page is still unsent.
 *
 * @ms: current migration state, passed on to postcopy_discard_send_range
 * @unsent_pass: selects which bitmap is searched (see above)
 * @block: RAMBlock whose range of the bitmaps is processed
 * @pds: discard state, passed on to postcopy_discard_send_range
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    /* Number of target pages that make up one host page */
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    /* Bitmap indexes of the first and last target page of the block */
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    while (run_start <= last) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}
1724
1725 /*
1726  * Utility for the outgoing postcopy code.
1727  *
1728  * Discard any partially sent host-page size chunks, mark any partially
1729  * dirty host-page size chunks as all dirty.
1730  *
1731  * Returns: 0 on success
1732  */
1733 static int postcopy_chunk_hostpages(MigrationState *ms)
1734 {
1735     struct RAMBlock *block;
1736
1737     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1738         /* Easy case - TPS==HPS - nothing to be done */
1739         return 0;
1740     }
1741
1742     /* Easiest way to make sure we don't resume in the middle of a host-page */
1743     last_seen_block = NULL;
1744     last_sent_block = NULL;
1745     last_offset     = 0;
1746
1747     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1748         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1749
1750         PostcopyDiscardState *pds =
1751                          postcopy_discard_send_init(ms, first, block->idstr);
1752
1753         /* First pass: Discard all partially sent host pages */
1754         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1755         /*
1756          * Second pass: Ensure that all partially dirty host pages are made
1757          * fully dirty.
1758          */
1759         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1760
1761         postcopy_discard_send_finish(ms, pds);
1762     } /* ram_list loop */
1763
1764     return 0;
1765 }
1766
1767 /*
1768  * Transmit the set of pages to be discarded after precopy to the target
1769  * these are pages that:
1770  *     a) Have been previously transmitted but are now dirty again
1771  *     b) Pages that have never been transmitted, this ensures that
1772  *        any pages on the destination that have been mapped by background
1773  *        tasks get discarded (transparent huge pages is the specific concern)
1774  * Hopefully this is pretty sparse
1775  */
1776 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1777 {
1778     int ret;
1779     unsigned long *bitmap, *unsentmap;
1780
1781     rcu_read_lock();
1782
1783     /* This should be our last sync, the src is now paused */
1784     migration_bitmap_sync();
1785
1786     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1787     if (!unsentmap) {
1788         /* We don't have a safe way to resize the sentmap, so
1789          * if the bitmap was resized it will be NULL at this
1790          * point.
1791          */
1792         error_report("migration ram resized during precopy phase");
1793         rcu_read_unlock();
1794         return -EINVAL;
1795     }
1796
1797     /* Deal with TPS != HPS */
1798     ret = postcopy_chunk_hostpages(ms);
1799     if (ret) {
1800         rcu_read_unlock();
1801         return ret;
1802     }
1803
1804     /*
1805      * Update the unsentmap to be unsentmap = unsentmap | dirty
1806      */
1807     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1808     bitmap_or(unsentmap, unsentmap, bitmap,
1809                last_ram_offset() >> TARGET_PAGE_BITS);
1810
1811
1812     trace_ram_postcopy_send_discard_bitmap();
1813 #ifdef DEBUG_POSTCOPY
1814     ram_debug_dump_bitmap(unsentmap, true);
1815 #endif
1816
1817     ret = postcopy_each_ram_send_discard(ms);
1818     rcu_read_unlock();
1819
1820     return ret;
1821 }
1822
1823 /*
1824  * At the start of the postcopy phase of migration, any now-dirty
1825  * precopied pages are discarded.
1826  *
1827  * start, length describe a byte address range within the RAMBlock
1828  *
1829  * Returns 0 on success.
1830  */
1831 int ram_discard_range(MigrationIncomingState *mis,
1832                       const char *block_name,
1833                       uint64_t start, size_t length)
1834 {
1835     int ret = -1;
1836
1837     rcu_read_lock();
1838     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1839
1840     if (!rb) {
1841         error_report("ram_discard_range: Failed to find block '%s'",
1842                      block_name);
1843         goto err;
1844     }
1845
1846     uint8_t *host_startaddr = rb->host + start;
1847
1848     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1849         error_report("ram_discard_range: Unaligned start address: %p",
1850                      host_startaddr);
1851         goto err;
1852     }
1853
1854     if ((start + length) <= rb->used_length) {
1855         uint8_t *host_endaddr = host_startaddr + length;
1856         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1857             error_report("ram_discard_range: Unaligned end address: %p",
1858                          host_endaddr);
1859             goto err;
1860         }
1861         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1862     } else {
1863         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1864                      "/%zx/" RAM_ADDR_FMT")",
1865                      block_name, start, length, rb->used_length);
1866     }
1867
1868 err:
1869     rcu_read_unlock();
1870
1871     return ret;
1872 }
1873
1874
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
1880
1881 static int ram_save_setup(QEMUFile *f, void *opaque)
1882 {
1883     RAMBlock *block;
1884     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1885
1886     dirty_rate_high_cnt = 0;
1887     bitmap_sync_count = 0;
1888     migration_bitmap_sync_init();
1889     qemu_mutex_init(&migration_bitmap_mutex);
1890
1891     if (migrate_use_xbzrle()) {
1892         XBZRLE_cache_lock();
1893         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1894         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1895                                   TARGET_PAGE_SIZE,
1896                                   TARGET_PAGE_SIZE);
1897         if (!XBZRLE.cache) {
1898             XBZRLE_cache_unlock();
1899             error_report("Error creating cache");
1900             return -1;
1901         }
1902         XBZRLE_cache_unlock();
1903
1904         /* We prefer not to abort if there is no memory */
1905         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1906         if (!XBZRLE.encoded_buf) {
1907             error_report("Error allocating encoded_buf");
1908             return -1;
1909         }
1910
1911         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1912         if (!XBZRLE.current_buf) {
1913             error_report("Error allocating current_buf");
1914             g_free(XBZRLE.encoded_buf);
1915             XBZRLE.encoded_buf = NULL;
1916             return -1;
1917         }
1918
1919         acct_clear();
1920     }
1921
1922     /* For memory_global_dirty_log_start below.  */
1923     qemu_mutex_lock_iothread();
1924
1925     qemu_mutex_lock_ramlist();
1926     rcu_read_lock();
1927     bytes_transferred = 0;
1928     reset_ram_globals();
1929
1930     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1931     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1932     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1933     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1934
1935     if (migrate_postcopy_ram()) {
1936         migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1937         bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1938     }
1939
1940     /*
1941      * Count the total number of pages used by ram blocks not including any
1942      * gaps due to alignment or unplugs.
1943      */
1944     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1945
1946     memory_global_dirty_log_start();
1947     migration_bitmap_sync();
1948     qemu_mutex_unlock_ramlist();
1949     qemu_mutex_unlock_iothread();
1950
1951     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1952
1953     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1954         qemu_put_byte(f, strlen(block->idstr));
1955         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1956         qemu_put_be64(f, block->used_length);
1957     }
1958
1959     rcu_read_unlock();
1960
1961     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1962     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1963
1964     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1965
1966     return 0;
1967 }
1968
1969 static int ram_save_iterate(QEMUFile *f, void *opaque)
1970 {
1971     int ret;
1972     int i;
1973     int64_t t0;
1974     int pages_sent = 0;
1975
1976     rcu_read_lock();
1977     if (ram_list.version != last_version) {
1978         reset_ram_globals();
1979     }
1980
1981     /* Read version before ram_list.blocks */
1982     smp_rmb();
1983
1984     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1985
1986     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1987     i = 0;
1988     while ((ret = qemu_file_rate_limit(f)) == 0) {
1989         int pages;
1990
1991         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1992         /* no more pages to sent */
1993         if (pages == 0) {
1994             break;
1995         }
1996         pages_sent += pages;
1997         acct_info.iterations++;
1998
1999         /* we want to check in the 1st loop, just in case it was the 1st time
2000            and we had to sync the dirty bitmap.
2001            qemu_get_clock_ns() is a bit expensive, so we only check each some
2002            iterations
2003         */
2004         if ((i & 63) == 0) {
2005             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2006             if (t1 > MAX_WAIT) {
2007                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2008                         t1, i);
2009                 break;
2010             }
2011         }
2012         i++;
2013     }
2014     flush_compressed_data(f);
2015     rcu_read_unlock();
2016
2017     /*
2018      * Must occur before EOS (or any QEMUFile operation)
2019      * because of RDMA protocol.
2020      */
2021     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2022
2023     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2024     bytes_transferred += 8;
2025
2026     ret = qemu_file_get_error(f);
2027     if (ret < 0) {
2028         return ret;
2029     }
2030
2031     return pages_sent;
2032 }
2033
2034 /* Called with iothread lock */
2035 static int ram_save_complete(QEMUFile *f, void *opaque)
2036 {
2037     rcu_read_lock();
2038
2039     if (!migration_in_postcopy(migrate_get_current())) {
2040         migration_bitmap_sync();
2041     }
2042
2043     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2044
2045     /* try transferring iterative blocks of memory */
2046
2047     /* flush all remaining blocks regardless of rate limiting */
2048     while (true) {
2049         int pages;
2050
2051         pages = ram_find_and_save_block(f, true, &bytes_transferred);
2052         /* no more blocks to sent */
2053         if (pages == 0) {
2054             break;
2055         }
2056     }
2057
2058     flush_compressed_data(f);
2059     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2060
2061     rcu_read_unlock();
2062
2063     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2064
2065     return 0;
2066 }
2067
2068 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2069                              uint64_t *non_postcopiable_pending,
2070                              uint64_t *postcopiable_pending)
2071 {
2072     uint64_t remaining_size;
2073
2074     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2075
2076     if (!migration_in_postcopy(migrate_get_current()) &&
2077         remaining_size < max_size) {
2078         qemu_mutex_lock_iothread();
2079         rcu_read_lock();
2080         migration_bitmap_sync();
2081         rcu_read_unlock();
2082         qemu_mutex_unlock_iothread();
2083         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2084     }
2085
2086     /* We can do postcopy, and all the data is postcopiable */
2087     *postcopiable_pending += remaining_size;
2088 }
2089
2090 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2091 {
2092     unsigned int xh_len;
2093     int xh_flags;
2094     uint8_t *loaded_data;
2095
2096     if (!xbzrle_decoded_buf) {
2097         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2098     }
2099     loaded_data = xbzrle_decoded_buf;
2100
2101     /* extract RLE header */
2102     xh_flags = qemu_get_byte(f);
2103     xh_len = qemu_get_be16(f);
2104
2105     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2106         error_report("Failed to load XBZRLE page - wrong compression!");
2107         return -1;
2108     }
2109
2110     if (xh_len > TARGET_PAGE_SIZE) {
2111         error_report("Failed to load XBZRLE page - len overflow!");
2112         return -1;
2113     }
2114     /* load data and decode */
2115     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2116
2117     /* decode RLE */
2118     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2119                              TARGET_PAGE_SIZE) == -1) {
2120         error_report("Failed to load XBZRLE page - decode error!");
2121         return -1;
2122     }
2123
2124     return 0;
2125 }
2126
2127 /* Must be called from within a rcu critical section.
2128  * Returns a pointer from within the RCU-protected ram_list.
2129  */
2130 /*
2131  * Read a RAMBlock ID from the stream f.
2132  *
2133  * f: Stream to read from
2134  * flags: Page flags (mostly to see if it's a continuation of previous block)
2135  */
2136 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2137                                               int flags)
2138 {
2139     static RAMBlock *block = NULL;
2140     char id[256];
2141     uint8_t len;
2142
2143     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2144         if (!block) {
2145             error_report("Ack, bad migration stream!");
2146             return NULL;
2147         }
2148         return block;
2149     }
2150
2151     len = qemu_get_byte(f);
2152     qemu_get_buffer(f, (uint8_t *)id, len);
2153     id[len] = 0;
2154
2155     block = qemu_ram_block_by_name(id);
2156     if (!block) {
2157         error_report("Can't find block %s", id);
2158         return NULL;
2159     }
2160
2161     return block;
2162 }
2163
2164 static inline void *host_from_ram_block_offset(RAMBlock *block,
2165                                                ram_addr_t offset)
2166 {
2167     if (!offset_in_ramblock(block, offset)) {
2168         return NULL;
2169     }
2170
2171     return block->host + offset;
2172 }
2173
/*
 * Fill a page (or a whole RDMA chunk) with a single byte value.
 *
 * host: start of the region to fill
 * ch:   fill byte
 * size: region length in bytes
 *
 * A zero fill is skipped when the region already reads as all zeroes,
 * so the memory is not written to needlessly.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2184
2185 static void *do_data_decompress(void *opaque)
2186 {
2187     DecompressParam *param = opaque;
2188     unsigned long pagesize;
2189     uint8_t *des;
2190     int len;
2191
2192     qemu_mutex_lock(&param->mutex);
2193     while (!param->quit) {
2194         if (param->des) {
2195             des = param->des;
2196             len = param->len;
2197             param->des = 0;
2198             qemu_mutex_unlock(&param->mutex);
2199
2200             pagesize = TARGET_PAGE_SIZE;
2201             /* uncompress() will return failed in some case, especially
2202              * when the page is dirted when doing the compression, it's
2203              * not a problem because the dirty page will be retransferred
2204              * and uncompress() won't break the data in other pages.
2205              */
2206             uncompress((Bytef *)des, &pagesize,
2207                        (const Bytef *)param->compbuf, len);
2208
2209             qemu_mutex_lock(&decomp_done_lock);
2210             param->done = true;
2211             qemu_cond_signal(&decomp_done_cond);
2212             qemu_mutex_unlock(&decomp_done_lock);
2213
2214             qemu_mutex_lock(&param->mutex);
2215         } else {
2216             qemu_cond_wait(&param->cond, &param->mutex);
2217         }
2218     }
2219     qemu_mutex_unlock(&param->mutex);
2220
2221     return NULL;
2222 }
2223
2224 static void wait_for_decompress_done(void)
2225 {
2226     int idx, thread_count;
2227
2228     if (!migrate_use_compression()) {
2229         return;
2230     }
2231
2232     thread_count = migrate_decompress_threads();
2233     qemu_mutex_lock(&decomp_done_lock);
2234     for (idx = 0; idx < thread_count; idx++) {
2235         while (!decomp_param[idx].done) {
2236             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2237         }
2238     }
2239     qemu_mutex_unlock(&decomp_done_lock);
2240 }
2241
2242 void migrate_decompress_threads_create(void)
2243 {
2244     int i, thread_count;
2245
2246     thread_count = migrate_decompress_threads();
2247     decompress_threads = g_new0(QemuThread, thread_count);
2248     decomp_param = g_new0(DecompressParam, thread_count);
2249     qemu_mutex_init(&decomp_done_lock);
2250     qemu_cond_init(&decomp_done_cond);
2251     for (i = 0; i < thread_count; i++) {
2252         qemu_mutex_init(&decomp_param[i].mutex);
2253         qemu_cond_init(&decomp_param[i].cond);
2254         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2255         decomp_param[i].done = true;
2256         decomp_param[i].quit = false;
2257         qemu_thread_create(decompress_threads + i, "decompress",
2258                            do_data_decompress, decomp_param + i,
2259                            QEMU_THREAD_JOINABLE);
2260     }
2261 }
2262
2263 void migrate_decompress_threads_join(void)
2264 {
2265     int i, thread_count;
2266
2267     thread_count = migrate_decompress_threads();
2268     for (i = 0; i < thread_count; i++) {
2269         qemu_mutex_lock(&decomp_param[i].mutex);
2270         decomp_param[i].quit = true;
2271         qemu_cond_signal(&decomp_param[i].cond);
2272         qemu_mutex_unlock(&decomp_param[i].mutex);
2273     }
2274     for (i = 0; i < thread_count; i++) {
2275         qemu_thread_join(decompress_threads + i);
2276         qemu_mutex_destroy(&decomp_param[i].mutex);
2277         qemu_cond_destroy(&decomp_param[i].cond);
2278         g_free(decomp_param[i].compbuf);
2279     }
2280     g_free(decompress_threads);
2281     g_free(decomp_param);
2282     decompress_threads = NULL;
2283     decomp_param = NULL;
2284 }
2285
2286 static void decompress_data_with_multi_threads(QEMUFile *f,
2287                                                void *host, int len)
2288 {
2289     int idx, thread_count;
2290
2291     thread_count = migrate_decompress_threads();
2292     qemu_mutex_lock(&decomp_done_lock);
2293     while (true) {
2294         for (idx = 0; idx < thread_count; idx++) {
2295             if (decomp_param[idx].done) {
2296                 decomp_param[idx].done = false;
2297                 qemu_mutex_lock(&decomp_param[idx].mutex);
2298                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2299                 decomp_param[idx].des = host;
2300                 decomp_param[idx].len = len;
2301                 qemu_cond_signal(&decomp_param[idx].cond);
2302                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2303                 break;
2304             }
2305         }
2306         if (idx < thread_count) {
2307             break;
2308         } else {
2309             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2310         }
2311     }
2312     qemu_mutex_unlock(&decomp_done_lock);
2313 }
2314
2315 /*
2316  * Allocate data structures etc needed by incoming migration with postcopy-ram
2317  * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
2318  */
2319 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2320 {
2321     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2322
2323     return postcopy_ram_incoming_init(mis, ram_pages);
2324 }
2325
2326 /*
2327  * Called in postcopy mode by ram_load().
2328  * rcu_read_lock is taken prior to this being called.
2329  */
2330 static int ram_load_postcopy(QEMUFile *f)
2331 {
2332     int flags = 0, ret = 0;
2333     bool place_needed = false;
2334     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2335     MigrationIncomingState *mis = migration_incoming_get_current();
2336     /* Temporary page that is later 'placed' */
2337     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2338     void *last_host = NULL;
2339     bool all_zero = false;
2340
2341     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2342         ram_addr_t addr;
2343         void *host = NULL;
2344         void *page_buffer = NULL;
2345         void *place_source = NULL;
2346         uint8_t ch;
2347
2348         addr = qemu_get_be64(f);
2349         flags = addr & ~TARGET_PAGE_MASK;
2350         addr &= TARGET_PAGE_MASK;
2351
2352         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2353         place_needed = false;
2354         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2355             RAMBlock *block = ram_block_from_stream(f, flags);
2356
2357             host = host_from_ram_block_offset(block, addr);
2358             if (!host) {
2359                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2360                 ret = -EINVAL;
2361                 break;
2362             }
2363             /*
2364              * Postcopy requires that we place whole host pages atomically.
2365              * To make it atomic, the data is read into a temporary page
2366              * that's moved into place later.
2367              * The migration protocol uses,  possibly smaller, target-pages
2368              * however the source ensures it always sends all the components
2369              * of a host page in order.
2370              */
2371             page_buffer = postcopy_host_page +
2372                           ((uintptr_t)host & ~qemu_host_page_mask);
2373             /* If all TP are zero then we can optimise the place */
2374             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2375                 all_zero = true;
2376             } else {
2377                 /* not the 1st TP within the HP */
2378                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2379                     error_report("Non-sequential target page %p/%p",
2380                                   host, last_host);
2381                     ret = -EINVAL;
2382                     break;
2383                 }
2384             }
2385
2386
2387             /*
2388              * If it's the last part of a host page then we place the host
2389              * page
2390              */
2391             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2392                                      ~qemu_host_page_mask) == 0;
2393             place_source = postcopy_host_page;
2394         }
2395         last_host = host;
2396
2397         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2398         case RAM_SAVE_FLAG_COMPRESS:
2399             ch = qemu_get_byte(f);
2400             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2401             if (ch) {
2402                 all_zero = false;
2403             }
2404             break;
2405
2406         case RAM_SAVE_FLAG_PAGE:
2407             all_zero = false;
2408             if (!place_needed || !matching_page_sizes) {
2409                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2410             } else {
2411                 /* Avoids the qemu_file copy during postcopy, which is
2412                  * going to do a copy later; can only do it when we
2413                  * do this read in one go (matching page sizes)
2414                  */
2415                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2416                                          TARGET_PAGE_SIZE);
2417             }
2418             break;
2419         case RAM_SAVE_FLAG_EOS:
2420             /* normal exit */
2421             break;
2422         default:
2423             error_report("Unknown combination of migration flags: %#x"
2424                          " (postcopy mode)", flags);
2425             ret = -EINVAL;
2426         }
2427
2428         if (place_needed) {
2429             /* This gets called at the last target page in the host page */
2430             if (all_zero) {
2431                 ret = postcopy_place_page_zero(mis,
2432                                                host + TARGET_PAGE_SIZE -
2433                                                qemu_host_page_size);
2434             } else {
2435                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2436                                                qemu_host_page_size,
2437                                                place_source);
2438             }
2439         }
2440         if (!ret) {
2441             ret = qemu_file_get_error(f);
2442         }
2443     }
2444
2445     return ret;
2446 }
2447
2448 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2449 {
2450     int flags = 0, ret = 0;
2451     static uint64_t seq_iter;
2452     int len = 0;
2453     /*
2454      * If system is running in postcopy mode, page inserts to host memory must
2455      * be atomic
2456      */
2457     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2458
2459     seq_iter++;
2460
2461     if (version_id != 4) {
2462         ret = -EINVAL;
2463     }
2464
2465     /* This RCU critical section can be very long running.
2466      * When RCU reclaims in the code start to become numerous,
2467      * it will be necessary to reduce the granularity of this
2468      * critical section.
2469      */
2470     rcu_read_lock();
2471
2472     if (postcopy_running) {
2473         ret = ram_load_postcopy(f);
2474     }
2475
2476     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2477         ram_addr_t addr, total_ram_bytes;
2478         void *host = NULL;
2479         uint8_t ch;
2480
2481         addr = qemu_get_be64(f);
2482         flags = addr & ~TARGET_PAGE_MASK;
2483         addr &= TARGET_PAGE_MASK;
2484
2485         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2486                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2487             RAMBlock *block = ram_block_from_stream(f, flags);
2488
2489             host = host_from_ram_block_offset(block, addr);
2490             if (!host) {
2491                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2492                 ret = -EINVAL;
2493                 break;
2494             }
2495         }
2496
2497         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2498         case RAM_SAVE_FLAG_MEM_SIZE:
2499             /* Synchronize RAM block list */
2500             total_ram_bytes = addr;
2501             while (!ret && total_ram_bytes) {
2502                 RAMBlock *block;
2503                 char id[256];
2504                 ram_addr_t length;
2505
2506                 len = qemu_get_byte(f);
2507                 qemu_get_buffer(f, (uint8_t *)id, len);
2508                 id[len] = 0;
2509                 length = qemu_get_be64(f);
2510
2511                 block = qemu_ram_block_by_name(id);
2512                 if (block) {
2513                     if (length != block->used_length) {
2514                         Error *local_err = NULL;
2515
2516                         ret = qemu_ram_resize(block, length,
2517                                               &local_err);
2518                         if (local_err) {
2519                             error_report_err(local_err);
2520                         }
2521                     }
2522                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2523                                           block->idstr);
2524                 } else {
2525                     error_report("Unknown ramblock \"%s\", cannot "
2526                                  "accept migration", id);
2527                     ret = -EINVAL;
2528                 }
2529
2530                 total_ram_bytes -= length;
2531             }
2532             break;
2533
2534         case RAM_SAVE_FLAG_COMPRESS:
2535             ch = qemu_get_byte(f);
2536             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2537             break;
2538
2539         case RAM_SAVE_FLAG_PAGE:
2540             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2541             break;
2542
2543         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2544             len = qemu_get_be32(f);
2545             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2546                 error_report("Invalid compressed data length: %d", len);
2547                 ret = -EINVAL;
2548                 break;
2549             }
2550             decompress_data_with_multi_threads(f, host, len);
2551             break;
2552
2553         case RAM_SAVE_FLAG_XBZRLE:
2554             if (load_xbzrle(f, addr, host) < 0) {
2555                 error_report("Failed to decompress XBZRLE page at "
2556                              RAM_ADDR_FMT, addr);
2557                 ret = -EINVAL;
2558                 break;
2559             }
2560             break;
2561         case RAM_SAVE_FLAG_EOS:
2562             /* normal exit */
2563             break;
2564         default:
2565             if (flags & RAM_SAVE_FLAG_HOOK) {
2566                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2567             } else {
2568                 error_report("Unknown combination of migration flags: %#x",
2569                              flags);
2570                 ret = -EINVAL;
2571             }
2572         }
2573         if (!ret) {
2574             ret = qemu_file_get_error(f);
2575         }
2576     }
2577
2578     wait_for_decompress_done();
2579     rcu_read_unlock();
2580     DPRINTF("Completed load of VM with exit code %d seq iteration "
2581             "%" PRIu64 "\n", ret, seq_iter);
2582     return ret;
2583 }
2584
2585 static SaveVMHandlers savevm_ram_handlers = {
2586     .save_live_setup = ram_save_setup,
2587     .save_live_iterate = ram_save_iterate,
2588     .save_live_complete_postcopy = ram_save_complete,
2589     .save_live_complete_precopy = ram_save_complete,
2590     .save_live_pending = ram_save_pending,
2591     .load_state = ram_load,
2592     .cleanup = ram_migration_cleanup,
2593 };
2594
2595 void ram_mig_init(void)
2596 {
2597     qemu_mutex_init(&XBZRLE.lock);
2598     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2599 }