migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59 #include "sysemu/runstate.h"
  60
  61 #if defined(__linux__)
  62 #include "qemu/userfaultfd.h"
  63 #endif /* defined(__linux__) */
  64
  65 /***********************************************************/
  66 /* ram save/restore */
  67
  68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  69  * worked for pages that where filled with the same char.  We switched
  70  * it to only search for the zero value.  And to avoid confusion with
  71  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  72  */
  73
  74 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  75 #define RAM_SAVE_FLAG_ZERO     0x02
  76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  77 #define RAM_SAVE_FLAG_PAGE     0x08
  78 #define RAM_SAVE_FLAG_EOS      0x10
  79 #define RAM_SAVE_FLAG_CONTINUE 0x20
  80 #define RAM_SAVE_FLAG_XBZRLE   0x40
  81 /* 0x80 is reserved in migration.h start with 0x100 next */
  82 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  83
  84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  85 {
  86     return buffer_is_zero(p, size);
  87 }
  88
  89 XBZRLECacheStats xbzrle_counters;
  90
  91 /* struct contains XBZRLE cache and a static page
  92    used by the compression */
  93 static struct {
  94     /* buffer used for XBZRLE encoding */
  95     uint8_t *encoded_buf;
  96     /* buffer for storing page content */
  97     uint8_t *current_buf;
  98     /* Cache for XBZRLE, Protected by lock. */
  99     PageCache *cache;
 100     QemuMutex lock;
 101     /* it will store a page full of zeros */
 102     uint8_t *zero_target_page;
 103     /* buffer used for XBZRLE decoding */
 104     uint8_t *decoded_buf;
 105 } XBZRLE;
 106
 107 static void XBZRLE_cache_lock(void)
 108 {
 109     if (migrate_use_xbzrle()) {
 110         qemu_mutex_lock(&XBZRLE.lock);
 111     }
 112 }
 113
 114 static void XBZRLE_cache_unlock(void)
 115 {
 116     if (migrate_use_xbzrle()) {
 117         qemu_mutex_unlock(&XBZRLE.lock);
 118     }
 119 }
 120
 121 /**
 122  * xbzrle_cache_resize: resize the xbzrle cache
 123  *
 124  * This function is called from migrate_params_apply in main
 125  * thread, possibly while a migration is in progress.  A running
 126  * migration may be using the cache and might finish during this call,
 127  * hence changes to the cache are protected by XBZRLE.lock().
 128  *
 129  * Returns 0 for success or -1 for error
 130  *
 131  * @new_size: new cache size
 132  * @errp: set *errp if the check failed, with reason
 133  */
 134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 135 {
 136     PageCache *new_cache;
 137     int64_t ret = 0;
 138
 139     /* Check for truncation */
 140     if (new_size != (size_t)new_size) {
 141         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 142                    "exceeding address space");
 143         return -1;
 144     }
 145
 146     if (new_size == migrate_xbzrle_cache_size()) {
 147         /* nothing to do */
 148         return 0;
 149     }
 150
 151     XBZRLE_cache_lock();
 152
 153     if (XBZRLE.cache != NULL) {
 154         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 155         if (!new_cache) {
 156             ret = -1;
 157             goto out;
 158         }
 159
 160         cache_fini(XBZRLE.cache);
 161         XBZRLE.cache = new_cache;
 162     }
 163 out:
 164     XBZRLE_cache_unlock();
 165     return ret;
 166 }
 167
 168 bool ramblock_is_ignored(RAMBlock *block)
 169 {
 170     return !qemu_ram_is_migratable(block) ||
 171            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 172 }
 173
 174 #undef RAMBLOCK_FOREACH
 175
 176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 177 {
 178     RAMBlock *block;
 179     int ret = 0;
 180
 181     RCU_READ_LOCK_GUARD();
 182
 183     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 184         ret = func(block, opaque);
 185         if (ret) {
 186             break;
 187         }
 188     }
 189     return ret;
 190 }
 191
 192 static void ramblock_recv_map_init(void)
 193 {
 194     RAMBlock *rb;
 195
 196     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 197         assert(!rb->receivedmap);
 198         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 199     }
 200 }
 201
 202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 203 {
 204     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 205                     rb->receivedmap);
 206 }
 207
 208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 209 {
 210     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 211 }
 212
 213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 214 {
 215     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 216 }
 217
 218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 219                                     size_t nr)
 220 {
 221     bitmap_set_atomic(rb->receivedmap,
 222                       ramblock_recv_bitmap_offset(host_addr, rb),
 223                       nr);
 224 }
 225
 226 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 227
 228 /*
 229  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 230  *
 231  * Returns >0 if success with sent bytes, or <0 if error.
 232  */
 233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 234                                   const char *block_name)
 235 {
 236     RAMBlock *block = qemu_ram_block_by_name(block_name);
 237     unsigned long *le_bitmap, nbits;
 238     uint64_t size;
 239
 240     if (!block) {
 241         error_report("%s: invalid block name: %s", __func__, block_name);
 242         return -1;
 243     }
 244
 245     nbits = block->used_length >> TARGET_PAGE_BITS;
 246
 247     /*
 248      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 249      * machines we may need 4 more bytes for padding (see below
 250      * comment). So extend it a bit before hand.
 251      */
 252     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 253
 254     /*
 255      * Always use little endian when sending the bitmap. This is
 256      * required that when source and destination VMs are not using the
 257      * same endianness. (Note: big endian won't work.)
 258      */
 259     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 260
 261     /* Size of the bitmap, in bytes */
 262     size = DIV_ROUND_UP(nbits, 8);
 263
 264     /*
 265      * size is always aligned to 8 bytes for 64bit machines, but it
 266      * may not be true for 32bit machines. We need this padding to
 267      * make sure the migration can survive even between 32bit and
 268      * 64bit machines.
 269      */
 270     size = ROUND_UP(size, 8);
 271
 272     qemu_put_be64(file, size);
 273     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 274     /*
 275      * Mark as an end, in case the middle part is screwed up due to
 276      * some "mysterious" reason.
 277      */
 278     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 279     qemu_fflush(file);
 280
 281     g_free(le_bitmap);
 282
 283     if (qemu_file_get_error(file)) {
 284         return qemu_file_get_error(file);
 285     }
 286
 287     return size + sizeof(size);
 288 }
 289
 290 /*
 291  * An outstanding page request, on the source, having been received
 292  * and queued
 293  */
 294 struct RAMSrcPageRequest {
 295     RAMBlock *rb;
 296     hwaddr    offset;
 297     hwaddr    len;
 298
 299     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 300 };
 301
 302 /* State of RAM for migration */
 303 struct RAMState {
 304     /* QEMUFile used for this migration */
 305     QEMUFile *f;
 306     /* UFFD file descriptor, used in 'write-tracking' migration */
 307     int uffdio_fd;
 308     /* Last block that we have visited searching for dirty pages */
 309     RAMBlock *last_seen_block;
 310     /* Last block from where we have sent data */
 311     RAMBlock *last_sent_block;
 312     /* Last dirty target page we have sent */
 313     ram_addr_t last_page;
 314     /* last ram version we have seen */
 315     uint32_t last_version;
 316     /* We are in the first round */
 317     bool ram_bulk_stage;
 318     /* The free page optimization is enabled */
 319     bool fpo_enabled;
 320     /* How many times we have dirty too many pages */
 321     int dirty_rate_high_cnt;
 322     /* these variables are used for bitmap sync */
 323     /* last time we did a full bitmap_sync */
 324     int64_t time_last_bitmap_sync;
 325     /* bytes transferred at start_time */
 326     uint64_t bytes_xfer_prev;
 327     /* number of dirty pages since start_time */
 328     uint64_t num_dirty_pages_period;
 329     /* xbzrle misses since the beginning of the period */
 330     uint64_t xbzrle_cache_miss_prev;
 331     /* Amount of xbzrle pages since the beginning of the period */
 332     uint64_t xbzrle_pages_prev;
 333     /* Amount of xbzrle encoded bytes since the beginning of the period */
 334     uint64_t xbzrle_bytes_prev;
 335
 336     /* compression statistics since the beginning of the period */
 337     /* amount of count that no free thread to compress data */
 338     uint64_t compress_thread_busy_prev;
 339     /* amount bytes after compression */
 340     uint64_t compressed_size_prev;
 341     /* amount of compressed pages */
 342     uint64_t compress_pages_prev;
 343
 344     /* total handled target pages at the beginning of period */
 345     uint64_t target_page_count_prev;
 346     /* total handled target pages since start */
 347     uint64_t target_page_count;
 348     /* number of dirty bits in the bitmap */
 349     uint64_t migration_dirty_pages;
 350     /* Protects modification of the bitmap and migration dirty pages */
 351     QemuMutex bitmap_mutex;
 352     /* The RAMBlock used in the last src_page_requests */
 353     RAMBlock *last_req_rb;
 354     /* Queue of outstanding page requests from the destination */
 355     QemuMutex src_page_req_mutex;
 356     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 357 };
 358 typedef struct RAMState RAMState;
 359
 360 static RAMState *ram_state;
 361
 362 static NotifierWithReturnList precopy_notifier_list;
 363
 364 void precopy_infrastructure_init(void)
 365 {
 366     notifier_with_return_list_init(&precopy_notifier_list);
 367 }
 368
 369 void precopy_add_notifier(NotifierWithReturn *n)
 370 {
 371     notifier_with_return_list_add(&precopy_notifier_list, n);
 372 }
 373
 374 void precopy_remove_notifier(NotifierWithReturn *n)
 375 {
 376     notifier_with_return_remove(n);
 377 }
 378
 379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 380 {
 381     PrecopyNotifyData pnd;
 382     pnd.reason = reason;
 383     pnd.errp = errp;
 384
 385     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 386 }
 387
 388 void precopy_enable_free_page_optimization(void)
 389 {
 390     if (!ram_state) {
 391         return;
 392     }
 393
 394     ram_state->fpo_enabled = true;
 395 }
 396
 397 uint64_t ram_bytes_remaining(void)
 398 {
 399     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 400                        0;
 401 }
 402
 403 MigrationStats ram_counters;
 404
 405 /* used by the search for pages to send */
 406 struct PageSearchStatus {
 407     /* Current block being searched */
 408     RAMBlock    *block;
 409     /* Current page to search from */
 410     unsigned long page;
 411     /* Set once we wrap around */
 412     bool         complete_round;
 413 };
 414 typedef struct PageSearchStatus PageSearchStatus;
 415
 416 CompressionStats compression_counters;
 417
 418 struct CompressParam {
 419     bool done;
 420     bool quit;
 421     bool zero_page;
 422     QEMUFile *file;
 423     QemuMutex mutex;
 424     QemuCond cond;
 425     RAMBlock *block;
 426     ram_addr_t offset;
 427
 428     /* internally used fields */
 429     z_stream stream;
 430     uint8_t *originbuf;
 431 };
 432 typedef struct CompressParam CompressParam;
 433
 434 struct DecompressParam {
 435     bool done;
 436     bool quit;
 437     QemuMutex mutex;
 438     QemuCond cond;
 439     void *des;
 440     uint8_t *compbuf;
 441     int len;
 442     z_stream stream;
 443 };
 444 typedef struct DecompressParam DecompressParam;
 445
 446 static CompressParam *comp_param;
 447 static QemuThread *compress_threads;
 448 /* comp_done_cond is used to wake up the migration thread when
 449  * one of the compression threads has finished the compression.
 450  * comp_done_lock is used to co-work with comp_done_cond.
 451  */
 452 static QemuMutex comp_done_lock;
 453 static QemuCond comp_done_cond;
 454 /* The empty QEMUFileOps will be used by file in CompressParam */
 455 static const QEMUFileOps empty_ops = { };
 456
 457 static QEMUFile *decomp_file;
 458 static DecompressParam *decomp_param;
 459 static QemuThread *decompress_threads;
 460 static QemuMutex decomp_done_lock;
 461 static QemuCond decomp_done_cond;
 462
 463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 464                                  ram_addr_t offset, uint8_t *source_buf);
 465
 466 static void *do_data_compress(void *opaque)
 467 {
 468     CompressParam *param = opaque;
 469     RAMBlock *block;
 470     ram_addr_t offset;
 471     bool zero_page;
 472
 473     qemu_mutex_lock(&param->mutex);
 474     while (!param->quit) {
 475         if (param->block) {
 476             block = param->block;
 477             offset = param->offset;
 478             param->block = NULL;
 479             qemu_mutex_unlock(&param->mutex);
 480
 481             zero_page = do_compress_ram_page(param->file, &param->stream,
 482                                              block, offset, param->originbuf);
 483
 484             qemu_mutex_lock(&comp_done_lock);
 485             param->done = true;
 486             param->zero_page = zero_page;
 487             qemu_cond_signal(&comp_done_cond);
 488             qemu_mutex_unlock(&comp_done_lock);
 489
 490             qemu_mutex_lock(&param->mutex);
 491         } else {
 492             qemu_cond_wait(&param->cond, &param->mutex);
 493         }
 494     }
 495     qemu_mutex_unlock(&param->mutex);
 496
 497     return NULL;
 498 }
 499
 500 static void compress_threads_save_cleanup(void)
 501 {
 502     int i, thread_count;
 503
 504     if (!migrate_use_compression() || !comp_param) {
 505         return;
 506     }
 507
 508     thread_count = migrate_compress_threads();
 509     for (i = 0; i < thread_count; i++) {
 510         /*
 511          * we use it as a indicator which shows if the thread is
 512          * properly init'd or not
 513          */
 514         if (!comp_param[i].file) {
 515             break;
 516         }
 517
 518         qemu_mutex_lock(&comp_param[i].mutex);
 519         comp_param[i].quit = true;
 520         qemu_cond_signal(&comp_param[i].cond);
 521         qemu_mutex_unlock(&comp_param[i].mutex);
 522
 523         qemu_thread_join(compress_threads + i);
 524         qemu_mutex_destroy(&comp_param[i].mutex);
 525         qemu_cond_destroy(&comp_param[i].cond);
 526         deflateEnd(&comp_param[i].stream);
 527         g_free(comp_param[i].originbuf);
 528         qemu_fclose(comp_param[i].file);
 529         comp_param[i].file = NULL;
 530     }
 531     qemu_mutex_destroy(&comp_done_lock);
 532     qemu_cond_destroy(&comp_done_cond);
 533     g_free(compress_threads);
 534     g_free(comp_param);
 535     compress_threads = NULL;
 536     comp_param = NULL;
 537 }
 538
 539 static int compress_threads_save_setup(void)
 540 {
 541     int i, thread_count;
 542
 543     if (!migrate_use_compression()) {
 544         return 0;
 545     }
 546     thread_count = migrate_compress_threads();
 547     compress_threads = g_new0(QemuThread, thread_count);
 548     comp_param = g_new0(CompressParam, thread_count);
 549     qemu_cond_init(&comp_done_cond);
 550     qemu_mutex_init(&comp_done_lock);
 551     for (i = 0; i < thread_count; i++) {
 552         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 553         if (!comp_param[i].originbuf) {
 554             goto exit;
 555         }
 556
 557         if (deflateInit(&comp_param[i].stream,
 558                         migrate_compress_level()) != Z_OK) {
 559             g_free(comp_param[i].originbuf);
 560             goto exit;
 561         }
 562
 563         /* comp_param[i].file is just used as a dummy buffer to save data,
 564          * set its ops to empty.
 565          */
 566         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 567         comp_param[i].done = true;
 568         comp_param[i].quit = false;
 569         qemu_mutex_init(&comp_param[i].mutex);
 570         qemu_cond_init(&comp_param[i].cond);
 571         qemu_thread_create(compress_threads + i, "compress",
 572                            do_data_compress, comp_param + i,
 573                            QEMU_THREAD_JOINABLE);
 574     }
 575     return 0;
 576
 577 exit:
 578     compress_threads_save_cleanup();
 579     return -1;
 580 }
 581
 582 /**
 583  * save_page_header: write page header to wire
 584  *
 585  * If this is the 1st block, it also writes the block identification
 586  *
 587  * Returns the number of bytes written
 588  *
 589  * @f: QEMUFile where to send the data
 590  * @block: block that contains the page we want to send
 591  * @offset: offset inside the block for the page
 592  *          in the lower bits, it contains flags
 593  */
 594 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 595                                ram_addr_t offset)
 596 {
 597     size_t size, len;
 598
 599     if (block == rs->last_sent_block) {
 600         offset |= RAM_SAVE_FLAG_CONTINUE;
 601     }
 602     qemu_put_be64(f, offset);
 603     size = 8;
 604
 605     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 606         len = strlen(block->idstr);
 607         qemu_put_byte(f, len);
 608         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 609         size += 1 + len;
 610         rs->last_sent_block = block;
 611     }
 612     return size;
 613 }
 614
 615 /**
 616  * mig_throttle_guest_down: throotle down the guest
 617  *
 618  * Reduce amount of guest cpu execution to hopefully slow down memory
 619  * writes. If guest dirty memory rate is reduced below the rate at
 620  * which we can transfer pages to the destination then we should be
 621  * able to complete migration. Some workloads dirty memory way too
 622  * fast and will not effectively converge, even with auto-converge.
 623  */
 624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 625                                     uint64_t bytes_dirty_threshold)
 626 {
 627     MigrationState *s = migrate_get_current();
 628     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 629     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 630     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 631     int pct_max = s->parameters.max_cpu_throttle;
 632
 633     uint64_t throttle_now = cpu_throttle_get_percentage();
 634     uint64_t cpu_now, cpu_ideal, throttle_inc;
 635
 636     /* We have not started throttling yet. Let's start it. */
 637     if (!cpu_throttle_active()) {
 638         cpu_throttle_set(pct_initial);
 639     } else {
 640         /* Throttling already on, just increase the rate */
 641         if (!pct_tailslow) {
 642             throttle_inc = pct_increment;
 643         } else {
 644             /* Compute the ideal CPU percentage used by Guest, which may
 645              * make the dirty rate match the dirty rate threshold. */
 646             cpu_now = 100 - throttle_now;
 647             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 648                         bytes_dirty_period);
 649             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 650         }
 651         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 652     }
 653 }
 654
 655 /**
 656  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 657  *
 658  * @rs: current RAM state
 659  * @current_addr: address for the zero page
 660  *
 661  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 662  * The important thing is that a stale (not-yet-0'd) page be replaced
 663  * by the new data.
 664  * As a bonus, if the page wasn't in the cache it gets added so that
 665  * when a small write is made into the 0'd page it gets XBZRLE sent.
 666  */
 667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 668 {
 669     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 670         return;
 671     }
 672
 673     /* We don't care if this fails to allocate a new cache page
 674      * as long as it updated an old one */
 675     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 676                  ram_counters.dirty_sync_count);
 677 }
 678
 679 #define ENCODING_FLAG_XBZRLE 0x1
 680
 681 /**
 682  * save_xbzrle_page: compress and send current page
 683  *
 684  * Returns: 1 means that we wrote the page
 685  *          0 means that page is identical to the one already sent
 686  *          -1 means that xbzrle would be longer than normal
 687  *
 688  * @rs: current RAM state
 689  * @current_data: pointer to the address of the page contents
 690  * @current_addr: addr of the page
 691  * @block: block that contains the page we want to send
 692  * @offset: offset inside the block for the page
 693  * @last_stage: if we are at the completion stage
 694  */
 695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 696                             ram_addr_t current_addr, RAMBlock *block,
 697                             ram_addr_t offset, bool last_stage)
 698 {
 699     int encoded_len = 0, bytes_xbzrle;
 700     uint8_t *prev_cached_page;
 701
 702     if (!cache_is_cached(XBZRLE.cache, current_addr,
 703                          ram_counters.dirty_sync_count)) {
 704         xbzrle_counters.cache_miss++;
 705         if (!last_stage) {
 706             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 707                              ram_counters.dirty_sync_count) == -1) {
 708                 return -1;
 709             } else {
 710                 /* update *current_data when the page has been
 711                    inserted into cache */
 712                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 713             }
 714         }
 715         return -1;
 716     }
 717
 718     /*
 719      * Reaching here means the page has hit the xbzrle cache, no matter what
 720      * encoding result it is (normal encoding, overflow or skipping the page),
 721      * count the page as encoded. This is used to calculate the encoding rate.
 722      *
 723      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 724      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 725      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 726      * skipped page included. In this way, the encoding rate can tell if the
 727      * guest page is good for xbzrle encoding.
 728      */
 729     xbzrle_counters.pages++;
 730     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 731
 732     /* save current buffer into memory */
 733     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 734
 735     /* XBZRLE encoding (if there is no overflow) */
 736     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 737                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 738                                        TARGET_PAGE_SIZE);
 739
 740     /*
 741      * Update the cache contents, so that it corresponds to the data
 742      * sent, in all cases except where we skip the page.
 743      */
 744     if (!last_stage && encoded_len != 0) {
 745         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 746         /*
 747          * In the case where we couldn't compress, ensure that the caller
 748          * sends the data from the cache, since the guest might have
 749          * changed the RAM since we copied it.
 750          */
 751         *current_data = prev_cached_page;
 752     }
 753
 754     if (encoded_len == 0) {
 755         trace_save_xbzrle_page_skipping();
 756         return 0;
 757     } else if (encoded_len == -1) {
 758         trace_save_xbzrle_page_overflow();
 759         xbzrle_counters.overflow++;
 760         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 761         return -1;
 762     }
 763
 764     /* Send XBZRLE based compressed page */
 765     bytes_xbzrle = save_page_header(rs, rs->f, block,
 766                                     offset | RAM_SAVE_FLAG_XBZRLE);
 767     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 768     qemu_put_be16(rs->f, encoded_len);
 769     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 770     bytes_xbzrle += encoded_len + 1 + 2;
 771     /*
 772      * Like compressed_size (please see update_compress_thread_counts),
 773      * the xbzrle encoded bytes don't count the 8 byte header with
 774      * RAM_SAVE_FLAG_CONTINUE.
 775      */
 776     xbzrle_counters.bytes += bytes_xbzrle - 8;
 777     ram_counters.transferred += bytes_xbzrle;
 778
 779     return 1;
 780 }
 781
 782 /**
 783  * migration_bitmap_find_dirty: find the next dirty page from start
 784  *
 785  * Returns the page offset within memory region of the start of a dirty page
 786  *
 787  * @rs: current RAM state
 788  * @rb: RAMBlock where to search for dirty pages
 789  * @start: page where we start the search
 790  */
 791 static inline
 792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 793                                           unsigned long start)
 794 {
 795     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 796     unsigned long *bitmap = rb->bmap;
 797     unsigned long next;
 798
 799     if (ramblock_is_ignored(rb)) {
 800         return size;
 801     }
 802
 803     /*
 804      * When the free page optimization is enabled, we need to check the bitmap
 805      * to send the non-free pages rather than all the pages in the bulk stage.
 806      */
 807     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 808         next = start + 1;
 809     } else {
 810         next = find_next_bit(bitmap, size, start);
 811     }
 812
 813     return next;
 814 }
 815
 816 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 817                                                 RAMBlock *rb,
 818                                                 unsigned long page)
 819 {
 820     bool ret;
 821
 822     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
 823
 824     /*
 825      * Clear dirty bitmap if needed.  This _must_ be called before we
 826      * send any of the page in the chunk because we need to make sure
 827      * we can capture further page content changes when we sync dirty
 828      * log the next time.  So as long as we are going to send any of
 829      * the page in the chunk we clear the remote dirty bitmap for all.
 830      * Clearing it earlier won't be a problem, but too late will.
 831      */
 832     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 833         uint8_t shift = rb->clear_bmap_shift;
 834         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 835         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 836
 837         /*
 838          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 839          * can make things easier sometimes since then start address
 840          * of the small chunk will always be 64 pages aligned so the
 841          * bitmap will always be aligned to unsigned long.  We should
 842          * even be able to remove this restriction but I'm simply
 843          * keeping it.
 844          */
 845         assert(shift >= 6);
 846         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 847         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 848     }
 849
 850     ret = test_and_clear_bit(page, rb->bmap);
 851
 852     if (ret) {
 853         rs->migration_dirty_pages--;
 854     }
 855
 856     return ret;
 857 }
 858
 859 /* Called with RCU critical section */
 860 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 861 {
 862     uint64_t new_dirty_pages =
 863         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 864
 865     rs->migration_dirty_pages += new_dirty_pages;
 866     rs->num_dirty_pages_period += new_dirty_pages;
 867 }
 868
 869 /**
 870  * ram_pagesize_summary: calculate all the pagesizes of a VM
 871  *
 872  * Returns a summary bitmap of the page sizes of all RAMBlocks
 873  *
 874  * For VMs with just normal pages this is equivalent to the host page
 875  * size. If it's got some huge pages then it's the OR of all the
 876  * different page sizes.
 877  */
 878 uint64_t ram_pagesize_summary(void)
 879 {
 880     RAMBlock *block;
 881     uint64_t summary = 0;
 882
 883     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 884         summary |= block->page_size;
 885     }
 886
 887     return summary;
 888 }
 889
 890 uint64_t ram_get_total_transferred_pages(void)
 891 {
 892     return  ram_counters.normal + ram_counters.duplicate +
 893                 compression_counters.pages + xbzrle_counters.pages;
 894 }
 895
 896 static void migration_update_rates(RAMState *rs, int64_t end_time)
 897 {
 898     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 899     double compressed_size;
 900
 901     /* calculate period counters */
 902     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 903                 / (end_time - rs->time_last_bitmap_sync);
 904
 905     if (!page_count) {
 906         return;
 907     }
 908
 909     if (migrate_use_xbzrle()) {
 910         double encoded_size, unencoded_size;
 911
 912         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 913             rs->xbzrle_cache_miss_prev) / page_count;
 914         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 915         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 916                          TARGET_PAGE_SIZE;
 917         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 918         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 919             xbzrle_counters.encoding_rate = 0;
 920         } else {
 921             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 922         }
 923         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 924         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 925     }
 926
 927     if (migrate_use_compression()) {
 928         compression_counters.busy_rate = (double)(compression_counters.busy -
 929             rs->compress_thread_busy_prev) / page_count;
 930         rs->compress_thread_busy_prev = compression_counters.busy;
 931
 932         compressed_size = compression_counters.compressed_size -
 933                           rs->compressed_size_prev;
 934         if (compressed_size) {
 935             double uncompressed_size = (compression_counters.pages -
 936                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 937
 938             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 939             compression_counters.compression_rate =
 940                                         uncompressed_size / compressed_size;
 941
 942             rs->compress_pages_prev = compression_counters.pages;
 943             rs->compressed_size_prev = compression_counters.compressed_size;
 944         }
 945     }
 946 }
 947
 948 static void migration_trigger_throttle(RAMState *rs)
 949 {
 950     MigrationState *s = migrate_get_current();
 951     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 952
 953     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 954     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 955     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 956
 957     /* During block migration the auto-converge logic incorrectly detects
 958      * that ram migration makes no progress. Avoid this by disabling the
 959      * throttling logic during the bulk phase of block migration. */
 960     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 961         /* The following detection logic can be refined later. For now:
 962            Check to see if the ratio between dirtied bytes and the approx.
 963            amount of bytes that just got transferred since the last time
 964            we were in this routine reaches the threshold. If that happens
 965            twice, start or increase throttling. */
 966
 967         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 968             (++rs->dirty_rate_high_cnt >= 2)) {
 969             trace_migration_throttle();
 970             rs->dirty_rate_high_cnt = 0;
 971             mig_throttle_guest_down(bytes_dirty_period,
 972                                     bytes_dirty_threshold);
 973         }
 974     }
 975 }
 976
 977 static void migration_bitmap_sync(RAMState *rs)
 978 {
 979     RAMBlock *block;
 980     int64_t end_time;
 981
 982     ram_counters.dirty_sync_count++;
 983
 984     if (!rs->time_last_bitmap_sync) {
 985         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 986     }
 987
 988     trace_migration_bitmap_sync_start();
 989     memory_global_dirty_log_sync();
 990
 991     qemu_mutex_lock(&rs->bitmap_mutex);
 992     WITH_RCU_READ_LOCK_GUARD() {
 993         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 994             ramblock_sync_dirty_bitmap(rs, block);
 995         }
 996         ram_counters.remaining = ram_bytes_remaining();
 997     }
 998     qemu_mutex_unlock(&rs->bitmap_mutex);
 999
1000     memory_global_after_dirty_log_sync();
1001     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1002
1003     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1004
1005     /* more than 1 second = 1000 millisecons */
1006     if (end_time > rs->time_last_bitmap_sync + 1000) {
1007         migration_trigger_throttle(rs);
1008
1009         migration_update_rates(rs, end_time);
1010
1011         rs->target_page_count_prev = rs->target_page_count;
1012
1013         /* reset period counters */
1014         rs->time_last_bitmap_sync = end_time;
1015         rs->num_dirty_pages_period = 0;
1016         rs->bytes_xfer_prev = ram_counters.transferred;
1017     }
1018     if (migrate_use_events()) {
1019         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1020     }
1021 }
1022
1023 static void migration_bitmap_sync_precopy(RAMState *rs)
1024 {
1025     Error *local_err = NULL;
1026
1027     /*
1028      * The current notifier usage is just an optimization to migration, so we
1029      * don't stop the normal migration process in the error case.
1030      */
1031     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1032         error_report_err(local_err);
1033         local_err = NULL;
1034     }
1035
1036     migration_bitmap_sync(rs);
1037
1038     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1039         error_report_err(local_err);
1040     }
1041 }
1042
1043 /**
1044  * save_zero_page_to_file: send the zero page to the file
1045  *
1046  * Returns the size of data written to the file, 0 means the page is not
1047  * a zero page
1048  *
1049  * @rs: current RAM state
1050  * @file: the file where the data is saved
1051  * @block: block that contains the page we want to send
1052  * @offset: offset inside the block for the page
1053  */
1054 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1055                                   RAMBlock *block, ram_addr_t offset)
1056 {
1057     uint8_t *p = block->host + offset;
1058     int len = 0;
1059
1060     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1061         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1062         qemu_put_byte(file, 0);
1063         len += 1;
1064     }
1065     return len;
1066 }
1067
1068 /**
1069  * save_zero_page: send the zero page to the stream
1070  *
1071  * Returns the number of pages written.
1072  *
1073  * @rs: current RAM state
1074  * @block: block that contains the page we want to send
1075  * @offset: offset inside the block for the page
1076  */
1077 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1078 {
1079     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1080
1081     if (len) {
1082         ram_counters.duplicate++;
1083         ram_counters.transferred += len;
1084         return 1;
1085     }
1086     return -1;
1087 }
1088
1089 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1090 {
1091     if (!migrate_release_ram() || !migration_in_postcopy()) {
1092         return;
1093     }
1094
1095     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1096 }
1097
1098 /*
1099  * @pages: the number of pages written by the control path,
1100  *        < 0 - error
1101  *        > 0 - number of pages written
1102  *
1103  * Return true if the pages has been saved, otherwise false is returned.
1104  */
1105 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1106                               int *pages)
1107 {
1108     uint64_t bytes_xmit = 0;
1109     int ret;
1110
1111     *pages = -1;
1112     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1113                                 &bytes_xmit);
1114     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1115         return false;
1116     }
1117
1118     if (bytes_xmit) {
1119         ram_counters.transferred += bytes_xmit;
1120         *pages = 1;
1121     }
1122
1123     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1124         return true;
1125     }
1126
1127     if (bytes_xmit > 0) {
1128         ram_counters.normal++;
1129     } else if (bytes_xmit == 0) {
1130         ram_counters.duplicate++;
1131     }
1132
1133     return true;
1134 }
1135
1136 /*
1137  * directly send the page to the stream
1138  *
1139  * Returns the number of pages written.
1140  *
1141  * @rs: current RAM state
1142  * @block: block that contains the page we want to send
1143  * @offset: offset inside the block for the page
1144  * @buf: the page to be sent
1145  * @async: send to page asyncly
1146  */
1147 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1148                             uint8_t *buf, bool async)
1149 {
1150     ram_counters.transferred += save_page_header(rs, rs->f, block,
1151                                                  offset | RAM_SAVE_FLAG_PAGE);
1152     if (async) {
1153         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1154                               migrate_release_ram() &
1155                               migration_in_postcopy());
1156     } else {
1157         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1158     }
1159     ram_counters.transferred += TARGET_PAGE_SIZE;
1160     ram_counters.normal++;
1161     return 1;
1162 }
1163
1164 /**
1165  * ram_save_page: send the given page to the stream
1166  *
1167  * Returns the number of pages written.
1168  *          < 0 - error
1169  *          >=0 - Number of pages written - this might legally be 0
1170  *                if xbzrle noticed the page was the same.
1171  *
1172  * @rs: current RAM state
1173  * @block: block that contains the page we want to send
1174  * @offset: offset inside the block for the page
1175  * @last_stage: if we are at the completion stage
1176  */
1177 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1178 {
1179     int pages = -1;
1180     uint8_t *p;
1181     bool send_async = true;
1182     RAMBlock *block = pss->block;
1183     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1184     ram_addr_t current_addr = block->offset + offset;
1185
1186     p = block->host + offset;
1187     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1188
1189     XBZRLE_cache_lock();
1190     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1191         migrate_use_xbzrle()) {
1192         pages = save_xbzrle_page(rs, &p, current_addr, block,
1193                                  offset, last_stage);
1194         if (!last_stage) {
1195             /* Can't send this cached data async, since the cache page
1196              * might get updated before it gets to the wire
1197              */
1198             send_async = false;
1199         }
1200     }
1201
1202     /* XBZRLE overflow or normal page */
1203     if (pages == -1) {
1204         pages = save_normal_page(rs, block, offset, p, send_async);
1205     }
1206
1207     XBZRLE_cache_unlock();
1208
1209     return pages;
1210 }
1211
1212 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1213                                  ram_addr_t offset)
1214 {
1215     if (multifd_queue_page(rs->f, block, offset) < 0) {
1216         return -1;
1217     }
1218     ram_counters.normal++;
1219
1220     return 1;
1221 }
1222
1223 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1224                                  ram_addr_t offset, uint8_t *source_buf)
1225 {
1226     RAMState *rs = ram_state;
1227     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1228     bool zero_page = false;
1229     int ret;
1230
1231     if (save_zero_page_to_file(rs, f, block, offset)) {
1232         zero_page = true;
1233         goto exit;
1234     }
1235
1236     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1237
1238     /*
1239      * copy it to a internal buffer to avoid it being modified by VM
1240      * so that we can catch up the error during compression and
1241      * decompression
1242      */
1243     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1244     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1245     if (ret < 0) {
1246         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1247         error_report("compressed data failed!");
1248         return false;
1249     }
1250
1251 exit:
1252     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1253     return zero_page;
1254 }
1255
1256 static void
1257 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1258 {
1259     ram_counters.transferred += bytes_xmit;
1260
1261     if (param->zero_page) {
1262         ram_counters.duplicate++;
1263         return;
1264     }
1265
1266     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1267     compression_counters.compressed_size += bytes_xmit - 8;
1268     compression_counters.pages++;
1269 }
1270
1271 static bool save_page_use_compression(RAMState *rs);
1272
1273 static void flush_compressed_data(RAMState *rs)
1274 {
1275     int idx, len, thread_count;
1276
1277     if (!save_page_use_compression(rs)) {
1278         return;
1279     }
1280     thread_count = migrate_compress_threads();
1281
1282     qemu_mutex_lock(&comp_done_lock);
1283     for (idx = 0; idx < thread_count; idx++) {
1284         while (!comp_param[idx].done) {
1285             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1286         }
1287     }
1288     qemu_mutex_unlock(&comp_done_lock);
1289
1290     for (idx = 0; idx < thread_count; idx++) {
1291         qemu_mutex_lock(&comp_param[idx].mutex);
1292         if (!comp_param[idx].quit) {
1293             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1294             /*
1295              * it's safe to fetch zero_page without holding comp_done_lock
1296              * as there is no further request submitted to the thread,
1297              * i.e, the thread should be waiting for a request at this point.
1298              */
1299             update_compress_thread_counts(&comp_param[idx], len);
1300         }
1301         qemu_mutex_unlock(&comp_param[idx].mutex);
1302     }
1303 }
1304
1305 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1306                                        ram_addr_t offset)
1307 {
1308     param->block = block;
1309     param->offset = offset;
1310 }
1311
1312 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1313                                            ram_addr_t offset)
1314 {
1315     int idx, thread_count, bytes_xmit = -1, pages = -1;
1316     bool wait = migrate_compress_wait_thread();
1317
1318     thread_count = migrate_compress_threads();
1319     qemu_mutex_lock(&comp_done_lock);
1320 retry:
1321     for (idx = 0; idx < thread_count; idx++) {
1322         if (comp_param[idx].done) {
1323             comp_param[idx].done = false;
1324             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1325             qemu_mutex_lock(&comp_param[idx].mutex);
1326             set_compress_params(&comp_param[idx], block, offset);
1327             qemu_cond_signal(&comp_param[idx].cond);
1328             qemu_mutex_unlock(&comp_param[idx].mutex);
1329             pages = 1;
1330             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1331             break;
1332         }
1333     }
1334
1335     /*
1336      * wait for the free thread if the user specifies 'compress-wait-thread',
1337      * otherwise we will post the page out in the main thread as normal page.
1338      */
1339     if (pages < 0 && wait) {
1340         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1341         goto retry;
1342     }
1343     qemu_mutex_unlock(&comp_done_lock);
1344
1345     return pages;
1346 }
1347
1348 /**
1349  * find_dirty_block: find the next dirty page and update any state
1350  * associated with the search process.
1351  *
1352  * Returns true if a page is found
1353  *
1354  * @rs: current RAM state
1355  * @pss: data about the state of the current dirty page scan
1356  * @again: set to false if the search has scanned the whole of RAM
1357  */
1358 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1359 {
1360     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1361     if (pss->complete_round && pss->block == rs->last_seen_block &&
1362         pss->page >= rs->last_page) {
1363         /*
1364          * We've been once around the RAM and haven't found anything.
1365          * Give up.
1366          */
1367         *again = false;
1368         return false;
1369     }
1370     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1371         >= pss->block->used_length) {
1372         /* Didn't find anything in this RAM Block */
1373         pss->page = 0;
1374         pss->block = QLIST_NEXT_RCU(pss->block, next);
1375         if (!pss->block) {
1376             /*
1377              * If memory migration starts over, we will meet a dirtied page
1378              * which may still exists in compression threads's ring, so we
1379              * should flush the compressed data to make sure the new page
1380              * is not overwritten by the old one in the destination.
1381              *
1382              * Also If xbzrle is on, stop using the data compression at this
1383              * point. In theory, xbzrle can do better than compression.
1384              */
1385             flush_compressed_data(rs);
1386
1387             /* Hit the end of the list */
1388             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1389             /* Flag that we've looped */
1390             pss->complete_round = true;
1391             rs->ram_bulk_stage = false;
1392         }
1393         /* Didn't find anything this time, but try again on the new block */
1394         *again = true;
1395         return false;
1396     } else {
1397         /* Can go around again, but... */
1398         *again = true;
1399         /* We've found something so probably don't need to */
1400         return true;
1401     }
1402 }
1403
1404 /**
1405  * unqueue_page: gets a page of the queue
1406  *
1407  * Helper for 'get_queued_page' - gets a page off the queue
1408  *
1409  * Returns the block of the page (or NULL if none available)
1410  *
1411  * @rs: current RAM state
1412  * @offset: used to return the offset within the RAMBlock
1413  */
1414 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1415 {
1416     RAMBlock *block = NULL;
1417
1418     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1419         return NULL;
1420     }
1421
1422     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1423     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1424         struct RAMSrcPageRequest *entry =
1425                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1426         block = entry->rb;
1427         *offset = entry->offset;
1428
1429         if (entry->len > TARGET_PAGE_SIZE) {
1430             entry->len -= TARGET_PAGE_SIZE;
1431             entry->offset += TARGET_PAGE_SIZE;
1432         } else {
1433             memory_region_unref(block->mr);
1434             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435             g_free(entry);
1436             migration_consume_urgent_request();
1437         }
1438     }
1439
1440     return block;
1441 }
1442
1443 #if defined(__linux__)
1444 /**
1445  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1446  *   is found, return RAM block pointer and page offset
1447  *
1448  * Returns pointer to the RAMBlock containing faulting page,
1449  *   NULL if no write faults are pending
1450  *
1451  * @rs: current RAM state
1452  * @offset: page offset from the beginning of the block
1453  */
1454 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1455 {
1456     struct uffd_msg uffd_msg;
1457     void *page_address;
1458     RAMBlock *bs;
1459     int res;
1460
1461     if (!migrate_background_snapshot()) {
1462         return NULL;
1463     }
1464
1465     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1466     if (res <= 0) {
1467         return NULL;
1468     }
1469
1470     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1471     bs = qemu_ram_block_from_host(page_address, false, offset);
1472     assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
1473     return bs;
1474 }
1475
1476 /**
1477  * ram_save_release_protection: release UFFD write protection after
1478  *   a range of pages has been saved
1479  *
1480  * @rs: current RAM state
1481  * @pss: page-search-status structure
1482  * @start_page: index of the first page in the range relative to pss->block
1483  *
1484  * Returns 0 on success, negative value in case of an error
1485 */
1486 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1487         unsigned long start_page)
1488 {
1489     int res = 0;
1490
1491     /* Check if page is from UFFD-managed region. */
1492     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1493         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1494         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1495
1496         /* Flush async buffers before un-protect. */
1497         qemu_fflush(rs->f);
1498         /* Un-protect memory range. */
1499         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1500                 false, false);
1501     }
1502
1503     return res;
1504 }
1505
1506 /* ram_write_tracking_available: check if kernel supports required UFFD features
1507  *
1508  * Returns true if supports, false otherwise
1509  */
1510 bool ram_write_tracking_available(void)
1511 {
1512     uint64_t uffd_features;
1513     int res;
1514
1515     res = uffd_query_features(&uffd_features);
1516     return (res == 0 &&
1517             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1518 }
1519
1520 /* ram_write_tracking_compatible: check if guest configuration is
1521  *   compatible with 'write-tracking'
1522  *
1523  * Returns true if compatible, false otherwise
1524  */
1525 bool ram_write_tracking_compatible(void)
1526 {
1527     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1528     int uffd_fd;
1529     RAMBlock *bs;
1530     bool ret = false;
1531
1532     /* Open UFFD file descriptor */
1533     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1534     if (uffd_fd < 0) {
1535         return false;
1536     }
1537
1538     RCU_READ_LOCK_GUARD();
1539
1540     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1541         uint64_t uffd_ioctls;
1542
1543         /* Nothing to do with read-only and MMIO-writable regions */
1544         if (bs->mr->readonly || bs->mr->rom_device) {
1545             continue;
1546         }
1547         /* Try to register block memory via UFFD-IO to track writes */
1548         if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
1549                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1550             goto out;
1551         }
1552         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1553             goto out;
1554         }
1555     }
1556     ret = true;
1557
1558 out:
1559     uffd_close_fd(uffd_fd);
1560     return ret;
1561 }
1562
1563 /*
1564  * ram_write_tracking_start: start UFFD-WP memory tracking
1565  *
1566  * Returns 0 for success or negative value in case of error
1567  */
1568 int ram_write_tracking_start(void)
1569 {
1570     int uffd_fd;
1571     RAMState *rs = ram_state;
1572     RAMBlock *bs;
1573
1574     /* Open UFFD file descriptor */
1575     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1576     if (uffd_fd < 0) {
1577         return uffd_fd;
1578     }
1579     rs->uffdio_fd = uffd_fd;
1580
1581     RCU_READ_LOCK_GUARD();
1582
1583     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1584         /* Nothing to do with read-only and MMIO-writable regions */
1585         if (bs->mr->readonly || bs->mr->rom_device) {
1586             continue;
1587         }
1588
1589         /* Register block memory with UFFD to track writes */
1590         if (uffd_register_memory(rs->uffdio_fd, bs->host,
1591                 bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1592             goto fail;
1593         }
1594         /* Apply UFFD write protection to the block memory range */
1595         if (uffd_change_protection(rs->uffdio_fd, bs->host,
1596                 bs->max_length, true, false)) {
1597             goto fail;
1598         }
1599         bs->flags |= RAM_UF_WRITEPROTECT;
1600         memory_region_ref(bs->mr);
1601
1602         trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
1603                 bs->host, bs->max_length);
1604     }
1605
1606     return 0;
1607
1608 fail:
1609     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1610
1611     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1612         if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1613             continue;
1614         }
1615         /*
1616          * In case some memory block failed to be write-protected
1617          * remove protection and unregister all succeeded RAM blocks
1618          */
1619         uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1620         uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1621         /* Cleanup flags and remove reference */
1622         bs->flags &= ~RAM_UF_WRITEPROTECT;
1623         memory_region_unref(bs->mr);
1624     }
1625
1626     uffd_close_fd(uffd_fd);
1627     rs->uffdio_fd = -1;
1628     return -1;
1629 }
1630
1631 /**
1632  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1633  */
1634 void ram_write_tracking_stop(void)
1635 {
1636     RAMState *rs = ram_state;
1637     RAMBlock *bs;
1638
1639     RCU_READ_LOCK_GUARD();
1640
1641     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1642         if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1643             continue;
1644         }
1645         /* Remove protection and unregister all affected RAM blocks */
1646         uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1647         uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1648
1649         trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
1650                 bs->host, bs->max_length);
1651
1652         /* Cleanup flags and remove reference */
1653         bs->flags &= ~RAM_UF_WRITEPROTECT;
1654         memory_region_unref(bs->mr);
1655     }
1656
1657     /* Finally close UFFD file descriptor */
1658     uffd_close_fd(rs->uffdio_fd);
1659     rs->uffdio_fd = -1;
1660 }
1661
1662 #else
1663 /* No target OS support, stubs just fail or ignore */
1664
1665 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1666 {
1667     (void) rs;
1668     (void) offset;
1669
1670     return NULL;
1671 }
1672
1673 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1674         unsigned long start_page)
1675 {
1676     (void) rs;
1677     (void) pss;
1678     (void) start_page;
1679
1680     return 0;
1681 }
1682
1683 bool ram_write_tracking_available(void)
1684 {
1685     return false;
1686 }
1687
1688 bool ram_write_tracking_compatible(void)
1689 {
1690     assert(0);
1691     return false;
1692 }
1693
1694 int ram_write_tracking_start(void)
1695 {
1696     assert(0);
1697     return -1;
1698 }
1699
1700 void ram_write_tracking_stop(void)
1701 {
1702     assert(0);
1703 }
1704 #endif /* defined(__linux__) */
1705
1706 /**
1707  * get_queued_page: unqueue a page from the postcopy requests
1708  *
1709  * Skips pages that are already sent (!dirty)
1710  *
1711  * Returns true if a queued page is found
1712  *
1713  * @rs: current RAM state
1714  * @pss: data about the state of the current dirty page scan
1715  */
1716 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1717 {
1718     RAMBlock  *block;
1719     ram_addr_t offset;
1720     bool dirty;
1721
1722     do {
1723         block = unqueue_page(rs, &offset);
1724         /*
1725          * We're sending this page, and since it's postcopy nothing else
1726          * will dirty it, and we must make sure it doesn't get sent again
1727          * even if this queue request was received after the background
1728          * search already sent it.
1729          */
1730         if (block) {
1731             unsigned long page;
1732
1733             page = offset >> TARGET_PAGE_BITS;
1734             dirty = test_bit(page, block->bmap);
1735             if (!dirty) {
1736                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1737                                                 page);
1738             } else {
1739                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1740             }
1741         }
1742
1743     } while (block && !dirty);
1744
1745     if (!block) {
1746         /*
1747          * Poll write faults too if background snapshot is enabled; that's
1748          * when we have vcpus got blocked by the write protected pages.
1749          */
1750         block = poll_fault_page(rs, &offset);
1751     }
1752
1753     if (block) {
1754         /*
1755          * As soon as we start servicing pages out of order, then we have
1756          * to kill the bulk stage, since the bulk stage assumes
1757          * in (migration_bitmap_find_and_reset_dirty) that every page is
1758          * dirty, that's no longer true.
1759          */
1760         rs->ram_bulk_stage = false;
1761
1762         /*
1763          * We want the background search to continue from the queued page
1764          * since the guest is likely to want other pages near to the page
1765          * it just requested.
1766          */
1767         pss->block = block;
1768         pss->page = offset >> TARGET_PAGE_BITS;
1769
1770         /*
1771          * This unqueued page would break the "one round" check, even is
1772          * really rare.
1773          */
1774         pss->complete_round = false;
1775     }
1776
1777     return !!block;
1778 }
1779
1780 /**
1781  * migration_page_queue_free: drop any remaining pages in the ram
1782  * request queue
1783  *
1784  * It should be empty at the end anyway, but in error cases there may
1785  * be some left.  in case that there is any page left, we drop it.
1786  *
1787  */
1788 static void migration_page_queue_free(RAMState *rs)
1789 {
1790     struct RAMSrcPageRequest *mspr, *next_mspr;
1791     /* This queue generally should be empty - but in the case of a failed
1792      * migration might have some droppings in.
1793      */
1794     RCU_READ_LOCK_GUARD();
1795     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1796         memory_region_unref(mspr->rb->mr);
1797         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1798         g_free(mspr);
1799     }
1800 }
1801
1802 /**
1803  * ram_save_queue_pages: queue the page for transmission
1804  *
1805  * A request from postcopy destination for example.
1806  *
1807  * Returns zero on success or negative on error
1808  *
1809  * @rbname: Name of the RAMBLock of the request. NULL means the
1810  *          same that last one.
1811  * @start: starting address from the start of the RAMBlock
1812  * @len: length (in bytes) to send
1813  */
1814 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1815 {
1816     RAMBlock *ramblock;
1817     RAMState *rs = ram_state;
1818
1819     ram_counters.postcopy_requests++;
1820     RCU_READ_LOCK_GUARD();
1821
1822     if (!rbname) {
1823         /* Reuse last RAMBlock */
1824         ramblock = rs->last_req_rb;
1825
1826         if (!ramblock) {
1827             /*
1828              * Shouldn't happen, we can't reuse the last RAMBlock if
1829              * it's the 1st request.
1830              */
1831             error_report("ram_save_queue_pages no previous block");
1832             return -1;
1833         }
1834     } else {
1835         ramblock = qemu_ram_block_by_name(rbname);
1836
1837         if (!ramblock) {
1838             /* We shouldn't be asked for a non-existent RAMBlock */
1839             error_report("ram_save_queue_pages no block '%s'", rbname);
1840             return -1;
1841         }
1842         rs->last_req_rb = ramblock;
1843     }
1844     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1845     if (start + len > ramblock->used_length) {
1846         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1847                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1848                      __func__, start, len, ramblock->used_length);
1849         return -1;
1850     }
1851
1852     struct RAMSrcPageRequest *new_entry =
1853         g_malloc0(sizeof(struct RAMSrcPageRequest));
1854     new_entry->rb = ramblock;
1855     new_entry->offset = start;
1856     new_entry->len = len;
1857
1858     memory_region_ref(ramblock->mr);
1859     qemu_mutex_lock(&rs->src_page_req_mutex);
1860     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1861     migration_make_urgent_request();
1862     qemu_mutex_unlock(&rs->src_page_req_mutex);
1863
1864     return 0;
1865 }
1866
1867 static bool save_page_use_compression(RAMState *rs)
1868 {
1869     if (!migrate_use_compression()) {
1870         return false;
1871     }
1872
1873     /*
1874      * If xbzrle is on, stop using the data compression after first
1875      * round of migration even if compression is enabled. In theory,
1876      * xbzrle can do better than compression.
1877      */
1878     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1879         return true;
1880     }
1881
1882     return false;
1883 }
1884
1885 /*
1886  * try to compress the page before posting it out, return true if the page
1887  * has been properly handled by compression, otherwise needs other
1888  * paths to handle it
1889  */
1890 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1891 {
1892     if (!save_page_use_compression(rs)) {
1893         return false;
1894     }
1895
1896     /*
1897      * When starting the process of a new block, the first page of
1898      * the block should be sent out before other pages in the same
1899      * block, and all the pages in last block should have been sent
1900      * out, keeping this order is important, because the 'cont' flag
1901      * is used to avoid resending the block name.
1902      *
1903      * We post the fist page as normal page as compression will take
1904      * much CPU resource.
1905      */
1906     if (block != rs->last_sent_block) {
1907         flush_compressed_data(rs);
1908         return false;
1909     }
1910
1911     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1912         return true;
1913     }
1914
1915     compression_counters.busy++;
1916     return false;
1917 }
1918
1919 /**
1920  * ram_save_target_page: save one target page
1921  *
1922  * Returns the number of pages written
1923  *
1924  * @rs: current RAM state
1925  * @pss: data about the page we want to send
1926  * @last_stage: if we are at the completion stage
1927  */
1928 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1929                                 bool last_stage)
1930 {
1931     RAMBlock *block = pss->block;
1932     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1933     int res;
1934
1935     if (control_save_page(rs, block, offset, &res)) {
1936         return res;
1937     }
1938
1939     if (save_compress_page(rs, block, offset)) {
1940         return 1;
1941     }
1942
1943     res = save_zero_page(rs, block, offset);
1944     if (res > 0) {
1945         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1946          * page would be stale
1947          */
1948         if (!save_page_use_compression(rs)) {
1949             XBZRLE_cache_lock();
1950             xbzrle_cache_zero_page(rs, block->offset + offset);
1951             XBZRLE_cache_unlock();
1952         }
1953         ram_release_pages(block->idstr, offset, res);
1954         return res;
1955     }
1956
1957     /*
1958      * Do not use multifd for:
1959      * 1. Compression as the first page in the new block should be posted out
1960      *    before sending the compressed page
1961      * 2. In postcopy as one whole host page should be placed
1962      */
1963     if (!save_page_use_compression(rs) && migrate_use_multifd()
1964         && !migration_in_postcopy()) {
1965         return ram_save_multifd_page(rs, block, offset);
1966     }
1967
1968     return ram_save_page(rs, pss, last_stage);
1969 }
1970
1971 /**
1972  * ram_save_host_page: save a whole host page
1973  *
1974  * Starting at *offset send pages up to the end of the current host
1975  * page. It's valid for the initial offset to point into the middle of
1976  * a host page in which case the remainder of the hostpage is sent.
1977  * Only dirty target pages are sent. Note that the host page size may
1978  * be a huge page for this block.
1979  * The saving stops at the boundary of the used_length of the block
1980  * if the RAMBlock isn't a multiple of the host page size.
1981  *
1982  * Returns the number of pages written or negative on error
1983  *
1984  * @rs: current RAM state
1985  * @ms: current migration state
1986  * @pss: data about the page we want to send
1987  * @last_stage: if we are at the completion stage
1988  */
1989 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1990                               bool last_stage)
1991 {
1992     int tmppages, pages = 0;
1993     size_t pagesize_bits =
1994         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1995     unsigned long start_page = pss->page;
1996     int res;
1997
1998     if (ramblock_is_ignored(pss->block)) {
1999         error_report("block %s should not be migrated !", pss->block->idstr);
2000         return 0;
2001     }
2002
2003     do {
2004         /* Check the pages is dirty and if it is send it */
2005         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2006             pss->page++;
2007             continue;
2008         }
2009
2010         tmppages = ram_save_target_page(rs, pss, last_stage);
2011         if (tmppages < 0) {
2012             return tmppages;
2013         }
2014
2015         pages += tmppages;
2016         pss->page++;
2017         /* Allow rate limiting to happen in the middle of huge pages */
2018         migration_rate_limit();
2019     } while ((pss->page & (pagesize_bits - 1)) &&
2020              offset_in_ramblock(pss->block,
2021                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2022     /* The offset we leave with is the last one we looked at */
2023     pss->page--;
2024
2025     res = ram_save_release_protection(rs, pss, start_page);
2026     return (res < 0 ? res : pages);
2027 }
2028
2029 /**
2030  * ram_find_and_save_block: finds a dirty page and sends it to f
2031  *
2032  * Called within an RCU critical section.
2033  *
2034  * Returns the number of pages written where zero means no dirty pages,
2035  * or negative on error
2036  *
2037  * @rs: current RAM state
2038  * @last_stage: if we are at the completion stage
2039  *
2040  * On systems where host-page-size > target-page-size it will send all the
2041  * pages in a host page that are dirty.
2042  */
2043
2044 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2045 {
2046     PageSearchStatus pss;
2047     int pages = 0;
2048     bool again, found;
2049
2050     /* No dirty page as there is zero RAM */
2051     if (!ram_bytes_total()) {
2052         return pages;
2053     }
2054
2055     pss.block = rs->last_seen_block;
2056     pss.page = rs->last_page;
2057     pss.complete_round = false;
2058
2059     if (!pss.block) {
2060         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2061     }
2062
2063     do {
2064         again = true;
2065         found = get_queued_page(rs, &pss);
2066
2067         if (!found) {
2068             /* priority queue empty, so just search for something dirty */
2069             found = find_dirty_block(rs, &pss, &again);
2070         }
2071
2072         if (found) {
2073             pages = ram_save_host_page(rs, &pss, last_stage);
2074         }
2075     } while (!pages && again);
2076
2077     rs->last_seen_block = pss.block;
2078     rs->last_page = pss.page;
2079
2080     return pages;
2081 }
2082
2083 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2084 {
2085     uint64_t pages = size / TARGET_PAGE_SIZE;
2086
2087     if (zero) {
2088         ram_counters.duplicate += pages;
2089     } else {
2090         ram_counters.normal += pages;
2091         ram_counters.transferred += size;
2092         qemu_update_position(f, size);
2093     }
2094 }
2095
2096 static uint64_t ram_bytes_total_common(bool count_ignored)
2097 {
2098     RAMBlock *block;
2099     uint64_t total = 0;
2100
2101     RCU_READ_LOCK_GUARD();
2102
2103     if (count_ignored) {
2104         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2105             total += block->used_length;
2106         }
2107     } else {
2108         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2109             total += block->used_length;
2110         }
2111     }
2112     return total;
2113 }
2114
2115 uint64_t ram_bytes_total(void)
2116 {
2117     return ram_bytes_total_common(false);
2118 }
2119
2120 static void xbzrle_load_setup(void)
2121 {
2122     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2123 }
2124
2125 static void xbzrle_load_cleanup(void)
2126 {
2127     g_free(XBZRLE.decoded_buf);
2128     XBZRLE.decoded_buf = NULL;
2129 }
2130
2131 static void ram_state_cleanup(RAMState **rsp)
2132 {
2133     if (*rsp) {
2134         migration_page_queue_free(*rsp);
2135         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2136         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2137         g_free(*rsp);
2138         *rsp = NULL;
2139     }
2140 }
2141
2142 static void xbzrle_cleanup(void)
2143 {
2144     XBZRLE_cache_lock();
2145     if (XBZRLE.cache) {
2146         cache_fini(XBZRLE.cache);
2147         g_free(XBZRLE.encoded_buf);
2148         g_free(XBZRLE.current_buf);
2149         g_free(XBZRLE.zero_target_page);
2150         XBZRLE.cache = NULL;
2151         XBZRLE.encoded_buf = NULL;
2152         XBZRLE.current_buf = NULL;
2153         XBZRLE.zero_target_page = NULL;
2154     }
2155     XBZRLE_cache_unlock();
2156 }
2157
2158 static void ram_save_cleanup(void *opaque)
2159 {
2160     RAMState **rsp = opaque;
2161     RAMBlock *block;
2162
2163     /* We don't use dirty log with background snapshots */
2164     if (!migrate_background_snapshot()) {
2165         /* caller have hold iothread lock or is in a bh, so there is
2166          * no writing race against the migration bitmap
2167          */
2168         memory_global_dirty_log_stop();
2169     }
2170
2171     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2172         g_free(block->clear_bmap);
2173         block->clear_bmap = NULL;
2174         g_free(block->bmap);
2175         block->bmap = NULL;
2176     }
2177
2178     xbzrle_cleanup();
2179     compress_threads_save_cleanup();
2180     ram_state_cleanup(rsp);
2181 }
2182
2183 static void ram_state_reset(RAMState *rs)
2184 {
2185     rs->last_seen_block = NULL;
2186     rs->last_sent_block = NULL;
2187     rs->last_page = 0;
2188     rs->last_version = ram_list.version;
2189     rs->ram_bulk_stage = true;
2190     rs->fpo_enabled = false;
2191 }
2192
2193 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2194
2195 /*
2196  * 'expected' is the value you expect the bitmap mostly to be full
2197  * of; it won't bother printing lines that are all this value.
2198  * If 'todump' is null the migration bitmap is dumped.
2199  */
2200 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2201                            unsigned long pages)
2202 {
2203     int64_t cur;
2204     int64_t linelen = 128;
2205     char linebuf[129];
2206
2207     for (cur = 0; cur < pages; cur += linelen) {
2208         int64_t curb;
2209         bool found = false;
2210         /*
2211          * Last line; catch the case where the line length
2212          * is longer than remaining ram
2213          */
2214         if (cur + linelen > pages) {
2215             linelen = pages - cur;
2216         }
2217         for (curb = 0; curb < linelen; curb++) {
2218             bool thisbit = test_bit(cur + curb, todump);
2219             linebuf[curb] = thisbit ? '1' : '.';
2220             found = found || (thisbit != expected);
2221         }
2222         if (found) {
2223             linebuf[curb] = '\0';
2224             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2225         }
2226     }
2227 }
2228
2229 /* **** functions for postcopy ***** */
2230
2231 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2232 {
2233     struct RAMBlock *block;
2234
2235     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2236         unsigned long *bitmap = block->bmap;
2237         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2238         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2239
2240         while (run_start < range) {
2241             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2242             ram_discard_range(block->idstr,
2243                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2244                               ((ram_addr_t)(run_end - run_start))
2245                                 << TARGET_PAGE_BITS);
2246             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2247         }
2248     }
2249 }
2250
2251 /**
2252  * postcopy_send_discard_bm_ram: discard a RAMBlock
2253  *
2254  * Returns zero on success
2255  *
2256  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2257  *
2258  * @ms: current migration state
2259  * @block: RAMBlock to discard
2260  */
2261 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2262 {
2263     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2264     unsigned long current;
2265     unsigned long *bitmap = block->bmap;
2266
2267     for (current = 0; current < end; ) {
2268         unsigned long one = find_next_bit(bitmap, end, current);
2269         unsigned long zero, discard_length;
2270
2271         if (one >= end) {
2272             break;
2273         }
2274
2275         zero = find_next_zero_bit(bitmap, end, one + 1);
2276
2277         if (zero >= end) {
2278             discard_length = end - one;
2279         } else {
2280             discard_length = zero - one;
2281         }
2282         postcopy_discard_send_range(ms, one, discard_length);
2283         current = one + discard_length;
2284     }
2285
2286     return 0;
2287 }
2288
2289 /**
2290  * postcopy_each_ram_send_discard: discard all RAMBlocks
2291  *
2292  * Returns 0 for success or negative for error
2293  *
2294  * Utility for the outgoing postcopy code.
2295  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2296  *   passing it bitmap indexes and name.
2297  * (qemu_ram_foreach_block ends up passing unscaled lengths
2298  *  which would mean postcopy code would have to deal with target page)
2299  *
2300  * @ms: current migration state
2301  */
2302 static int postcopy_each_ram_send_discard(MigrationState *ms)
2303 {
2304     struct RAMBlock *block;
2305     int ret;
2306
2307     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2308         postcopy_discard_send_init(ms, block->idstr);
2309
2310         /*
2311          * Postcopy sends chunks of bitmap over the wire, but it
2312          * just needs indexes at this point, avoids it having
2313          * target page specific code.
2314          */
2315         ret = postcopy_send_discard_bm_ram(ms, block);
2316         postcopy_discard_send_finish(ms);
2317         if (ret) {
2318             return ret;
2319         }
2320     }
2321
2322     return 0;
2323 }
2324
2325 /**
2326  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2327  *
2328  * Helper for postcopy_chunk_hostpages; it's called twice to
2329  * canonicalize the two bitmaps, that are similar, but one is
2330  * inverted.
2331  *
2332  * Postcopy requires that all target pages in a hostpage are dirty or
2333  * clean, not a mix.  This function canonicalizes the bitmaps.
2334  *
2335  * @ms: current migration state
2336  * @block: block that contains the page we want to canonicalize
2337  */
2338 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2339 {
2340     RAMState *rs = ram_state;
2341     unsigned long *bitmap = block->bmap;
2342     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2343     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2344     unsigned long run_start;
2345
2346     if (block->page_size == TARGET_PAGE_SIZE) {
2347         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2348         return;
2349     }
2350
2351     /* Find a dirty page */
2352     run_start = find_next_bit(bitmap, pages, 0);
2353
2354     while (run_start < pages) {
2355
2356         /*
2357          * If the start of this run of pages is in the middle of a host
2358          * page, then we need to fixup this host page.
2359          */
2360         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2361             /* Find the end of this run */
2362             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2363             /*
2364              * If the end isn't at the start of a host page, then the
2365              * run doesn't finish at the end of a host page
2366              * and we need to discard.
2367              */
2368         }
2369
2370         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2371             unsigned long page;
2372             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2373                                                              host_ratio);
2374             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2375
2376             /* Clean up the bitmap */
2377             for (page = fixup_start_addr;
2378                  page < fixup_start_addr + host_ratio; page++) {
2379                 /*
2380                  * Remark them as dirty, updating the count for any pages
2381                  * that weren't previously dirty.
2382                  */
2383                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2384             }
2385         }
2386
2387         /* Find the next dirty page for the next iteration */
2388         run_start = find_next_bit(bitmap, pages, run_start);
2389     }
2390 }
2391
2392 /**
2393  * postcopy_chunk_hostpages: discard any partially sent host page
2394  *
2395  * Utility for the outgoing postcopy code.
2396  *
2397  * Discard any partially sent host-page size chunks, mark any partially
2398  * dirty host-page size chunks as all dirty.  In this case the host-page
2399  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2400  *
2401  * Returns zero on success
2402  *
2403  * @ms: current migration state
2404  * @block: block we want to work with
2405  */
2406 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2407 {
2408     postcopy_discard_send_init(ms, block->idstr);
2409
2410     /*
2411      * Ensure that all partially dirty host pages are made fully dirty.
2412      */
2413     postcopy_chunk_hostpages_pass(ms, block);
2414
2415     postcopy_discard_send_finish(ms);
2416     return 0;
2417 }
2418
2419 /**
2420  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2421  *
2422  * Returns zero on success
2423  *
2424  * Transmit the set of pages to be discarded after precopy to the target
2425  * these are pages that:
2426  *     a) Have been previously transmitted but are now dirty again
2427  *     b) Pages that have never been transmitted, this ensures that
2428  *        any pages on the destination that have been mapped by background
2429  *        tasks get discarded (transparent huge pages is the specific concern)
2430  * Hopefully this is pretty sparse
2431  *
2432  * @ms: current migration state
2433  */
2434 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2435 {
2436     RAMState *rs = ram_state;
2437     RAMBlock *block;
2438     int ret;
2439
2440     RCU_READ_LOCK_GUARD();
2441
2442     /* This should be our last sync, the src is now paused */
2443     migration_bitmap_sync(rs);
2444
2445     /* Easiest way to make sure we don't resume in the middle of a host-page */
2446     rs->last_seen_block = NULL;
2447     rs->last_sent_block = NULL;
2448     rs->last_page = 0;
2449
2450     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2451         /* Deal with TPS != HPS and huge pages */
2452         ret = postcopy_chunk_hostpages(ms, block);
2453         if (ret) {
2454             return ret;
2455         }
2456
2457 #ifdef DEBUG_POSTCOPY
2458         ram_debug_dump_bitmap(block->bmap, true,
2459                               block->used_length >> TARGET_PAGE_BITS);
2460 #endif
2461     }
2462     trace_ram_postcopy_send_discard_bitmap();
2463
2464     return postcopy_each_ram_send_discard(ms);
2465 }
2466
2467 /**
2468  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2469  *
2470  * Returns zero on success
2471  *
2472  * @rbname: name of the RAMBlock of the request. NULL means the
2473  *          same that last one.
2474  * @start: RAMBlock starting page
2475  * @length: RAMBlock size
2476  */
2477 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2478 {
2479     trace_ram_discard_range(rbname, start, length);
2480
2481     RCU_READ_LOCK_GUARD();
2482     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2483
2484     if (!rb) {
2485         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2486         return -1;
2487     }
2488
2489     /*
2490      * On source VM, we don't need to update the received bitmap since
2491      * we don't even have one.
2492      */
2493     if (rb->receivedmap) {
2494         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2495                      length >> qemu_target_page_bits());
2496     }
2497
2498     return ram_block_discard_range(rb, start, length);
2499 }
2500
2501 /*
2502  * For every allocation, we will try not to crash the VM if the
2503  * allocation failed.
2504  */
2505 static int xbzrle_init(void)
2506 {
2507     Error *local_err = NULL;
2508
2509     if (!migrate_use_xbzrle()) {
2510         return 0;
2511     }
2512
2513     XBZRLE_cache_lock();
2514
2515     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2516     if (!XBZRLE.zero_target_page) {
2517         error_report("%s: Error allocating zero page", __func__);
2518         goto err_out;
2519     }
2520
2521     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2522                               TARGET_PAGE_SIZE, &local_err);
2523     if (!XBZRLE.cache) {
2524         error_report_err(local_err);
2525         goto free_zero_page;
2526     }
2527
2528     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2529     if (!XBZRLE.encoded_buf) {
2530         error_report("%s: Error allocating encoded_buf", __func__);
2531         goto free_cache;
2532     }
2533
2534     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2535     if (!XBZRLE.current_buf) {
2536         error_report("%s: Error allocating current_buf", __func__);
2537         goto free_encoded_buf;
2538     }
2539
2540     /* We are all good */
2541     XBZRLE_cache_unlock();
2542     return 0;
2543
2544 free_encoded_buf:
2545     g_free(XBZRLE.encoded_buf);
2546     XBZRLE.encoded_buf = NULL;
2547 free_cache:
2548     cache_fini(XBZRLE.cache);
2549     XBZRLE.cache = NULL;
2550 free_zero_page:
2551     g_free(XBZRLE.zero_target_page);
2552     XBZRLE.zero_target_page = NULL;
2553 err_out:
2554     XBZRLE_cache_unlock();
2555     return -ENOMEM;
2556 }
2557
2558 static int ram_state_init(RAMState **rsp)
2559 {
2560     *rsp = g_try_new0(RAMState, 1);
2561
2562     if (!*rsp) {
2563         error_report("%s: Init ramstate fail", __func__);
2564         return -1;
2565     }
2566
2567     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2568     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2569     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2570
2571     /*
2572      * Count the total number of pages used by ram blocks not including any
2573      * gaps due to alignment or unplugs.
2574      * This must match with the initial values of dirty bitmap.
2575      */
2576     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2577     ram_state_reset(*rsp);
2578
2579     return 0;
2580 }
2581
2582 static void ram_list_init_bitmaps(void)
2583 {
2584     MigrationState *ms = migrate_get_current();
2585     RAMBlock *block;
2586     unsigned long pages;
2587     uint8_t shift;
2588
2589     /* Skip setting bitmap if there is no RAM */
2590     if (ram_bytes_total()) {
2591         shift = ms->clear_bitmap_shift;
2592         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2593             error_report("clear_bitmap_shift (%u) too big, using "
2594                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2595             shift = CLEAR_BITMAP_SHIFT_MAX;
2596         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2597             error_report("clear_bitmap_shift (%u) too small, using "
2598                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2599             shift = CLEAR_BITMAP_SHIFT_MIN;
2600         }
2601
2602         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2603             pages = block->max_length >> TARGET_PAGE_BITS;
2604             /*
2605              * The initial dirty bitmap for migration must be set with all
2606              * ones to make sure we'll migrate every guest RAM page to
2607              * destination.
2608              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2609              * new migration after a failed migration, ram_list.
2610              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2611              * guest memory.
2612              */
2613             block->bmap = bitmap_new(pages);
2614             bitmap_set(block->bmap, 0, pages);
2615             block->clear_bmap_shift = shift;
2616             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2617         }
2618     }
2619 }
2620
2621 static void ram_init_bitmaps(RAMState *rs)
2622 {
2623     /* For memory_global_dirty_log_start below.  */
2624     qemu_mutex_lock_iothread();
2625     qemu_mutex_lock_ramlist();
2626
2627     WITH_RCU_READ_LOCK_GUARD() {
2628         ram_list_init_bitmaps();
2629         /* We don't use dirty log with background snapshots */
2630         if (!migrate_background_snapshot()) {
2631             memory_global_dirty_log_start();
2632             migration_bitmap_sync_precopy(rs);
2633         }
2634     }
2635     qemu_mutex_unlock_ramlist();
2636     qemu_mutex_unlock_iothread();
2637 }
2638
2639 static int ram_init_all(RAMState **rsp)
2640 {
2641     if (ram_state_init(rsp)) {
2642         return -1;
2643     }
2644
2645     if (xbzrle_init()) {
2646         ram_state_cleanup(rsp);
2647         return -1;
2648     }
2649
2650     ram_init_bitmaps(*rsp);
2651
2652     return 0;
2653 }
2654
2655 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2656 {
2657     RAMBlock *block;
2658     uint64_t pages = 0;
2659
2660     /*
2661      * Postcopy is not using xbzrle/compression, so no need for that.
2662      * Also, since source are already halted, we don't need to care
2663      * about dirty page logging as well.
2664      */
2665
2666     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2667         pages += bitmap_count_one(block->bmap,
2668                                   block->used_length >> TARGET_PAGE_BITS);
2669     }
2670
2671     /* This may not be aligned with current bitmaps. Recalculate. */
2672     rs->migration_dirty_pages = pages;
2673
2674     rs->last_seen_block = NULL;
2675     rs->last_sent_block = NULL;
2676     rs->last_page = 0;
2677     rs->last_version = ram_list.version;
2678     /*
2679      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2680      * matter what we have sent.
2681      */
2682     rs->ram_bulk_stage = false;
2683
2684     /* Update RAMState cache of output QEMUFile */
2685     rs->f = out;
2686
2687     trace_ram_state_resume_prepare(pages);
2688 }
2689
2690 /*
2691  * This function clears bits of the free pages reported by the caller from the
2692  * migration dirty bitmap. @addr is the host address corresponding to the
2693  * start of the continuous guest free pages, and @len is the total bytes of
2694  * those pages.
2695  */
2696 void qemu_guest_free_page_hint(void *addr, size_t len)
2697 {
2698     RAMBlock *block;
2699     ram_addr_t offset;
2700     size_t used_len, start, npages;
2701     MigrationState *s = migrate_get_current();
2702
2703     /* This function is currently expected to be used during live migration */
2704     if (!migration_is_setup_or_active(s->state)) {
2705         return;
2706     }
2707
2708     for (; len > 0; len -= used_len, addr += used_len) {
2709         block = qemu_ram_block_from_host(addr, false, &offset);
2710         if (unlikely(!block || offset >= block->used_length)) {
2711             /*
2712              * The implementation might not support RAMBlock resize during
2713              * live migration, but it could happen in theory with future
2714              * updates. So we add a check here to capture that case.
2715              */
2716             error_report_once("%s unexpected error", __func__);
2717             return;
2718         }
2719
2720         if (len <= block->used_length - offset) {
2721             used_len = len;
2722         } else {
2723             used_len = block->used_length - offset;
2724         }
2725
2726         start = offset >> TARGET_PAGE_BITS;
2727         npages = used_len >> TARGET_PAGE_BITS;
2728
2729         qemu_mutex_lock(&ram_state->bitmap_mutex);
2730         ram_state->migration_dirty_pages -=
2731                       bitmap_count_one_with_offset(block->bmap, start, npages);
2732         bitmap_clear(block->bmap, start, npages);
2733         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2734     }
2735 }
2736
2737 /*
2738  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2739  * long-running RCU critical section.  When rcu-reclaims in the code
2740  * start to become numerous it will be necessary to reduce the
2741  * granularity of these critical sections.
2742  */
2743
2744 /**
2745  * ram_save_setup: Setup RAM for migration
2746  *
2747  * Returns zero to indicate success and negative for error
2748  *
2749  * @f: QEMUFile where to send the data
2750  * @opaque: RAMState pointer
2751  */
2752 static int ram_save_setup(QEMUFile *f, void *opaque)
2753 {
2754     RAMState **rsp = opaque;
2755     RAMBlock *block;
2756
2757     if (compress_threads_save_setup()) {
2758         return -1;
2759     }
2760
2761     /* migration has already setup the bitmap, reuse it. */
2762     if (!migration_in_colo_state()) {
2763         if (ram_init_all(rsp) != 0) {
2764             compress_threads_save_cleanup();
2765             return -1;
2766         }
2767     }
2768     (*rsp)->f = f;
2769
2770     WITH_RCU_READ_LOCK_GUARD() {
2771         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2772
2773         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2774             qemu_put_byte(f, strlen(block->idstr));
2775             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2776             qemu_put_be64(f, block->used_length);
2777             if (migrate_postcopy_ram() && block->page_size !=
2778                                           qemu_host_page_size) {
2779                 qemu_put_be64(f, block->page_size);
2780             }
2781             if (migrate_ignore_shared()) {
2782                 qemu_put_be64(f, block->mr->addr);
2783             }
2784         }
2785     }
2786
2787     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2788     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2789
2790     multifd_send_sync_main(f);
2791     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2792     qemu_fflush(f);
2793
2794     return 0;
2795 }
2796
2797 /**
2798  * ram_save_iterate: iterative stage for migration
2799  *
2800  * Returns zero to indicate success and negative for error
2801  *
2802  * @f: QEMUFile where to send the data
2803  * @opaque: RAMState pointer
2804  */
2805 static int ram_save_iterate(QEMUFile *f, void *opaque)
2806 {
2807     RAMState **temp = opaque;
2808     RAMState *rs = *temp;
2809     int ret = 0;
2810     int i;
2811     int64_t t0;
2812     int done = 0;
2813
2814     if (blk_mig_bulk_active()) {
2815         /* Avoid transferring ram during bulk phase of block migration as
2816          * the bulk phase will usually take a long time and transferring
2817          * ram updates during that time is pointless. */
2818         goto out;
2819     }
2820
2821     WITH_RCU_READ_LOCK_GUARD() {
2822         if (ram_list.version != rs->last_version) {
2823             ram_state_reset(rs);
2824         }
2825
2826         /* Read version before ram_list.blocks */
2827         smp_rmb();
2828
2829         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2830
2831         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2832         i = 0;
2833         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2834                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2835             int pages;
2836
2837             if (qemu_file_get_error(f)) {
2838                 break;
2839             }
2840
2841             pages = ram_find_and_save_block(rs, false);
2842             /* no more pages to sent */
2843             if (pages == 0) {
2844                 done = 1;
2845                 break;
2846             }
2847
2848             if (pages < 0) {
2849                 qemu_file_set_error(f, pages);
2850                 break;
2851             }
2852
2853             rs->target_page_count += pages;
2854
2855             /*
2856              * During postcopy, it is necessary to make sure one whole host
2857              * page is sent in one chunk.
2858              */
2859             if (migrate_postcopy_ram()) {
2860                 flush_compressed_data(rs);
2861             }
2862
2863             /*
2864              * we want to check in the 1st loop, just in case it was the 1st
2865              * time and we had to sync the dirty bitmap.
2866              * qemu_clock_get_ns() is a bit expensive, so we only check each
2867              * some iterations
2868              */
2869             if ((i & 63) == 0) {
2870                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2871                               1000000;
2872                 if (t1 > MAX_WAIT) {
2873                     trace_ram_save_iterate_big_wait(t1, i);
2874                     break;
2875                 }
2876             }
2877             i++;
2878         }
2879     }
2880
2881     /*
2882      * Must occur before EOS (or any QEMUFile operation)
2883      * because of RDMA protocol.
2884      */
2885     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2886
2887 out:
2888     if (ret >= 0
2889         && migration_is_setup_or_active(migrate_get_current()->state)) {
2890         multifd_send_sync_main(rs->f);
2891         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2892         qemu_fflush(f);
2893         ram_counters.transferred += 8;
2894
2895         ret = qemu_file_get_error(f);
2896     }
2897     if (ret < 0) {
2898         return ret;
2899     }
2900
2901     return done;
2902 }
2903
2904 /**
2905  * ram_save_complete: function called to send the remaining amount of ram
2906  *
2907  * Returns zero to indicate success or negative on error
2908  *
2909  * Called with iothread lock
2910  *
2911  * @f: QEMUFile where to send the data
2912  * @opaque: RAMState pointer
2913  */
2914 static int ram_save_complete(QEMUFile *f, void *opaque)
2915 {
2916     RAMState **temp = opaque;
2917     RAMState *rs = *temp;
2918     int ret = 0;
2919
2920     WITH_RCU_READ_LOCK_GUARD() {
2921         if (!migration_in_postcopy()) {
2922             migration_bitmap_sync_precopy(rs);
2923         }
2924
2925         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2926
2927         /* try transferring iterative blocks of memory */
2928
2929         /* flush all remaining blocks regardless of rate limiting */
2930         while (true) {
2931             int pages;
2932
2933             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2934             /* no more blocks to sent */
2935             if (pages == 0) {
2936                 break;
2937             }
2938             if (pages < 0) {
2939                 ret = pages;
2940                 break;
2941             }
2942         }
2943
2944         flush_compressed_data(rs);
2945         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2946     }
2947
2948     if (ret >= 0) {
2949         multifd_send_sync_main(rs->f);
2950         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2951         qemu_fflush(f);
2952     }
2953
2954     return ret;
2955 }
2956
2957 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2958                              uint64_t *res_precopy_only,
2959                              uint64_t *res_compatible,
2960                              uint64_t *res_postcopy_only)
2961 {
2962     RAMState **temp = opaque;
2963     RAMState *rs = *temp;
2964     uint64_t remaining_size;
2965
2966     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2967
2968     if (!migration_in_postcopy() &&
2969         remaining_size < max_size) {
2970         qemu_mutex_lock_iothread();
2971         WITH_RCU_READ_LOCK_GUARD() {
2972             migration_bitmap_sync_precopy(rs);
2973         }
2974         qemu_mutex_unlock_iothread();
2975         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2976     }
2977
2978     if (migrate_postcopy_ram()) {
2979         /* We can do postcopy, and all the data is postcopiable */
2980         *res_compatible += remaining_size;
2981     } else {
2982         *res_precopy_only += remaining_size;
2983     }
2984 }
2985
2986 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2987 {
2988     unsigned int xh_len;
2989     int xh_flags;
2990     uint8_t *loaded_data;
2991
2992     /* extract RLE header */
2993     xh_flags = qemu_get_byte(f);
2994     xh_len = qemu_get_be16(f);
2995
2996     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2997         error_report("Failed to load XBZRLE page - wrong compression!");
2998         return -1;
2999     }
3000
3001     if (xh_len > TARGET_PAGE_SIZE) {
3002         error_report("Failed to load XBZRLE page - len overflow!");
3003         return -1;
3004     }
3005     loaded_data = XBZRLE.decoded_buf;
3006     /* load data and decode */
3007     /* it can change loaded_data to point to an internal buffer */
3008     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3009
3010     /* decode RLE */
3011     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3012                              TARGET_PAGE_SIZE) == -1) {
3013         error_report("Failed to load XBZRLE page - decode error!");
3014         return -1;
3015     }
3016
3017     return 0;
3018 }
3019
3020 /**
3021  * ram_block_from_stream: read a RAMBlock id from the migration stream
3022  *
3023  * Must be called from within a rcu critical section.
3024  *
3025  * Returns a pointer from within the RCU-protected ram_list.
3026  *
3027  * @f: QEMUFile where to read the data from
3028  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3029  */
3030 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3031 {
3032     static RAMBlock *block;
3033     char id[256];
3034     uint8_t len;
3035
3036     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3037         if (!block) {
3038             error_report("Ack, bad migration stream!");
3039             return NULL;
3040         }
3041         return block;
3042     }
3043
3044     len = qemu_get_byte(f);
3045     qemu_get_buffer(f, (uint8_t *)id, len);
3046     id[len] = 0;
3047
3048     block = qemu_ram_block_by_name(id);
3049     if (!block) {
3050         error_report("Can't find block %s", id);
3051         return NULL;
3052     }
3053
3054     if (ramblock_is_ignored(block)) {
3055         error_report("block %s should not be migrated !", id);
3056         return NULL;
3057     }
3058
3059     return block;
3060 }
3061
3062 static inline void *host_from_ram_block_offset(RAMBlock *block,
3063                                                ram_addr_t offset)
3064 {
3065     if (!offset_in_ramblock(block, offset)) {
3066         return NULL;
3067     }
3068
3069     return block->host + offset;
3070 }
3071
3072 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3073                              ram_addr_t offset, bool record_bitmap)
3074 {
3075     if (!offset_in_ramblock(block, offset)) {
3076         return NULL;
3077     }
3078     if (!block->colo_cache) {
3079         error_report("%s: colo_cache is NULL in block :%s",
3080                      __func__, block->idstr);
3081         return NULL;
3082     }
3083
3084     /*
3085     * During colo checkpoint, we need bitmap of these migrated pages.
3086     * It help us to decide which pages in ram cache should be flushed
3087     * into VM's RAM later.
3088     */
3089     if (record_bitmap &&
3090         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3091         ram_state->migration_dirty_pages++;
3092     }
3093     return block->colo_cache + offset;
3094 }
3095
3096 /**
3097  * ram_handle_compressed: handle the zero page case
3098  *
3099  * If a page (or a whole RDMA chunk) has been
3100  * determined to be zero, then zap it.
3101  *
3102  * @host: host address for the zero page
3103  * @ch: what the page is filled from.  We only support zero
3104  * @size: size of the zero page
3105  */
3106 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3107 {
3108     if (ch != 0 || !is_zero_range(host, size)) {
3109         memset(host, ch, size);
3110     }
3111 }
3112
3113 /* return the size after decompression, or negative value on error */
3114 static int
3115 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3116                      const uint8_t *source, size_t source_len)
3117 {
3118     int err;
3119
3120     err = inflateReset(stream);
3121     if (err != Z_OK) {
3122         return -1;
3123     }
3124
3125     stream->avail_in = source_len;
3126     stream->next_in = (uint8_t *)source;
3127     stream->avail_out = dest_len;
3128     stream->next_out = dest;
3129
3130     err = inflate(stream, Z_NO_FLUSH);
3131     if (err != Z_STREAM_END) {
3132         return -1;
3133     }
3134
3135     return stream->total_out;
3136 }
3137
3138 static void *do_data_decompress(void *opaque)
3139 {
3140     DecompressParam *param = opaque;
3141     unsigned long pagesize;
3142     uint8_t *des;
3143     int len, ret;
3144
3145     qemu_mutex_lock(&param->mutex);
3146     while (!param->quit) {
3147         if (param->des) {
3148             des = param->des;
3149             len = param->len;
3150             param->des = 0;
3151             qemu_mutex_unlock(&param->mutex);
3152
3153             pagesize = TARGET_PAGE_SIZE;
3154
3155             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3156                                        param->compbuf, len);
3157             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3158                 error_report("decompress data failed");
3159                 qemu_file_set_error(decomp_file, ret);
3160             }
3161
3162             qemu_mutex_lock(&decomp_done_lock);
3163             param->done = true;
3164             qemu_cond_signal(&decomp_done_cond);
3165             qemu_mutex_unlock(&decomp_done_lock);
3166
3167             qemu_mutex_lock(&param->mutex);
3168         } else {
3169             qemu_cond_wait(&param->cond, &param->mutex);
3170         }
3171     }
3172     qemu_mutex_unlock(&param->mutex);
3173
3174     return NULL;
3175 }
3176
3177 static int wait_for_decompress_done(void)
3178 {
3179     int idx, thread_count;
3180
3181     if (!migrate_use_compression()) {
3182         return 0;
3183     }
3184
3185     thread_count = migrate_decompress_threads();
3186     qemu_mutex_lock(&decomp_done_lock);
3187     for (idx = 0; idx < thread_count; idx++) {
3188         while (!decomp_param[idx].done) {
3189             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3190         }
3191     }
3192     qemu_mutex_unlock(&decomp_done_lock);
3193     return qemu_file_get_error(decomp_file);
3194 }
3195
3196 static void compress_threads_load_cleanup(void)
3197 {
3198     int i, thread_count;
3199
3200     if (!migrate_use_compression()) {
3201         return;
3202     }
3203     thread_count = migrate_decompress_threads();
3204     for (i = 0; i < thread_count; i++) {
3205         /*
3206          * we use it as a indicator which shows if the thread is
3207          * properly init'd or not
3208          */
3209         if (!decomp_param[i].compbuf) {
3210             break;
3211         }
3212
3213         qemu_mutex_lock(&decomp_param[i].mutex);
3214         decomp_param[i].quit = true;
3215         qemu_cond_signal(&decomp_param[i].cond);
3216         qemu_mutex_unlock(&decomp_param[i].mutex);
3217     }
3218     for (i = 0; i < thread_count; i++) {
3219         if (!decomp_param[i].compbuf) {
3220             break;
3221         }
3222
3223         qemu_thread_join(decompress_threads + i);
3224         qemu_mutex_destroy(&decomp_param[i].mutex);
3225         qemu_cond_destroy(&decomp_param[i].cond);
3226         inflateEnd(&decomp_param[i].stream);
3227         g_free(decomp_param[i].compbuf);
3228         decomp_param[i].compbuf = NULL;
3229     }
3230     g_free(decompress_threads);
3231     g_free(decomp_param);
3232     decompress_threads = NULL;
3233     decomp_param = NULL;
3234     decomp_file = NULL;
3235 }
3236
3237 static int compress_threads_load_setup(QEMUFile *f)
3238 {
3239     int i, thread_count;
3240
3241     if (!migrate_use_compression()) {
3242         return 0;
3243     }
3244
3245     thread_count = migrate_decompress_threads();
3246     decompress_threads = g_new0(QemuThread, thread_count);
3247     decomp_param = g_new0(DecompressParam, thread_count);
3248     qemu_mutex_init(&decomp_done_lock);
3249     qemu_cond_init(&decomp_done_cond);
3250     decomp_file = f;
3251     for (i = 0; i < thread_count; i++) {
3252         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3253             goto exit;
3254         }
3255
3256         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3257         qemu_mutex_init(&decomp_param[i].mutex);
3258         qemu_cond_init(&decomp_param[i].cond);
3259         decomp_param[i].done = true;
3260         decomp_param[i].quit = false;
3261         qemu_thread_create(decompress_threads + i, "decompress",
3262                            do_data_decompress, decomp_param + i,
3263                            QEMU_THREAD_JOINABLE);
3264     }
3265     return 0;
3266 exit:
3267     compress_threads_load_cleanup();
3268     return -1;
3269 }
3270
3271 static void decompress_data_with_multi_threads(QEMUFile *f,
3272                                                void *host, int len)
3273 {
3274     int idx, thread_count;
3275
3276     thread_count = migrate_decompress_threads();
3277     QEMU_LOCK_GUARD(&decomp_done_lock);
3278     while (true) {
3279         for (idx = 0; idx < thread_count; idx++) {
3280             if (decomp_param[idx].done) {
3281                 decomp_param[idx].done = false;
3282                 qemu_mutex_lock(&decomp_param[idx].mutex);
3283                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3284                 decomp_param[idx].des = host;
3285                 decomp_param[idx].len = len;
3286                 qemu_cond_signal(&decomp_param[idx].cond);
3287                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3288                 break;
3289             }
3290         }
3291         if (idx < thread_count) {
3292             break;
3293         } else {
3294             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3295         }
3296     }
3297 }
3298
3299  /*
3300   * we must set ram_bulk_stage to false, otherwise in
3301   * migation_bitmap_find_dirty the bitmap will be unused and
3302   * all the pages in ram cache wil be flushed to the ram of
3303   * secondary VM.
3304   */
3305 static void colo_init_ram_state(void)
3306 {
3307     ram_state_init(&ram_state);
3308     ram_state->ram_bulk_stage = false;
3309 }
3310
3311 /*
3312  * colo cache: this is for secondary VM, we cache the whole
3313  * memory of the secondary VM, it is need to hold the global lock
3314  * to call this helper.
3315  */
3316 int colo_init_ram_cache(void)
3317 {
3318     RAMBlock *block;
3319
3320     WITH_RCU_READ_LOCK_GUARD() {
3321         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3322             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3323                                                     NULL,
3324                                                     false);
3325             if (!block->colo_cache) {
3326                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3327                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3328                              block->used_length);
3329                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3330                     if (block->colo_cache) {
3331                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3332                         block->colo_cache = NULL;
3333                     }
3334                 }
3335                 return -errno;
3336             }
3337         }
3338     }
3339
3340     /*
3341     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3342     * with to decide which page in cache should be flushed into SVM's RAM. Here
3343     * we use the same name 'ram_bitmap' as for migration.
3344     */
3345     if (ram_bytes_total()) {
3346         RAMBlock *block;
3347
3348         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3349             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3350             block->bmap = bitmap_new(pages);
3351         }
3352     }
3353
3354     colo_init_ram_state();
3355     return 0;
3356 }
3357
3358 /* TODO: duplicated with ram_init_bitmaps */
3359 void colo_incoming_start_dirty_log(void)
3360 {
3361     RAMBlock *block = NULL;
3362     /* For memory_global_dirty_log_start below. */
3363     qemu_mutex_lock_iothread();
3364     qemu_mutex_lock_ramlist();
3365
3366     memory_global_dirty_log_sync();
3367     WITH_RCU_READ_LOCK_GUARD() {
3368         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3369             ramblock_sync_dirty_bitmap(ram_state, block);
3370             /* Discard this dirty bitmap record */
3371             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3372         }
3373         memory_global_dirty_log_start();
3374     }
3375     ram_state->migration_dirty_pages = 0;
3376     qemu_mutex_unlock_ramlist();
3377     qemu_mutex_unlock_iothread();
3378 }
3379
3380 /* It is need to hold the global lock to call this helper */
3381 void colo_release_ram_cache(void)
3382 {
3383     RAMBlock *block;
3384
3385     memory_global_dirty_log_stop();
3386     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3387         g_free(block->bmap);
3388         block->bmap = NULL;
3389     }
3390
3391     WITH_RCU_READ_LOCK_GUARD() {
3392         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3393             if (block->colo_cache) {
3394                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3395                 block->colo_cache = NULL;
3396             }
3397         }
3398     }
3399     ram_state_cleanup(&ram_state);
3400 }
3401
3402 /**
3403  * ram_load_setup: Setup RAM for migration incoming side
3404  *
3405  * Returns zero to indicate success and negative for error
3406  *
3407  * @f: QEMUFile where to receive the data
3408  * @opaque: RAMState pointer
3409  */
3410 static int ram_load_setup(QEMUFile *f, void *opaque)
3411 {
3412     if (compress_threads_load_setup(f)) {
3413         return -1;
3414     }
3415
3416     xbzrle_load_setup();
3417     ramblock_recv_map_init();
3418
3419     return 0;
3420 }
3421
3422 static int ram_load_cleanup(void *opaque)
3423 {
3424     RAMBlock *rb;
3425
3426     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3427         qemu_ram_block_writeback(rb);
3428     }
3429
3430     xbzrle_load_cleanup();
3431     compress_threads_load_cleanup();
3432
3433     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3434         g_free(rb->receivedmap);
3435         rb->receivedmap = NULL;
3436     }
3437
3438     return 0;
3439 }
3440
3441 /**
3442  * ram_postcopy_incoming_init: allocate postcopy data structures
3443  *
3444  * Returns 0 for success and negative if there was one error
3445  *
3446  * @mis: current migration incoming state
3447  *
3448  * Allocate data structures etc needed by incoming migration with
3449  * postcopy-ram. postcopy-ram's similarly names
3450  * postcopy_ram_incoming_init does the work.
3451  */
3452 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3453 {
3454     return postcopy_ram_incoming_init(mis);
3455 }
3456
3457 /**
3458  * ram_load_postcopy: load a page in postcopy case
3459  *
3460  * Returns 0 for success or -errno in case of error
3461  *
3462  * Called in postcopy mode by ram_load().
3463  * rcu_read_lock is taken prior to this being called.
3464  *
3465  * @f: QEMUFile where to send the data
3466  */
3467 static int ram_load_postcopy(QEMUFile *f)
3468 {
3469     int flags = 0, ret = 0;
3470     bool place_needed = false;
3471     bool matches_target_page_size = false;
3472     MigrationIncomingState *mis = migration_incoming_get_current();
3473     /* Temporary page that is later 'placed' */
3474     void *postcopy_host_page = mis->postcopy_tmp_page;
3475     void *this_host = NULL;
3476     bool all_zero = true;
3477     int target_pages = 0;
3478
3479     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3480         ram_addr_t addr;
3481         void *host = NULL;
3482         void *page_buffer = NULL;
3483         void *place_source = NULL;
3484         RAMBlock *block = NULL;
3485         uint8_t ch;
3486         int len;
3487
3488         addr = qemu_get_be64(f);
3489
3490         /*
3491          * If qemu file error, we should stop here, and then "addr"
3492          * may be invalid
3493          */
3494         ret = qemu_file_get_error(f);
3495         if (ret) {
3496             break;
3497         }
3498
3499         flags = addr & ~TARGET_PAGE_MASK;
3500         addr &= TARGET_PAGE_MASK;
3501
3502         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3503         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3504                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3505             block = ram_block_from_stream(f, flags);
3506
3507             host = host_from_ram_block_offset(block, addr);
3508             if (!host) {
3509                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3510                 ret = -EINVAL;
3511                 break;
3512             }
3513             target_pages++;
3514             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3515             /*
3516              * Postcopy requires that we place whole host pages atomically;
3517              * these may be huge pages for RAMBlocks that are backed by
3518              * hugetlbfs.
3519              * To make it atomic, the data is read into a temporary page
3520              * that's moved into place later.
3521              * The migration protocol uses,  possibly smaller, target-pages
3522              * however the source ensures it always sends all the components
3523              * of a host page in one chunk.
3524              */
3525             page_buffer = postcopy_host_page +
3526                           ((uintptr_t)host & (block->page_size - 1));
3527             if (target_pages == 1) {
3528                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3529                                                     block->page_size);
3530             } else {
3531                 /* not the 1st TP within the HP */
3532                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3533                     (uintptr_t)this_host) {
3534                     error_report("Non-same host page %p/%p",
3535                                   host, this_host);
3536                     ret = -EINVAL;
3537                     break;
3538                 }
3539             }
3540
3541             /*
3542              * If it's the last part of a host page then we place the host
3543              * page
3544              */
3545             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3546                 place_needed = true;
3547             }
3548             place_source = postcopy_host_page;
3549         }
3550
3551         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3552         case RAM_SAVE_FLAG_ZERO:
3553             ch = qemu_get_byte(f);
3554             /*
3555              * Can skip to set page_buffer when
3556              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3557              */
3558             if (ch || !matches_target_page_size) {
3559                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3560             }
3561             if (ch) {
3562                 all_zero = false;
3563             }
3564             break;
3565
3566         case RAM_SAVE_FLAG_PAGE:
3567             all_zero = false;
3568             if (!matches_target_page_size) {
3569                 /* For huge pages, we always use temporary buffer */
3570                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3571             } else {
3572                 /*
3573                  * For small pages that matches target page size, we
3574                  * avoid the qemu_file copy.  Instead we directly use
3575                  * the buffer of QEMUFile to place the page.  Note: we
3576                  * cannot do any QEMUFile operation before using that
3577                  * buffer to make sure the buffer is valid when
3578                  * placing the page.
3579                  */
3580                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3581                                          TARGET_PAGE_SIZE);
3582             }
3583             break;
3584         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3585             all_zero = false;
3586             len = qemu_get_be32(f);
3587             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3588                 error_report("Invalid compressed data length: %d", len);
3589                 ret = -EINVAL;
3590                 break;
3591             }
3592             decompress_data_with_multi_threads(f, page_buffer, len);
3593             break;
3594
3595         case RAM_SAVE_FLAG_EOS:
3596             /* normal exit */
3597             multifd_recv_sync_main();
3598             break;
3599         default:
3600             error_report("Unknown combination of migration flags: 0x%x"
3601                          " (postcopy mode)", flags);
3602             ret = -EINVAL;
3603             break;
3604         }
3605
3606         /* Got the whole host page, wait for decompress before placing. */
3607         if (place_needed) {
3608             ret |= wait_for_decompress_done();
3609         }
3610
3611         /* Detect for any possible file errors */
3612         if (!ret && qemu_file_get_error(f)) {
3613             ret = qemu_file_get_error(f);
3614         }
3615
3616         if (!ret && place_needed) {
3617             /* This gets called at the last target page in the host page */
3618             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3619                                                        block->page_size);
3620
3621             if (all_zero) {
3622                 ret = postcopy_place_page_zero(mis, place_dest,
3623                                                block);
3624             } else {
3625                 ret = postcopy_place_page(mis, place_dest,
3626                                           place_source, block);
3627             }
3628             place_needed = false;
3629             target_pages = 0;
3630             /* Assume we have a zero page until we detect something different */
3631             all_zero = true;
3632         }
3633     }
3634
3635     return ret;
3636 }
3637
3638 static bool postcopy_is_advised(void)
3639 {
3640     PostcopyState ps = postcopy_state_get();
3641     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3642 }
3643
3644 static bool postcopy_is_running(void)
3645 {
3646     PostcopyState ps = postcopy_state_get();
3647     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3648 }
3649
3650 /*
3651  * Flush content of RAM cache into SVM's memory.
3652  * Only flush the pages that be dirtied by PVM or SVM or both.
3653  */
3654 void colo_flush_ram_cache(void)
3655 {
3656     RAMBlock *block = NULL;
3657     void *dst_host;
3658     void *src_host;
3659     unsigned long offset = 0;
3660
3661     memory_global_dirty_log_sync();
3662     WITH_RCU_READ_LOCK_GUARD() {
3663         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3664             ramblock_sync_dirty_bitmap(ram_state, block);
3665         }
3666     }
3667
3668     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3669     WITH_RCU_READ_LOCK_GUARD() {
3670         block = QLIST_FIRST_RCU(&ram_list.blocks);
3671
3672         while (block) {
3673             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3674
3675             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3676                 >= block->used_length) {
3677                 offset = 0;
3678                 block = QLIST_NEXT_RCU(block, next);
3679             } else {
3680                 migration_bitmap_clear_dirty(ram_state, block, offset);
3681                 dst_host = block->host
3682                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3683                 src_host = block->colo_cache
3684                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3685                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3686             }
3687         }
3688     }
3689     trace_colo_flush_ram_cache_end();
3690 }
3691
3692 /**
3693  * ram_load_precopy: load pages in precopy case
3694  *
3695  * Returns 0 for success or -errno in case of error
3696  *
3697  * Called in precopy mode by ram_load().
3698  * rcu_read_lock is taken prior to this being called.
3699  *
3700  * @f: QEMUFile where to send the data
3701  */
3702 static int ram_load_precopy(QEMUFile *f)
3703 {
3704     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3705     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3706     bool postcopy_advised = postcopy_is_advised();
3707     if (!migrate_use_compression()) {
3708         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3709     }
3710
3711     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3712         ram_addr_t addr, total_ram_bytes;
3713         void *host = NULL, *host_bak = NULL;
3714         uint8_t ch;
3715
3716         /*
3717          * Yield periodically to let main loop run, but an iteration of
3718          * the main loop is expensive, so do it each some iterations
3719          */
3720         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3721             aio_co_schedule(qemu_get_current_aio_context(),
3722                             qemu_coroutine_self());
3723             qemu_coroutine_yield();
3724         }
3725         i++;
3726
3727         addr = qemu_get_be64(f);
3728         flags = addr & ~TARGET_PAGE_MASK;
3729         addr &= TARGET_PAGE_MASK;
3730
3731         if (flags & invalid_flags) {
3732             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3733                 error_report("Received an unexpected compressed page");
3734             }
3735
3736             ret = -EINVAL;
3737             break;
3738         }
3739
3740         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3741                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3742             RAMBlock *block = ram_block_from_stream(f, flags);
3743
3744             host = host_from_ram_block_offset(block, addr);
3745             /*
3746              * After going into COLO stage, we should not load the page
3747              * into SVM's memory directly, we put them into colo_cache firstly.
3748              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3749              * Previously, we copied all these memory in preparing stage of COLO
3750              * while we need to stop VM, which is a time-consuming process.
3751              * Here we optimize it by a trick, back-up every page while in
3752              * migration process while COLO is enabled, though it affects the
3753              * speed of the migration, but it obviously reduce the downtime of
3754              * back-up all SVM'S memory in COLO preparing stage.
3755              */
3756             if (migration_incoming_colo_enabled()) {
3757                 if (migration_incoming_in_colo_state()) {
3758                     /* In COLO stage, put all pages into cache temporarily */
3759                     host = colo_cache_from_block_offset(block, addr, true);
3760                 } else {
3761                    /*
3762                     * In migration stage but before COLO stage,
3763                     * Put all pages into both cache and SVM's memory.
3764                     */
3765                     host_bak = colo_cache_from_block_offset(block, addr, false);
3766                 }
3767             }
3768             if (!host) {
3769                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3770                 ret = -EINVAL;
3771                 break;
3772             }
3773             if (!migration_incoming_in_colo_state()) {
3774                 ramblock_recv_bitmap_set(block, host);
3775             }
3776
3777             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3778         }
3779
3780         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3781         case RAM_SAVE_FLAG_MEM_SIZE:
3782             /* Synchronize RAM block list */
3783             total_ram_bytes = addr;
3784             while (!ret && total_ram_bytes) {
3785                 RAMBlock *block;
3786                 char id[256];
3787                 ram_addr_t length;
3788
3789                 len = qemu_get_byte(f);
3790                 qemu_get_buffer(f, (uint8_t *)id, len);
3791                 id[len] = 0;
3792                 length = qemu_get_be64(f);
3793
3794                 block = qemu_ram_block_by_name(id);
3795                 if (block && !qemu_ram_is_migratable(block)) {
3796                     error_report("block %s should not be migrated !", id);
3797                     ret = -EINVAL;
3798                 } else if (block) {
3799                     if (length != block->used_length) {
3800                         Error *local_err = NULL;
3801
3802                         ret = qemu_ram_resize(block, length,
3803                                               &local_err);
3804                         if (local_err) {
3805                             error_report_err(local_err);
3806                         }
3807                     }
3808                     /* For postcopy we need to check hugepage sizes match */
3809                     if (postcopy_advised && migrate_postcopy_ram() &&
3810                         block->page_size != qemu_host_page_size) {
3811                         uint64_t remote_page_size = qemu_get_be64(f);
3812                         if (remote_page_size != block->page_size) {
3813                             error_report("Mismatched RAM page size %s "
3814                                          "(local) %zd != %" PRId64,
3815                                          id, block->page_size,
3816                                          remote_page_size);
3817                             ret = -EINVAL;
3818                         }
3819                     }
3820                     if (migrate_ignore_shared()) {
3821                         hwaddr addr = qemu_get_be64(f);
3822                         if (ramblock_is_ignored(block) &&
3823                             block->mr->addr != addr) {
3824                             error_report("Mismatched GPAs for block %s "
3825                                          "%" PRId64 "!= %" PRId64,
3826                                          id, (uint64_t)addr,
3827                                          (uint64_t)block->mr->addr);
3828                             ret = -EINVAL;
3829                         }
3830                     }
3831                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3832                                           block->idstr);
3833                 } else {
3834                     error_report("Unknown ramblock \"%s\", cannot "
3835                                  "accept migration", id);
3836                     ret = -EINVAL;
3837                 }
3838
3839                 total_ram_bytes -= length;
3840             }
3841             break;
3842
3843         case RAM_SAVE_FLAG_ZERO:
3844             ch = qemu_get_byte(f);
3845             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3846             break;
3847
3848         case RAM_SAVE_FLAG_PAGE:
3849             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3850             break;
3851
3852         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3853             len = qemu_get_be32(f);
3854             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3855                 error_report("Invalid compressed data length: %d", len);
3856                 ret = -EINVAL;
3857                 break;
3858             }
3859             decompress_data_with_multi_threads(f, host, len);
3860             break;
3861
3862         case RAM_SAVE_FLAG_XBZRLE:
3863             if (load_xbzrle(f, addr, host) < 0) {
3864                 error_report("Failed to decompress XBZRLE page at "
3865                              RAM_ADDR_FMT, addr);
3866                 ret = -EINVAL;
3867                 break;
3868             }
3869             break;
3870         case RAM_SAVE_FLAG_EOS:
3871             /* normal exit */
3872             multifd_recv_sync_main();
3873             break;
3874         default:
3875             if (flags & RAM_SAVE_FLAG_HOOK) {
3876                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3877             } else {
3878                 error_report("Unknown combination of migration flags: 0x%x",
3879                              flags);
3880                 ret = -EINVAL;
3881             }
3882         }
3883         if (!ret) {
3884             ret = qemu_file_get_error(f);
3885         }
3886         if (!ret && host_bak) {
3887             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3888         }
3889     }
3890
3891     ret |= wait_for_decompress_done();
3892     return ret;
3893 }
3894
3895 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3896 {
3897     int ret = 0;
3898     static uint64_t seq_iter;
3899     /*
3900      * If system is running in postcopy mode, page inserts to host memory must
3901      * be atomic
3902      */
3903     bool postcopy_running = postcopy_is_running();
3904
3905     seq_iter++;
3906
3907     if (version_id != 4) {
3908         return -EINVAL;
3909     }
3910
3911     /*
3912      * This RCU critical section can be very long running.
3913      * When RCU reclaims in the code start to become numerous,
3914      * it will be necessary to reduce the granularity of this
3915      * critical section.
3916      */
3917     WITH_RCU_READ_LOCK_GUARD() {
3918         if (postcopy_running) {
3919             ret = ram_load_postcopy(f);
3920         } else {
3921             ret = ram_load_precopy(f);
3922         }
3923     }
3924     trace_ram_load_complete(ret, seq_iter);
3925
3926     return ret;
3927 }
3928
3929 static bool ram_has_postcopy(void *opaque)
3930 {
3931     RAMBlock *rb;
3932     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3933         if (ramblock_is_pmem(rb)) {
3934             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3935                          "is not supported now!", rb->idstr, rb->host);
3936             return false;
3937         }
3938     }
3939
3940     return migrate_postcopy_ram();
3941 }
3942
3943 /* Sync all the dirty bitmap with destination VM.  */
3944 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3945 {
3946     RAMBlock *block;
3947     QEMUFile *file = s->to_dst_file;
3948     int ramblock_count = 0;
3949
3950     trace_ram_dirty_bitmap_sync_start();
3951
3952     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3953         qemu_savevm_send_recv_bitmap(file, block->idstr);
3954         trace_ram_dirty_bitmap_request(block->idstr);
3955         ramblock_count++;
3956     }
3957
3958     trace_ram_dirty_bitmap_sync_wait();
3959
3960     /* Wait until all the ramblocks' dirty bitmap synced */
3961     while (ramblock_count--) {
3962         qemu_sem_wait(&s->rp_state.rp_sem);
3963     }
3964
3965     trace_ram_dirty_bitmap_sync_complete();
3966
3967     return 0;
3968 }
3969
3970 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3971 {
3972     qemu_sem_post(&s->rp_state.rp_sem);
3973 }
3974
3975 /*
3976  * Read the received bitmap, revert it as the initial dirty bitmap.
3977  * This is only used when the postcopy migration is paused but wants
3978  * to resume from a middle point.
3979  */
3980 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3981 {
3982     int ret = -EINVAL;
3983     QEMUFile *file = s->rp_state.from_dst_file;
3984     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3985     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3986     uint64_t size, end_mark;
3987
3988     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3989
3990     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3991         error_report("%s: incorrect state %s", __func__,
3992                      MigrationStatus_str(s->state));
3993         return -EINVAL;
3994     }
3995
3996     /*
3997      * Note: see comments in ramblock_recv_bitmap_send() on why we
3998      * need the endianness conversion, and the paddings.
3999      */
4000     local_size = ROUND_UP(local_size, 8);
4001
4002     /* Add paddings */
4003     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4004
4005     size = qemu_get_be64(file);
4006
4007     /* The size of the bitmap should match with our ramblock */
4008     if (size != local_size) {
4009         error_report("%s: ramblock '%s' bitmap size mismatch "
4010                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4011                      block->idstr, size, local_size);
4012         ret = -EINVAL;
4013         goto out;
4014     }
4015
4016     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4017     end_mark = qemu_get_be64(file);
4018
4019     ret = qemu_file_get_error(file);
4020     if (ret || size != local_size) {
4021         error_report("%s: read bitmap failed for ramblock '%s': %d"
4022                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4023                      __func__, block->idstr, ret, local_size, size);
4024         ret = -EIO;
4025         goto out;
4026     }
4027
4028     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4029         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4030                      __func__, block->idstr, end_mark);
4031         ret = -EINVAL;
4032         goto out;
4033     }
4034
4035     /*
4036      * Endianness conversion. We are during postcopy (though paused).
4037      * The dirty bitmap won't change. We can directly modify it.
4038      */
4039     bitmap_from_le(block->bmap, le_bitmap, nbits);
4040
4041     /*
4042      * What we received is "received bitmap". Revert it as the initial
4043      * dirty bitmap for this ramblock.
4044      */
4045     bitmap_complement(block->bmap, block->bmap, nbits);
4046
4047     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4048
4049     /*
4050      * We succeeded to sync bitmap for current ramblock. If this is
4051      * the last one to sync, we need to notify the main send thread.
4052      */
4053     ram_dirty_bitmap_reload_notify(s);
4054
4055     ret = 0;
4056 out:
4057     g_free(le_bitmap);
4058     return ret;
4059 }
4060
4061 static int ram_resume_prepare(MigrationState *s, void *opaque)
4062 {
4063     RAMState *rs = *(RAMState **)opaque;
4064     int ret;
4065
4066     ret = ram_dirty_bitmap_sync_all(s, rs);
4067     if (ret) {
4068         return ret;
4069     }
4070
4071     ram_state_resume_prepare(rs, s->to_dst_file);
4072
4073     return 0;
4074 }
4075
4076 static SaveVMHandlers savevm_ram_handlers = {
4077     .save_setup = ram_save_setup,
4078     .save_live_iterate = ram_save_iterate,
4079     .save_live_complete_postcopy = ram_save_complete,
4080     .save_live_complete_precopy = ram_save_complete,
4081     .has_postcopy = ram_has_postcopy,
4082     .save_live_pending = ram_save_pending,
4083     .load_state = ram_load,
4084     .save_cleanup = ram_save_cleanup,
4085     .load_setup = ram_load_setup,
4086     .load_cleanup = ram_load_cleanup,
4087     .resume_prepare = ram_resume_prepare,
4088 };
4089
4090 void ram_mig_init(void)
4091 {
4092     qemu_mutex_init(&XBZRLE.lock);
4093     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4094 }