migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59 #include "sysemu/runstate.h"
  60
  61 #if defined(__linux__)
  62 #include "qemu/userfaultfd.h"
  63 #endif /* defined(__linux__) */
  64
  65 /***********************************************************/
  66 /* ram save/restore */
  67
  68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  69  * worked for pages that where filled with the same char.  We switched
  70  * it to only search for the zero value.  And to avoid confusion with
  71  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  72  */
  73
  74 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  75 #define RAM_SAVE_FLAG_ZERO     0x02
  76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  77 #define RAM_SAVE_FLAG_PAGE     0x08
  78 #define RAM_SAVE_FLAG_EOS      0x10
  79 #define RAM_SAVE_FLAG_CONTINUE 0x20
  80 #define RAM_SAVE_FLAG_XBZRLE   0x40
  81 /* 0x80 is reserved in migration.h start with 0x100 next */
  82 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  83
  84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  85 {
  86     return buffer_is_zero(p, size);
  87 }
  88
  89 XBZRLECacheStats xbzrle_counters;
  90
  91 /* struct contains XBZRLE cache and a static page
  92    used by the compression */
  93 static struct {
  94     /* buffer used for XBZRLE encoding */
  95     uint8_t *encoded_buf;
  96     /* buffer for storing page content */
  97     uint8_t *current_buf;
  98     /* Cache for XBZRLE, Protected by lock. */
  99     PageCache *cache;
 100     QemuMutex lock;
 101     /* it will store a page full of zeros */
 102     uint8_t *zero_target_page;
 103     /* buffer used for XBZRLE decoding */
 104     uint8_t *decoded_buf;
 105 } XBZRLE;
 106
 107 static void XBZRLE_cache_lock(void)
 108 {
 109     if (migrate_use_xbzrle()) {
 110         qemu_mutex_lock(&XBZRLE.lock);
 111     }
 112 }
 113
 114 static void XBZRLE_cache_unlock(void)
 115 {
 116     if (migrate_use_xbzrle()) {
 117         qemu_mutex_unlock(&XBZRLE.lock);
 118     }
 119 }
 120
 121 /**
 122  * xbzrle_cache_resize: resize the xbzrle cache
 123  *
 124  * This function is called from qmp_migrate_set_cache_size in main
 125  * thread, possibly while a migration is in progress.  A running
 126  * migration may be using the cache and might finish during this call,
 127  * hence changes to the cache are protected by XBZRLE.lock().
 128  *
 129  * Returns 0 for success or -1 for error
 130  *
 131  * @new_size: new cache size
 132  * @errp: set *errp if the check failed, with reason
 133  */
 134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 135 {
 136     PageCache *new_cache;
 137     int64_t ret = 0;
 138
 139     /* Check for truncation */
 140     if (new_size != (size_t)new_size) {
 141         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 142                    "exceeding address space");
 143         return -1;
 144     }
 145
 146     if (new_size == migrate_xbzrle_cache_size()) {
 147         /* nothing to do */
 148         return 0;
 149     }
 150
 151     XBZRLE_cache_lock();
 152
 153     if (XBZRLE.cache != NULL) {
 154         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 155         if (!new_cache) {
 156             ret = -1;
 157             goto out;
 158         }
 159
 160         cache_fini(XBZRLE.cache);
 161         XBZRLE.cache = new_cache;
 162     }
 163 out:
 164     XBZRLE_cache_unlock();
 165     return ret;
 166 }
 167
 168 bool ramblock_is_ignored(RAMBlock *block)
 169 {
 170     return !qemu_ram_is_migratable(block) ||
 171            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 172 }
 173
 174 #undef RAMBLOCK_FOREACH
 175
 176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 177 {
 178     RAMBlock *block;
 179     int ret = 0;
 180
 181     RCU_READ_LOCK_GUARD();
 182
 183     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 184         ret = func(block, opaque);
 185         if (ret) {
 186             break;
 187         }
 188     }
 189     return ret;
 190 }
 191
 192 static void ramblock_recv_map_init(void)
 193 {
 194     RAMBlock *rb;
 195
 196     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 197         assert(!rb->receivedmap);
 198         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 199     }
 200 }
 201
 202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 203 {
 204     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 205                     rb->receivedmap);
 206 }
 207
 208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 209 {
 210     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 211 }
 212
 213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 214 {
 215     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 216 }
 217
 218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 219                                     size_t nr)
 220 {
 221     bitmap_set_atomic(rb->receivedmap,
 222                       ramblock_recv_bitmap_offset(host_addr, rb),
 223                       nr);
 224 }
 225
 226 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 227
 228 /*
 229  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 230  *
 231  * Returns >0 if success with sent bytes, or <0 if error.
 232  */
 233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 234                                   const char *block_name)
 235 {
 236     RAMBlock *block = qemu_ram_block_by_name(block_name);
 237     unsigned long *le_bitmap, nbits;
 238     uint64_t size;
 239
 240     if (!block) {
 241         error_report("%s: invalid block name: %s", __func__, block_name);
 242         return -1;
 243     }
 244
 245     nbits = block->used_length >> TARGET_PAGE_BITS;
 246
 247     /*
 248      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 249      * machines we may need 4 more bytes for padding (see below
 250      * comment). So extend it a bit before hand.
 251      */
 252     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 253
 254     /*
 255      * Always use little endian when sending the bitmap. This is
 256      * required that when source and destination VMs are not using the
 257      * same endianness. (Note: big endian won't work.)
 258      */
 259     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 260
 261     /* Size of the bitmap, in bytes */
 262     size = DIV_ROUND_UP(nbits, 8);
 263
 264     /*
 265      * size is always aligned to 8 bytes for 64bit machines, but it
 266      * may not be true for 32bit machines. We need this padding to
 267      * make sure the migration can survive even between 32bit and
 268      * 64bit machines.
 269      */
 270     size = ROUND_UP(size, 8);
 271
 272     qemu_put_be64(file, size);
 273     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 274     /*
 275      * Mark as an end, in case the middle part is screwed up due to
 276      * some "mysterious" reason.
 277      */
 278     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 279     qemu_fflush(file);
 280
 281     g_free(le_bitmap);
 282
 283     if (qemu_file_get_error(file)) {
 284         return qemu_file_get_error(file);
 285     }
 286
 287     return size + sizeof(size);
 288 }
 289
 290 /*
 291  * An outstanding page request, on the source, having been received
 292  * and queued
 293  */
 294 struct RAMSrcPageRequest {
 295     RAMBlock *rb;
 296     hwaddr    offset;
 297     hwaddr    len;
 298
 299     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 300 };
 301
 302 /* State of RAM for migration */
 303 struct RAMState {
 304     /* QEMUFile used for this migration */
 305     QEMUFile *f;
 306     /* UFFD file descriptor, used in 'write-tracking' migration */
 307     int uffdio_fd;
 308     /* Last block that we have visited searching for dirty pages */
 309     RAMBlock *last_seen_block;
 310     /* Last block from where we have sent data */
 311     RAMBlock *last_sent_block;
 312     /* Last dirty target page we have sent */
 313     ram_addr_t last_page;
 314     /* last ram version we have seen */
 315     uint32_t last_version;
 316     /* We are in the first round */
 317     bool ram_bulk_stage;
 318     /* The free page optimization is enabled */
 319     bool fpo_enabled;
 320     /* How many times we have dirty too many pages */
 321     int dirty_rate_high_cnt;
 322     /* these variables are used for bitmap sync */
 323     /* last time we did a full bitmap_sync */
 324     int64_t time_last_bitmap_sync;
 325     /* bytes transferred at start_time */
 326     uint64_t bytes_xfer_prev;
 327     /* number of dirty pages since start_time */
 328     uint64_t num_dirty_pages_period;
 329     /* xbzrle misses since the beginning of the period */
 330     uint64_t xbzrle_cache_miss_prev;
 331     /* Amount of xbzrle pages since the beginning of the period */
 332     uint64_t xbzrle_pages_prev;
 333     /* Amount of xbzrle encoded bytes since the beginning of the period */
 334     uint64_t xbzrle_bytes_prev;
 335
 336     /* compression statistics since the beginning of the period */
 337     /* amount of count that no free thread to compress data */
 338     uint64_t compress_thread_busy_prev;
 339     /* amount bytes after compression */
 340     uint64_t compressed_size_prev;
 341     /* amount of compressed pages */
 342     uint64_t compress_pages_prev;
 343
 344     /* total handled target pages at the beginning of period */
 345     uint64_t target_page_count_prev;
 346     /* total handled target pages since start */
 347     uint64_t target_page_count;
 348     /* number of dirty bits in the bitmap */
 349     uint64_t migration_dirty_pages;
 350     /* Protects modification of the bitmap and migration dirty pages */
 351     QemuMutex bitmap_mutex;
 352     /* The RAMBlock used in the last src_page_requests */
 353     RAMBlock *last_req_rb;
 354     /* Queue of outstanding page requests from the destination */
 355     QemuMutex src_page_req_mutex;
 356     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 357 };
 358 typedef struct RAMState RAMState;
 359
 360 static RAMState *ram_state;
 361
 362 static NotifierWithReturnList precopy_notifier_list;
 363
 364 void precopy_infrastructure_init(void)
 365 {
 366     notifier_with_return_list_init(&precopy_notifier_list);
 367 }
 368
 369 void precopy_add_notifier(NotifierWithReturn *n)
 370 {
 371     notifier_with_return_list_add(&precopy_notifier_list, n);
 372 }
 373
 374 void precopy_remove_notifier(NotifierWithReturn *n)
 375 {
 376     notifier_with_return_remove(n);
 377 }
 378
 379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 380 {
 381     PrecopyNotifyData pnd;
 382     pnd.reason = reason;
 383     pnd.errp = errp;
 384
 385     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 386 }
 387
 388 void precopy_enable_free_page_optimization(void)
 389 {
 390     if (!ram_state) {
 391         return;
 392     }
 393
 394     ram_state->fpo_enabled = true;
 395 }
 396
 397 uint64_t ram_bytes_remaining(void)
 398 {
 399     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 400                        0;
 401 }
 402
 403 MigrationStats ram_counters;
 404
 405 /* used by the search for pages to send */
 406 struct PageSearchStatus {
 407     /* Current block being searched */
 408     RAMBlock    *block;
 409     /* Current page to search from */
 410     unsigned long page;
 411     /* Set once we wrap around */
 412     bool         complete_round;
 413 };
 414 typedef struct PageSearchStatus PageSearchStatus;
 415
 416 CompressionStats compression_counters;
 417
 418 struct CompressParam {
 419     bool done;
 420     bool quit;
 421     bool zero_page;
 422     QEMUFile *file;
 423     QemuMutex mutex;
 424     QemuCond cond;
 425     RAMBlock *block;
 426     ram_addr_t offset;
 427
 428     /* internally used fields */
 429     z_stream stream;
 430     uint8_t *originbuf;
 431 };
 432 typedef struct CompressParam CompressParam;
 433
 434 struct DecompressParam {
 435     bool done;
 436     bool quit;
 437     QemuMutex mutex;
 438     QemuCond cond;
 439     void *des;
 440     uint8_t *compbuf;
 441     int len;
 442     z_stream stream;
 443 };
 444 typedef struct DecompressParam DecompressParam;
 445
 446 static CompressParam *comp_param;
 447 static QemuThread *compress_threads;
 448 /* comp_done_cond is used to wake up the migration thread when
 449  * one of the compression threads has finished the compression.
 450  * comp_done_lock is used to co-work with comp_done_cond.
 451  */
 452 static QemuMutex comp_done_lock;
 453 static QemuCond comp_done_cond;
 454 /* The empty QEMUFileOps will be used by file in CompressParam */
 455 static const QEMUFileOps empty_ops = { };
 456
 457 static QEMUFile *decomp_file;
 458 static DecompressParam *decomp_param;
 459 static QemuThread *decompress_threads;
 460 static QemuMutex decomp_done_lock;
 461 static QemuCond decomp_done_cond;
 462
 463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 464                                  ram_addr_t offset, uint8_t *source_buf);
 465
 466 static void *do_data_compress(void *opaque)
 467 {
 468     CompressParam *param = opaque;
 469     RAMBlock *block;
 470     ram_addr_t offset;
 471     bool zero_page;
 472
 473     qemu_mutex_lock(&param->mutex);
 474     while (!param->quit) {
 475         if (param->block) {
 476             block = param->block;
 477             offset = param->offset;
 478             param->block = NULL;
 479             qemu_mutex_unlock(&param->mutex);
 480
 481             zero_page = do_compress_ram_page(param->file, &param->stream,
 482                                              block, offset, param->originbuf);
 483
 484             qemu_mutex_lock(&comp_done_lock);
 485             param->done = true;
 486             param->zero_page = zero_page;
 487             qemu_cond_signal(&comp_done_cond);
 488             qemu_mutex_unlock(&comp_done_lock);
 489
 490             qemu_mutex_lock(&param->mutex);
 491         } else {
 492             qemu_cond_wait(&param->cond, &param->mutex);
 493         }
 494     }
 495     qemu_mutex_unlock(&param->mutex);
 496
 497     return NULL;
 498 }
 499
 500 static void compress_threads_save_cleanup(void)
 501 {
 502     int i, thread_count;
 503
 504     if (!migrate_use_compression() || !comp_param) {
 505         return;
 506     }
 507
 508     thread_count = migrate_compress_threads();
 509     for (i = 0; i < thread_count; i++) {
 510         /*
 511          * we use it as a indicator which shows if the thread is
 512          * properly init'd or not
 513          */
 514         if (!comp_param[i].file) {
 515             break;
 516         }
 517
 518         qemu_mutex_lock(&comp_param[i].mutex);
 519         comp_param[i].quit = true;
 520         qemu_cond_signal(&comp_param[i].cond);
 521         qemu_mutex_unlock(&comp_param[i].mutex);
 522
 523         qemu_thread_join(compress_threads + i);
 524         qemu_mutex_destroy(&comp_param[i].mutex);
 525         qemu_cond_destroy(&comp_param[i].cond);
 526         deflateEnd(&comp_param[i].stream);
 527         g_free(comp_param[i].originbuf);
 528         qemu_fclose(comp_param[i].file);
 529         comp_param[i].file = NULL;
 530     }
 531     qemu_mutex_destroy(&comp_done_lock);
 532     qemu_cond_destroy(&comp_done_cond);
 533     g_free(compress_threads);
 534     g_free(comp_param);
 535     compress_threads = NULL;
 536     comp_param = NULL;
 537 }
 538
 539 static int compress_threads_save_setup(void)
 540 {
 541     int i, thread_count;
 542
 543     if (!migrate_use_compression()) {
 544         return 0;
 545     }
 546     thread_count = migrate_compress_threads();
 547     compress_threads = g_new0(QemuThread, thread_count);
 548     comp_param = g_new0(CompressParam, thread_count);
 549     qemu_cond_init(&comp_done_cond);
 550     qemu_mutex_init(&comp_done_lock);
 551     for (i = 0; i < thread_count; i++) {
 552         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 553         if (!comp_param[i].originbuf) {
 554             goto exit;
 555         }
 556
 557         if (deflateInit(&comp_param[i].stream,
 558                         migrate_compress_level()) != Z_OK) {
 559             g_free(comp_param[i].originbuf);
 560             goto exit;
 561         }
 562
 563         /* comp_param[i].file is just used as a dummy buffer to save data,
 564          * set its ops to empty.
 565          */
 566         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 567         comp_param[i].done = true;
 568         comp_param[i].quit = false;
 569         qemu_mutex_init(&comp_param[i].mutex);
 570         qemu_cond_init(&comp_param[i].cond);
 571         qemu_thread_create(compress_threads + i, "compress",
 572                            do_data_compress, comp_param + i,
 573                            QEMU_THREAD_JOINABLE);
 574     }
 575     return 0;
 576
 577 exit:
 578     compress_threads_save_cleanup();
 579     return -1;
 580 }
 581
 582 /**
 583  * save_page_header: write page header to wire
 584  *
 585  * If this is the 1st block, it also writes the block identification
 586  *
 587  * Returns the number of bytes written
 588  *
 589  * @f: QEMUFile where to send the data
 590  * @block: block that contains the page we want to send
 591  * @offset: offset inside the block for the page
 592  *          in the lower bits, it contains flags
 593  */
 594 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 595                                ram_addr_t offset)
 596 {
 597     size_t size, len;
 598
 599     if (block == rs->last_sent_block) {
 600         offset |= RAM_SAVE_FLAG_CONTINUE;
 601     }
 602     qemu_put_be64(f, offset);
 603     size = 8;
 604
 605     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 606         len = strlen(block->idstr);
 607         qemu_put_byte(f, len);
 608         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 609         size += 1 + len;
 610         rs->last_sent_block = block;
 611     }
 612     return size;
 613 }
 614
 615 /**
 616  * mig_throttle_guest_down: throotle down the guest
 617  *
 618  * Reduce amount of guest cpu execution to hopefully slow down memory
 619  * writes. If guest dirty memory rate is reduced below the rate at
 620  * which we can transfer pages to the destination then we should be
 621  * able to complete migration. Some workloads dirty memory way too
 622  * fast and will not effectively converge, even with auto-converge.
 623  */
 624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 625                                     uint64_t bytes_dirty_threshold)
 626 {
 627     MigrationState *s = migrate_get_current();
 628     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 629     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 630     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 631     int pct_max = s->parameters.max_cpu_throttle;
 632
 633     uint64_t throttle_now = cpu_throttle_get_percentage();
 634     uint64_t cpu_now, cpu_ideal, throttle_inc;
 635
 636     /* We have not started throttling yet. Let's start it. */
 637     if (!cpu_throttle_active()) {
 638         cpu_throttle_set(pct_initial);
 639     } else {
 640         /* Throttling already on, just increase the rate */
 641         if (!pct_tailslow) {
 642             throttle_inc = pct_increment;
 643         } else {
 644             /* Compute the ideal CPU percentage used by Guest, which may
 645              * make the dirty rate match the dirty rate threshold. */
 646             cpu_now = 100 - throttle_now;
 647             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 648                         bytes_dirty_period);
 649             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 650         }
 651         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 652     }
 653 }
 654
 655 /**
 656  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 657  *
 658  * @rs: current RAM state
 659  * @current_addr: address for the zero page
 660  *
 661  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 662  * The important thing is that a stale (not-yet-0'd) page be replaced
 663  * by the new data.
 664  * As a bonus, if the page wasn't in the cache it gets added so that
 665  * when a small write is made into the 0'd page it gets XBZRLE sent.
 666  */
 667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 668 {
 669     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 670         return;
 671     }
 672
 673     /* We don't care if this fails to allocate a new cache page
 674      * as long as it updated an old one */
 675     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 676                  ram_counters.dirty_sync_count);
 677 }
 678
 679 #define ENCODING_FLAG_XBZRLE 0x1
 680
 681 /**
 682  * save_xbzrle_page: compress and send current page
 683  *
 684  * Returns: 1 means that we wrote the page
 685  *          0 means that page is identical to the one already sent
 686  *          -1 means that xbzrle would be longer than normal
 687  *
 688  * @rs: current RAM state
 689  * @current_data: pointer to the address of the page contents
 690  * @current_addr: addr of the page
 691  * @block: block that contains the page we want to send
 692  * @offset: offset inside the block for the page
 693  * @last_stage: if we are at the completion stage
 694  */
 695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 696                             ram_addr_t current_addr, RAMBlock *block,
 697                             ram_addr_t offset, bool last_stage)
 698 {
 699     int encoded_len = 0, bytes_xbzrle;
 700     uint8_t *prev_cached_page;
 701
 702     if (!cache_is_cached(XBZRLE.cache, current_addr,
 703                          ram_counters.dirty_sync_count)) {
 704         xbzrle_counters.cache_miss++;
 705         if (!last_stage) {
 706             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 707                              ram_counters.dirty_sync_count) == -1) {
 708                 return -1;
 709             } else {
 710                 /* update *current_data when the page has been
 711                    inserted into cache */
 712                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 713             }
 714         }
 715         return -1;
 716     }
 717
 718     /*
 719      * Reaching here means the page has hit the xbzrle cache, no matter what
 720      * encoding result it is (normal encoding, overflow or skipping the page),
 721      * count the page as encoded. This is used to calculate the encoding rate.
 722      *
 723      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 724      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 725      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 726      * skipped page included. In this way, the encoding rate can tell if the
 727      * guest page is good for xbzrle encoding.
 728      */
 729     xbzrle_counters.pages++;
 730     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 731
 732     /* save current buffer into memory */
 733     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 734
 735     /* XBZRLE encoding (if there is no overflow) */
 736     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 737                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 738                                        TARGET_PAGE_SIZE);
 739
 740     /*
 741      * Update the cache contents, so that it corresponds to the data
 742      * sent, in all cases except where we skip the page.
 743      */
 744     if (!last_stage && encoded_len != 0) {
 745         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 746         /*
 747          * In the case where we couldn't compress, ensure that the caller
 748          * sends the data from the cache, since the guest might have
 749          * changed the RAM since we copied it.
 750          */
 751         *current_data = prev_cached_page;
 752     }
 753
 754     if (encoded_len == 0) {
 755         trace_save_xbzrle_page_skipping();
 756         return 0;
 757     } else if (encoded_len == -1) {
 758         trace_save_xbzrle_page_overflow();
 759         xbzrle_counters.overflow++;
 760         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 761         return -1;
 762     }
 763
 764     /* Send XBZRLE based compressed page */
 765     bytes_xbzrle = save_page_header(rs, rs->f, block,
 766                                     offset | RAM_SAVE_FLAG_XBZRLE);
 767     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 768     qemu_put_be16(rs->f, encoded_len);
 769     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 770     bytes_xbzrle += encoded_len + 1 + 2;
 771     /*
 772      * Like compressed_size (please see update_compress_thread_counts),
 773      * the xbzrle encoded bytes don't count the 8 byte header with
 774      * RAM_SAVE_FLAG_CONTINUE.
 775      */
 776     xbzrle_counters.bytes += bytes_xbzrle - 8;
 777     ram_counters.transferred += bytes_xbzrle;
 778
 779     return 1;
 780 }
 781
 782 /**
 783  * migration_bitmap_find_dirty: find the next dirty page from start
 784  *
 785  * Returns the page offset within memory region of the start of a dirty page
 786  *
 787  * @rs: current RAM state
 788  * @rb: RAMBlock where to search for dirty pages
 789  * @start: page where we start the search
 790  */
 791 static inline
 792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 793                                           unsigned long start)
 794 {
 795     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 796     unsigned long *bitmap = rb->bmap;
 797     unsigned long next;
 798
 799     if (ramblock_is_ignored(rb)) {
 800         return size;
 801     }
 802
 803     /*
 804      * When the free page optimization is enabled, we need to check the bitmap
 805      * to send the non-free pages rather than all the pages in the bulk stage.
 806      */
 807     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 808         next = start + 1;
 809     } else {
 810         next = find_next_bit(bitmap, size, start);
 811     }
 812
 813     return next;
 814 }
 815
 816 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 817                                                 RAMBlock *rb,
 818                                                 unsigned long page)
 819 {
 820     bool ret;
 821
 822     qemu_mutex_lock(&rs->bitmap_mutex);
 823
 824     /*
 825      * Clear dirty bitmap if needed.  This _must_ be called before we
 826      * send any of the page in the chunk because we need to make sure
 827      * we can capture further page content changes when we sync dirty
 828      * log the next time.  So as long as we are going to send any of
 829      * the page in the chunk we clear the remote dirty bitmap for all.
 830      * Clearing it earlier won't be a problem, but too late will.
 831      */
 832     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 833         uint8_t shift = rb->clear_bmap_shift;
 834         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 835         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 836
 837         /*
 838          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 839          * can make things easier sometimes since then start address
 840          * of the small chunk will always be 64 pages aligned so the
 841          * bitmap will always be aligned to unsigned long.  We should
 842          * even be able to remove this restriction but I'm simply
 843          * keeping it.
 844          */
 845         assert(shift >= 6);
 846         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 847         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 848     }
 849
 850     ret = test_and_clear_bit(page, rb->bmap);
 851
 852     if (ret) {
 853         rs->migration_dirty_pages--;
 854     }
 855     qemu_mutex_unlock(&rs->bitmap_mutex);
 856
 857     return ret;
 858 }
 859
 860 /* Called with RCU critical section */
 861 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 862 {
 863     uint64_t new_dirty_pages =
 864         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 865
 866     rs->migration_dirty_pages += new_dirty_pages;
 867     rs->num_dirty_pages_period += new_dirty_pages;
 868 }
 869
 870 /**
 871  * ram_pagesize_summary: calculate all the pagesizes of a VM
 872  *
 873  * Returns a summary bitmap of the page sizes of all RAMBlocks
 874  *
 875  * For VMs with just normal pages this is equivalent to the host page
 876  * size. If it's got some huge pages then it's the OR of all the
 877  * different page sizes.
 878  */
 879 uint64_t ram_pagesize_summary(void)
 880 {
 881     RAMBlock *block;
 882     uint64_t summary = 0;
 883
 884     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 885         summary |= block->page_size;
 886     }
 887
 888     return summary;
 889 }
 890
 891 uint64_t ram_get_total_transferred_pages(void)
 892 {
 893     return  ram_counters.normal + ram_counters.duplicate +
 894                 compression_counters.pages + xbzrle_counters.pages;
 895 }
 896
 897 static void migration_update_rates(RAMState *rs, int64_t end_time)
 898 {
 899     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 900     double compressed_size;
 901
 902     /* calculate period counters */
 903     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 904                 / (end_time - rs->time_last_bitmap_sync);
 905
 906     if (!page_count) {
 907         return;
 908     }
 909
 910     if (migrate_use_xbzrle()) {
 911         double encoded_size, unencoded_size;
 912
 913         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 914             rs->xbzrle_cache_miss_prev) / page_count;
 915         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 916         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 917                          TARGET_PAGE_SIZE;
 918         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 919         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 920             xbzrle_counters.encoding_rate = 0;
 921         } else {
 922             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 923         }
 924         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 925         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 926     }
 927
 928     if (migrate_use_compression()) {
 929         compression_counters.busy_rate = (double)(compression_counters.busy -
 930             rs->compress_thread_busy_prev) / page_count;
 931         rs->compress_thread_busy_prev = compression_counters.busy;
 932
 933         compressed_size = compression_counters.compressed_size -
 934                           rs->compressed_size_prev;
 935         if (compressed_size) {
 936             double uncompressed_size = (compression_counters.pages -
 937                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 938
 939             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 940             compression_counters.compression_rate =
 941                                         uncompressed_size / compressed_size;
 942
 943             rs->compress_pages_prev = compression_counters.pages;
 944             rs->compressed_size_prev = compression_counters.compressed_size;
 945         }
 946     }
 947 }
 948
 949 static void migration_trigger_throttle(RAMState *rs)
 950 {
 951     MigrationState *s = migrate_get_current();
 952     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 953
 954     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 955     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 956     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 957
 958     /* During block migration the auto-converge logic incorrectly detects
 959      * that ram migration makes no progress. Avoid this by disabling the
 960      * throttling logic during the bulk phase of block migration. */
 961     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 962         /* The following detection logic can be refined later. For now:
 963            Check to see if the ratio between dirtied bytes and the approx.
 964            amount of bytes that just got transferred since the last time
 965            we were in this routine reaches the threshold. If that happens
 966            twice, start or increase throttling. */
 967
 968         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 969             (++rs->dirty_rate_high_cnt >= 2)) {
 970             trace_migration_throttle();
 971             rs->dirty_rate_high_cnt = 0;
 972             mig_throttle_guest_down(bytes_dirty_period,
 973                                     bytes_dirty_threshold);
 974         }
 975     }
 976 }
 977
 978 static void migration_bitmap_sync(RAMState *rs)
 979 {
 980     RAMBlock *block;
 981     int64_t end_time;
 982
 983     ram_counters.dirty_sync_count++;
 984
 985     if (!rs->time_last_bitmap_sync) {
 986         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 987     }
 988
 989     trace_migration_bitmap_sync_start();
 990     memory_global_dirty_log_sync();
 991
 992     qemu_mutex_lock(&rs->bitmap_mutex);
 993     WITH_RCU_READ_LOCK_GUARD() {
 994         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 995             ramblock_sync_dirty_bitmap(rs, block);
 996         }
 997         ram_counters.remaining = ram_bytes_remaining();
 998     }
 999     qemu_mutex_unlock(&rs->bitmap_mutex);
1000
1001     memory_global_after_dirty_log_sync();
1002     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1003
1004     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1005
1006     /* more than 1 second = 1000 millisecons */
1007     if (end_time > rs->time_last_bitmap_sync + 1000) {
1008         migration_trigger_throttle(rs);
1009
1010         migration_update_rates(rs, end_time);
1011
1012         rs->target_page_count_prev = rs->target_page_count;
1013
1014         /* reset period counters */
1015         rs->time_last_bitmap_sync = end_time;
1016         rs->num_dirty_pages_period = 0;
1017         rs->bytes_xfer_prev = ram_counters.transferred;
1018     }
1019     if (migrate_use_events()) {
1020         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1021     }
1022 }
1023
1024 static void migration_bitmap_sync_precopy(RAMState *rs)
1025 {
1026     Error *local_err = NULL;
1027
1028     /*
1029      * The current notifier usage is just an optimization to migration, so we
1030      * don't stop the normal migration process in the error case.
1031      */
1032     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1033         error_report_err(local_err);
1034         local_err = NULL;
1035     }
1036
1037     migration_bitmap_sync(rs);
1038
1039     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1040         error_report_err(local_err);
1041     }
1042 }
1043
1044 /**
1045  * save_zero_page_to_file: send the zero page to the file
1046  *
1047  * Returns the size of data written to the file, 0 means the page is not
1048  * a zero page
1049  *
1050  * @rs: current RAM state
1051  * @file: the file where the data is saved
1052  * @block: block that contains the page we want to send
1053  * @offset: offset inside the block for the page
1054  */
1055 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1056                                   RAMBlock *block, ram_addr_t offset)
1057 {
1058     uint8_t *p = block->host + offset;
1059     int len = 0;
1060
1061     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1062         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1063         qemu_put_byte(file, 0);
1064         len += 1;
1065     }
1066     return len;
1067 }
1068
1069 /**
1070  * save_zero_page: send the zero page to the stream
1071  *
1072  * Returns the number of pages written.
1073  *
1074  * @rs: current RAM state
1075  * @block: block that contains the page we want to send
1076  * @offset: offset inside the block for the page
1077  */
1078 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1079 {
1080     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1081
1082     if (len) {
1083         ram_counters.duplicate++;
1084         ram_counters.transferred += len;
1085         return 1;
1086     }
1087     return -1;
1088 }
1089
1090 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1091 {
1092     if (!migrate_release_ram() || !migration_in_postcopy()) {
1093         return;
1094     }
1095
1096     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1097 }
1098
1099 /*
1100  * @pages: the number of pages written by the control path,
1101  *        < 0 - error
1102  *        > 0 - number of pages written
1103  *
1104  * Return true if the pages has been saved, otherwise false is returned.
1105  */
1106 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1107                               int *pages)
1108 {
1109     uint64_t bytes_xmit = 0;
1110     int ret;
1111
1112     *pages = -1;
1113     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1114                                 &bytes_xmit);
1115     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1116         return false;
1117     }
1118
1119     if (bytes_xmit) {
1120         ram_counters.transferred += bytes_xmit;
1121         *pages = 1;
1122     }
1123
1124     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1125         return true;
1126     }
1127
1128     if (bytes_xmit > 0) {
1129         ram_counters.normal++;
1130     } else if (bytes_xmit == 0) {
1131         ram_counters.duplicate++;
1132     }
1133
1134     return true;
1135 }
1136
1137 /*
1138  * directly send the page to the stream
1139  *
1140  * Returns the number of pages written.
1141  *
1142  * @rs: current RAM state
1143  * @block: block that contains the page we want to send
1144  * @offset: offset inside the block for the page
1145  * @buf: the page to be sent
1146  * @async: send to page asyncly
1147  */
1148 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1149                             uint8_t *buf, bool async)
1150 {
1151     ram_counters.transferred += save_page_header(rs, rs->f, block,
1152                                                  offset | RAM_SAVE_FLAG_PAGE);
1153     if (async) {
1154         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1155                               migrate_release_ram() &
1156                               migration_in_postcopy());
1157     } else {
1158         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1159     }
1160     ram_counters.transferred += TARGET_PAGE_SIZE;
1161     ram_counters.normal++;
1162     return 1;
1163 }
1164
1165 /**
1166  * ram_save_page: send the given page to the stream
1167  *
1168  * Returns the number of pages written.
1169  *          < 0 - error
1170  *          >=0 - Number of pages written - this might legally be 0
1171  *                if xbzrle noticed the page was the same.
1172  *
1173  * @rs: current RAM state
1174  * @block: block that contains the page we want to send
1175  * @offset: offset inside the block for the page
1176  * @last_stage: if we are at the completion stage
1177  */
1178 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1179 {
1180     int pages = -1;
1181     uint8_t *p;
1182     bool send_async = true;
1183     RAMBlock *block = pss->block;
1184     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1185     ram_addr_t current_addr = block->offset + offset;
1186
1187     p = block->host + offset;
1188     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1189
1190     XBZRLE_cache_lock();
1191     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1192         migrate_use_xbzrle()) {
1193         pages = save_xbzrle_page(rs, &p, current_addr, block,
1194                                  offset, last_stage);
1195         if (!last_stage) {
1196             /* Can't send this cached data async, since the cache page
1197              * might get updated before it gets to the wire
1198              */
1199             send_async = false;
1200         }
1201     }
1202
1203     /* XBZRLE overflow or normal page */
1204     if (pages == -1) {
1205         pages = save_normal_page(rs, block, offset, p, send_async);
1206     }
1207
1208     XBZRLE_cache_unlock();
1209
1210     return pages;
1211 }
1212
1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1214                                  ram_addr_t offset)
1215 {
1216     if (multifd_queue_page(rs->f, block, offset) < 0) {
1217         return -1;
1218     }
1219     ram_counters.normal++;
1220
1221     return 1;
1222 }
1223
1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1225                                  ram_addr_t offset, uint8_t *source_buf)
1226 {
1227     RAMState *rs = ram_state;
1228     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1229     bool zero_page = false;
1230     int ret;
1231
1232     if (save_zero_page_to_file(rs, f, block, offset)) {
1233         zero_page = true;
1234         goto exit;
1235     }
1236
1237     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1238
1239     /*
1240      * copy it to a internal buffer to avoid it being modified by VM
1241      * so that we can catch up the error during compression and
1242      * decompression
1243      */
1244     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1245     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1246     if (ret < 0) {
1247         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1248         error_report("compressed data failed!");
1249         return false;
1250     }
1251
1252 exit:
1253     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1254     return zero_page;
1255 }
1256
1257 static void
1258 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1259 {
1260     ram_counters.transferred += bytes_xmit;
1261
1262     if (param->zero_page) {
1263         ram_counters.duplicate++;
1264         return;
1265     }
1266
1267     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268     compression_counters.compressed_size += bytes_xmit - 8;
1269     compression_counters.pages++;
1270 }
1271
1272 static bool save_page_use_compression(RAMState *rs);
1273
1274 static void flush_compressed_data(RAMState *rs)
1275 {
1276     int idx, len, thread_count;
1277
1278     if (!save_page_use_compression(rs)) {
1279         return;
1280     }
1281     thread_count = migrate_compress_threads();
1282
1283     qemu_mutex_lock(&comp_done_lock);
1284     for (idx = 0; idx < thread_count; idx++) {
1285         while (!comp_param[idx].done) {
1286             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1287         }
1288     }
1289     qemu_mutex_unlock(&comp_done_lock);
1290
1291     for (idx = 0; idx < thread_count; idx++) {
1292         qemu_mutex_lock(&comp_param[idx].mutex);
1293         if (!comp_param[idx].quit) {
1294             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1295             /*
1296              * it's safe to fetch zero_page without holding comp_done_lock
1297              * as there is no further request submitted to the thread,
1298              * i.e, the thread should be waiting for a request at this point.
1299              */
1300             update_compress_thread_counts(&comp_param[idx], len);
1301         }
1302         qemu_mutex_unlock(&comp_param[idx].mutex);
1303     }
1304 }
1305
1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1307                                        ram_addr_t offset)
1308 {
1309     param->block = block;
1310     param->offset = offset;
1311 }
1312
1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1314                                            ram_addr_t offset)
1315 {
1316     int idx, thread_count, bytes_xmit = -1, pages = -1;
1317     bool wait = migrate_compress_wait_thread();
1318
1319     thread_count = migrate_compress_threads();
1320     qemu_mutex_lock(&comp_done_lock);
1321 retry:
1322     for (idx = 0; idx < thread_count; idx++) {
1323         if (comp_param[idx].done) {
1324             comp_param[idx].done = false;
1325             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1326             qemu_mutex_lock(&comp_param[idx].mutex);
1327             set_compress_params(&comp_param[idx], block, offset);
1328             qemu_cond_signal(&comp_param[idx].cond);
1329             qemu_mutex_unlock(&comp_param[idx].mutex);
1330             pages = 1;
1331             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1332             break;
1333         }
1334     }
1335
1336     /*
1337      * wait for the free thread if the user specifies 'compress-wait-thread',
1338      * otherwise we will post the page out in the main thread as normal page.
1339      */
1340     if (pages < 0 && wait) {
1341         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1342         goto retry;
1343     }
1344     qemu_mutex_unlock(&comp_done_lock);
1345
1346     return pages;
1347 }
1348
1349 /**
1350  * find_dirty_block: find the next dirty page and update any state
1351  * associated with the search process.
1352  *
1353  * Returns true if a page is found
1354  *
1355  * @rs: current RAM state
1356  * @pss: data about the state of the current dirty page scan
1357  * @again: set to false if the search has scanned the whole of RAM
1358  */
1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1360 {
1361     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1362     if (pss->complete_round && pss->block == rs->last_seen_block &&
1363         pss->page >= rs->last_page) {
1364         /*
1365          * We've been once around the RAM and haven't found anything.
1366          * Give up.
1367          */
1368         *again = false;
1369         return false;
1370     }
1371     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1372         >= pss->block->used_length) {
1373         /* Didn't find anything in this RAM Block */
1374         pss->page = 0;
1375         pss->block = QLIST_NEXT_RCU(pss->block, next);
1376         if (!pss->block) {
1377             /*
1378              * If memory migration starts over, we will meet a dirtied page
1379              * which may still exists in compression threads's ring, so we
1380              * should flush the compressed data to make sure the new page
1381              * is not overwritten by the old one in the destination.
1382              *
1383              * Also If xbzrle is on, stop using the data compression at this
1384              * point. In theory, xbzrle can do better than compression.
1385              */
1386             flush_compressed_data(rs);
1387
1388             /* Hit the end of the list */
1389             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1390             /* Flag that we've looped */
1391             pss->complete_round = true;
1392             rs->ram_bulk_stage = false;
1393         }
1394         /* Didn't find anything this time, but try again on the new block */
1395         *again = true;
1396         return false;
1397     } else {
1398         /* Can go around again, but... */
1399         *again = true;
1400         /* We've found something so probably don't need to */
1401         return true;
1402     }
1403 }
1404
1405 /**
1406  * unqueue_page: gets a page of the queue
1407  *
1408  * Helper for 'get_queued_page' - gets a page off the queue
1409  *
1410  * Returns the block of the page (or NULL if none available)
1411  *
1412  * @rs: current RAM state
1413  * @offset: used to return the offset within the RAMBlock
1414  */
1415 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1416 {
1417     RAMBlock *block = NULL;
1418
1419     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1420         return NULL;
1421     }
1422
1423     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1424     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1425         struct RAMSrcPageRequest *entry =
1426                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1427         block = entry->rb;
1428         *offset = entry->offset;
1429
1430         if (entry->len > TARGET_PAGE_SIZE) {
1431             entry->len -= TARGET_PAGE_SIZE;
1432             entry->offset += TARGET_PAGE_SIZE;
1433         } else {
1434             memory_region_unref(block->mr);
1435             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1436             g_free(entry);
1437             migration_consume_urgent_request();
1438         }
1439     }
1440
1441     return block;
1442 }
1443
1444 #if defined(__linux__)
1445 /**
1446  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1447  *   is found, return RAM block pointer and page offset
1448  *
1449  * Returns pointer to the RAMBlock containing faulting page,
1450  *   NULL if no write faults are pending
1451  *
1452  * @rs: current RAM state
1453  * @offset: page offset from the beginning of the block
1454  */
1455 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1456 {
1457     struct uffd_msg uffd_msg;
1458     void *page_address;
1459     RAMBlock *bs;
1460     int res;
1461
1462     if (!migrate_background_snapshot()) {
1463         return NULL;
1464     }
1465
1466     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1467     if (res <= 0) {
1468         return NULL;
1469     }
1470
1471     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1472     bs = qemu_ram_block_from_host(page_address, false, offset);
1473     assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
1474     return bs;
1475 }
1476
1477 /**
1478  * ram_save_release_protection: release UFFD write protection after
1479  *   a range of pages has been saved
1480  *
1481  * @rs: current RAM state
1482  * @pss: page-search-status structure
1483  * @start_page: index of the first page in the range relative to pss->block
1484  *
1485  * Returns 0 on success, negative value in case of an error
1486 */
1487 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1488         unsigned long start_page)
1489 {
1490     int res = 0;
1491
1492     /* Check if page is from UFFD-managed region. */
1493     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1494         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1495         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1496
1497         /* Flush async buffers before un-protect. */
1498         qemu_fflush(rs->f);
1499         /* Un-protect memory range. */
1500         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1501                 false, false);
1502     }
1503
1504     return res;
1505 }
1506
1507 /* ram_write_tracking_available: check if kernel supports required UFFD features
1508  *
1509  * Returns true if supports, false otherwise
1510  */
1511 bool ram_write_tracking_available(void)
1512 {
1513     uint64_t uffd_features;
1514     int res;
1515
1516     res = uffd_query_features(&uffd_features);
1517     return (res == 0 &&
1518             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1519 }
1520
1521 /* ram_write_tracking_compatible: check if guest configuration is
1522  *   compatible with 'write-tracking'
1523  *
1524  * Returns true if compatible, false otherwise
1525  */
1526 bool ram_write_tracking_compatible(void)
1527 {
1528     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1529     int uffd_fd;
1530     RAMBlock *bs;
1531     bool ret = false;
1532
1533     /* Open UFFD file descriptor */
1534     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1535     if (uffd_fd < 0) {
1536         return false;
1537     }
1538
1539     RCU_READ_LOCK_GUARD();
1540
1541     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1542         uint64_t uffd_ioctls;
1543
1544         /* Nothing to do with read-only and MMIO-writable regions */
1545         if (bs->mr->readonly || bs->mr->rom_device) {
1546             continue;
1547         }
1548         /* Try to register block memory via UFFD-IO to track writes */
1549         if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
1550                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1551             goto out;
1552         }
1553         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1554             goto out;
1555         }
1556     }
1557     ret = true;
1558
1559 out:
1560     uffd_close_fd(uffd_fd);
1561     return ret;
1562 }
1563
1564 /*
1565  * ram_write_tracking_start: start UFFD-WP memory tracking
1566  *
1567  * Returns 0 for success or negative value in case of error
1568  */
1569 int ram_write_tracking_start(void)
1570 {
1571     int uffd_fd;
1572     RAMState *rs = ram_state;
1573     RAMBlock *bs;
1574
1575     /* Open UFFD file descriptor */
1576     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1577     if (uffd_fd < 0) {
1578         return uffd_fd;
1579     }
1580     rs->uffdio_fd = uffd_fd;
1581
1582     RCU_READ_LOCK_GUARD();
1583
1584     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1585         /* Nothing to do with read-only and MMIO-writable regions */
1586         if (bs->mr->readonly || bs->mr->rom_device) {
1587             continue;
1588         }
1589
1590         /* Register block memory with UFFD to track writes */
1591         if (uffd_register_memory(rs->uffdio_fd, bs->host,
1592                 bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1593             goto fail;
1594         }
1595         /* Apply UFFD write protection to the block memory range */
1596         if (uffd_change_protection(rs->uffdio_fd, bs->host,
1597                 bs->max_length, true, false)) {
1598             goto fail;
1599         }
1600         bs->flags |= RAM_UF_WRITEPROTECT;
1601         memory_region_ref(bs->mr);
1602
1603         trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
1604                 bs->host, bs->max_length);
1605     }
1606
1607     return 0;
1608
1609 fail:
1610     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1611
1612     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1613         if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1614             continue;
1615         }
1616         /*
1617          * In case some memory block failed to be write-protected
1618          * remove protection and unregister all succeeded RAM blocks
1619          */
1620         uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1621         uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1622         /* Cleanup flags and remove reference */
1623         bs->flags &= ~RAM_UF_WRITEPROTECT;
1624         memory_region_unref(bs->mr);
1625     }
1626
1627     uffd_close_fd(uffd_fd);
1628     rs->uffdio_fd = -1;
1629     return -1;
1630 }
1631
1632 /**
1633  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1634  */
1635 void ram_write_tracking_stop(void)
1636 {
1637     RAMState *rs = ram_state;
1638     RAMBlock *bs;
1639
1640     RCU_READ_LOCK_GUARD();
1641
1642     RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1643         if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1644             continue;
1645         }
1646         /* Remove protection and unregister all affected RAM blocks */
1647         uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1648         uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1649
1650         trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
1651                 bs->host, bs->max_length);
1652
1653         /* Cleanup flags and remove reference */
1654         bs->flags &= ~RAM_UF_WRITEPROTECT;
1655         memory_region_unref(bs->mr);
1656     }
1657
1658     /* Finally close UFFD file descriptor */
1659     uffd_close_fd(rs->uffdio_fd);
1660     rs->uffdio_fd = -1;
1661 }
1662
1663 #else
1664 /* No target OS support, stubs just fail or ignore */
1665
1666 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1667 {
1668     (void) rs;
1669     (void) offset;
1670
1671     return NULL;
1672 }
1673
1674 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1675         unsigned long start_page)
1676 {
1677     (void) rs;
1678     (void) pss;
1679     (void) start_page;
1680
1681     return 0;
1682 }
1683
1684 bool ram_write_tracking_available(void)
1685 {
1686     return false;
1687 }
1688
1689 bool ram_write_tracking_compatible(void)
1690 {
1691     assert(0);
1692     return false;
1693 }
1694
1695 int ram_write_tracking_start(void)
1696 {
1697     assert(0);
1698     return -1;
1699 }
1700
1701 void ram_write_tracking_stop(void)
1702 {
1703     assert(0);
1704 }
1705 #endif /* defined(__linux__) */
1706
1707 /**
1708  * get_queued_page: unqueue a page from the postcopy requests
1709  *
1710  * Skips pages that are already sent (!dirty)
1711  *
1712  * Returns true if a queued page is found
1713  *
1714  * @rs: current RAM state
1715  * @pss: data about the state of the current dirty page scan
1716  */
1717 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1718 {
1719     RAMBlock  *block;
1720     ram_addr_t offset;
1721     bool dirty;
1722
1723     do {
1724         block = unqueue_page(rs, &offset);
1725         /*
1726          * We're sending this page, and since it's postcopy nothing else
1727          * will dirty it, and we must make sure it doesn't get sent again
1728          * even if this queue request was received after the background
1729          * search already sent it.
1730          */
1731         if (block) {
1732             unsigned long page;
1733
1734             page = offset >> TARGET_PAGE_BITS;
1735             dirty = test_bit(page, block->bmap);
1736             if (!dirty) {
1737                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1738                                                 page);
1739             } else {
1740                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1741             }
1742         }
1743
1744     } while (block && !dirty);
1745
1746     if (!block) {
1747         /*
1748          * Poll write faults too if background snapshot is enabled; that's
1749          * when we have vcpus got blocked by the write protected pages.
1750          */
1751         block = poll_fault_page(rs, &offset);
1752     }
1753
1754     if (block) {
1755         /*
1756          * As soon as we start servicing pages out of order, then we have
1757          * to kill the bulk stage, since the bulk stage assumes
1758          * in (migration_bitmap_find_and_reset_dirty) that every page is
1759          * dirty, that's no longer true.
1760          */
1761         rs->ram_bulk_stage = false;
1762
1763         /*
1764          * We want the background search to continue from the queued page
1765          * since the guest is likely to want other pages near to the page
1766          * it just requested.
1767          */
1768         pss->block = block;
1769         pss->page = offset >> TARGET_PAGE_BITS;
1770
1771         /*
1772          * This unqueued page would break the "one round" check, even is
1773          * really rare.
1774          */
1775         pss->complete_round = false;
1776     }
1777
1778     return !!block;
1779 }
1780
1781 /**
1782  * migration_page_queue_free: drop any remaining pages in the ram
1783  * request queue
1784  *
1785  * It should be empty at the end anyway, but in error cases there may
1786  * be some left.  in case that there is any page left, we drop it.
1787  *
1788  */
1789 static void migration_page_queue_free(RAMState *rs)
1790 {
1791     struct RAMSrcPageRequest *mspr, *next_mspr;
1792     /* This queue generally should be empty - but in the case of a failed
1793      * migration might have some droppings in.
1794      */
1795     RCU_READ_LOCK_GUARD();
1796     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1797         memory_region_unref(mspr->rb->mr);
1798         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1799         g_free(mspr);
1800     }
1801 }
1802
1803 /**
1804  * ram_save_queue_pages: queue the page for transmission
1805  *
1806  * A request from postcopy destination for example.
1807  *
1808  * Returns zero on success or negative on error
1809  *
1810  * @rbname: Name of the RAMBLock of the request. NULL means the
1811  *          same that last one.
1812  * @start: starting address from the start of the RAMBlock
1813  * @len: length (in bytes) to send
1814  */
1815 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1816 {
1817     RAMBlock *ramblock;
1818     RAMState *rs = ram_state;
1819
1820     ram_counters.postcopy_requests++;
1821     RCU_READ_LOCK_GUARD();
1822
1823     if (!rbname) {
1824         /* Reuse last RAMBlock */
1825         ramblock = rs->last_req_rb;
1826
1827         if (!ramblock) {
1828             /*
1829              * Shouldn't happen, we can't reuse the last RAMBlock if
1830              * it's the 1st request.
1831              */
1832             error_report("ram_save_queue_pages no previous block");
1833             return -1;
1834         }
1835     } else {
1836         ramblock = qemu_ram_block_by_name(rbname);
1837
1838         if (!ramblock) {
1839             /* We shouldn't be asked for a non-existent RAMBlock */
1840             error_report("ram_save_queue_pages no block '%s'", rbname);
1841             return -1;
1842         }
1843         rs->last_req_rb = ramblock;
1844     }
1845     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1846     if (start + len > ramblock->used_length) {
1847         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1848                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1849                      __func__, start, len, ramblock->used_length);
1850         return -1;
1851     }
1852
1853     struct RAMSrcPageRequest *new_entry =
1854         g_malloc0(sizeof(struct RAMSrcPageRequest));
1855     new_entry->rb = ramblock;
1856     new_entry->offset = start;
1857     new_entry->len = len;
1858
1859     memory_region_ref(ramblock->mr);
1860     qemu_mutex_lock(&rs->src_page_req_mutex);
1861     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1862     migration_make_urgent_request();
1863     qemu_mutex_unlock(&rs->src_page_req_mutex);
1864
1865     return 0;
1866 }
1867
1868 static bool save_page_use_compression(RAMState *rs)
1869 {
1870     if (!migrate_use_compression()) {
1871         return false;
1872     }
1873
1874     /*
1875      * If xbzrle is on, stop using the data compression after first
1876      * round of migration even if compression is enabled. In theory,
1877      * xbzrle can do better than compression.
1878      */
1879     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1880         return true;
1881     }
1882
1883     return false;
1884 }
1885
1886 /*
1887  * try to compress the page before posting it out, return true if the page
1888  * has been properly handled by compression, otherwise needs other
1889  * paths to handle it
1890  */
1891 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1892 {
1893     if (!save_page_use_compression(rs)) {
1894         return false;
1895     }
1896
1897     /*
1898      * When starting the process of a new block, the first page of
1899      * the block should be sent out before other pages in the same
1900      * block, and all the pages in last block should have been sent
1901      * out, keeping this order is important, because the 'cont' flag
1902      * is used to avoid resending the block name.
1903      *
1904      * We post the fist page as normal page as compression will take
1905      * much CPU resource.
1906      */
1907     if (block != rs->last_sent_block) {
1908         flush_compressed_data(rs);
1909         return false;
1910     }
1911
1912     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1913         return true;
1914     }
1915
1916     compression_counters.busy++;
1917     return false;
1918 }
1919
1920 /**
1921  * ram_save_target_page: save one target page
1922  *
1923  * Returns the number of pages written
1924  *
1925  * @rs: current RAM state
1926  * @pss: data about the page we want to send
1927  * @last_stage: if we are at the completion stage
1928  */
1929 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1930                                 bool last_stage)
1931 {
1932     RAMBlock *block = pss->block;
1933     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1934     int res;
1935
1936     if (control_save_page(rs, block, offset, &res)) {
1937         return res;
1938     }
1939
1940     if (save_compress_page(rs, block, offset)) {
1941         return 1;
1942     }
1943
1944     res = save_zero_page(rs, block, offset);
1945     if (res > 0) {
1946         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1947          * page would be stale
1948          */
1949         if (!save_page_use_compression(rs)) {
1950             XBZRLE_cache_lock();
1951             xbzrle_cache_zero_page(rs, block->offset + offset);
1952             XBZRLE_cache_unlock();
1953         }
1954         ram_release_pages(block->idstr, offset, res);
1955         return res;
1956     }
1957
1958     /*
1959      * Do not use multifd for:
1960      * 1. Compression as the first page in the new block should be posted out
1961      *    before sending the compressed page
1962      * 2. In postcopy as one whole host page should be placed
1963      */
1964     if (!save_page_use_compression(rs) && migrate_use_multifd()
1965         && !migration_in_postcopy()) {
1966         return ram_save_multifd_page(rs, block, offset);
1967     }
1968
1969     return ram_save_page(rs, pss, last_stage);
1970 }
1971
1972 /**
1973  * ram_save_host_page: save a whole host page
1974  *
1975  * Starting at *offset send pages up to the end of the current host
1976  * page. It's valid for the initial offset to point into the middle of
1977  * a host page in which case the remainder of the hostpage is sent.
1978  * Only dirty target pages are sent. Note that the host page size may
1979  * be a huge page for this block.
1980  * The saving stops at the boundary of the used_length of the block
1981  * if the RAMBlock isn't a multiple of the host page size.
1982  *
1983  * Returns the number of pages written or negative on error
1984  *
1985  * @rs: current RAM state
1986  * @ms: current migration state
1987  * @pss: data about the page we want to send
1988  * @last_stage: if we are at the completion stage
1989  */
1990 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1991                               bool last_stage)
1992 {
1993     int tmppages, pages = 0;
1994     size_t pagesize_bits =
1995         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1996     unsigned long start_page = pss->page;
1997     int res;
1998
1999     if (ramblock_is_ignored(pss->block)) {
2000         error_report("block %s should not be migrated !", pss->block->idstr);
2001         return 0;
2002     }
2003
2004     do {
2005         /* Check the pages is dirty and if it is send it */
2006         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2007             pss->page++;
2008             continue;
2009         }
2010
2011         tmppages = ram_save_target_page(rs, pss, last_stage);
2012         if (tmppages < 0) {
2013             return tmppages;
2014         }
2015
2016         pages += tmppages;
2017         pss->page++;
2018         /* Allow rate limiting to happen in the middle of huge pages */
2019         migration_rate_limit();
2020     } while ((pss->page & (pagesize_bits - 1)) &&
2021              offset_in_ramblock(pss->block,
2022                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2023     /* The offset we leave with is the last one we looked at */
2024     pss->page--;
2025
2026     res = ram_save_release_protection(rs, pss, start_page);
2027     return (res < 0 ? res : pages);
2028 }
2029
2030 /**
2031  * ram_find_and_save_block: finds a dirty page and sends it to f
2032  *
2033  * Called within an RCU critical section.
2034  *
2035  * Returns the number of pages written where zero means no dirty pages,
2036  * or negative on error
2037  *
2038  * @rs: current RAM state
2039  * @last_stage: if we are at the completion stage
2040  *
2041  * On systems where host-page-size > target-page-size it will send all the
2042  * pages in a host page that are dirty.
2043  */
2044
2045 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2046 {
2047     PageSearchStatus pss;
2048     int pages = 0;
2049     bool again, found;
2050
2051     /* No dirty page as there is zero RAM */
2052     if (!ram_bytes_total()) {
2053         return pages;
2054     }
2055
2056     pss.block = rs->last_seen_block;
2057     pss.page = rs->last_page;
2058     pss.complete_round = false;
2059
2060     if (!pss.block) {
2061         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2062     }
2063
2064     do {
2065         again = true;
2066         found = get_queued_page(rs, &pss);
2067
2068         if (!found) {
2069             /* priority queue empty, so just search for something dirty */
2070             found = find_dirty_block(rs, &pss, &again);
2071         }
2072
2073         if (found) {
2074             pages = ram_save_host_page(rs, &pss, last_stage);
2075         }
2076     } while (!pages && again);
2077
2078     rs->last_seen_block = pss.block;
2079     rs->last_page = pss.page;
2080
2081     return pages;
2082 }
2083
2084 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2085 {
2086     uint64_t pages = size / TARGET_PAGE_SIZE;
2087
2088     if (zero) {
2089         ram_counters.duplicate += pages;
2090     } else {
2091         ram_counters.normal += pages;
2092         ram_counters.transferred += size;
2093         qemu_update_position(f, size);
2094     }
2095 }
2096
2097 static uint64_t ram_bytes_total_common(bool count_ignored)
2098 {
2099     RAMBlock *block;
2100     uint64_t total = 0;
2101
2102     RCU_READ_LOCK_GUARD();
2103
2104     if (count_ignored) {
2105         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2106             total += block->used_length;
2107         }
2108     } else {
2109         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2110             total += block->used_length;
2111         }
2112     }
2113     return total;
2114 }
2115
2116 uint64_t ram_bytes_total(void)
2117 {
2118     return ram_bytes_total_common(false);
2119 }
2120
2121 static void xbzrle_load_setup(void)
2122 {
2123     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2124 }
2125
2126 static void xbzrle_load_cleanup(void)
2127 {
2128     g_free(XBZRLE.decoded_buf);
2129     XBZRLE.decoded_buf = NULL;
2130 }
2131
2132 static void ram_state_cleanup(RAMState **rsp)
2133 {
2134     if (*rsp) {
2135         migration_page_queue_free(*rsp);
2136         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2137         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2138         g_free(*rsp);
2139         *rsp = NULL;
2140     }
2141 }
2142
2143 static void xbzrle_cleanup(void)
2144 {
2145     XBZRLE_cache_lock();
2146     if (XBZRLE.cache) {
2147         cache_fini(XBZRLE.cache);
2148         g_free(XBZRLE.encoded_buf);
2149         g_free(XBZRLE.current_buf);
2150         g_free(XBZRLE.zero_target_page);
2151         XBZRLE.cache = NULL;
2152         XBZRLE.encoded_buf = NULL;
2153         XBZRLE.current_buf = NULL;
2154         XBZRLE.zero_target_page = NULL;
2155     }
2156     XBZRLE_cache_unlock();
2157 }
2158
2159 static void ram_save_cleanup(void *opaque)
2160 {
2161     RAMState **rsp = opaque;
2162     RAMBlock *block;
2163
2164     /* We don't use dirty log with background snapshots */
2165     if (!migrate_background_snapshot()) {
2166         /* caller have hold iothread lock or is in a bh, so there is
2167          * no writing race against the migration bitmap
2168          */
2169         memory_global_dirty_log_stop();
2170     }
2171
2172     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2173         g_free(block->clear_bmap);
2174         block->clear_bmap = NULL;
2175         g_free(block->bmap);
2176         block->bmap = NULL;
2177     }
2178
2179     xbzrle_cleanup();
2180     compress_threads_save_cleanup();
2181     ram_state_cleanup(rsp);
2182 }
2183
2184 static void ram_state_reset(RAMState *rs)
2185 {
2186     rs->last_seen_block = NULL;
2187     rs->last_sent_block = NULL;
2188     rs->last_page = 0;
2189     rs->last_version = ram_list.version;
2190     rs->ram_bulk_stage = true;
2191     rs->fpo_enabled = false;
2192 }
2193
2194 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2195
2196 /*
2197  * 'expected' is the value you expect the bitmap mostly to be full
2198  * of; it won't bother printing lines that are all this value.
2199  * If 'todump' is null the migration bitmap is dumped.
2200  */
2201 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2202                            unsigned long pages)
2203 {
2204     int64_t cur;
2205     int64_t linelen = 128;
2206     char linebuf[129];
2207
2208     for (cur = 0; cur < pages; cur += linelen) {
2209         int64_t curb;
2210         bool found = false;
2211         /*
2212          * Last line; catch the case where the line length
2213          * is longer than remaining ram
2214          */
2215         if (cur + linelen > pages) {
2216             linelen = pages - cur;
2217         }
2218         for (curb = 0; curb < linelen; curb++) {
2219             bool thisbit = test_bit(cur + curb, todump);
2220             linebuf[curb] = thisbit ? '1' : '.';
2221             found = found || (thisbit != expected);
2222         }
2223         if (found) {
2224             linebuf[curb] = '\0';
2225             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2226         }
2227     }
2228 }
2229
2230 /* **** functions for postcopy ***** */
2231
2232 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2233 {
2234     struct RAMBlock *block;
2235
2236     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2237         unsigned long *bitmap = block->bmap;
2238         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2239         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2240
2241         while (run_start < range) {
2242             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2243             ram_discard_range(block->idstr,
2244                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2245                               ((ram_addr_t)(run_end - run_start))
2246                                 << TARGET_PAGE_BITS);
2247             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2248         }
2249     }
2250 }
2251
2252 /**
2253  * postcopy_send_discard_bm_ram: discard a RAMBlock
2254  *
2255  * Returns zero on success
2256  *
2257  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2258  *
2259  * @ms: current migration state
2260  * @block: RAMBlock to discard
2261  */
2262 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2263 {
2264     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2265     unsigned long current;
2266     unsigned long *bitmap = block->bmap;
2267
2268     for (current = 0; current < end; ) {
2269         unsigned long one = find_next_bit(bitmap, end, current);
2270         unsigned long zero, discard_length;
2271
2272         if (one >= end) {
2273             break;
2274         }
2275
2276         zero = find_next_zero_bit(bitmap, end, one + 1);
2277
2278         if (zero >= end) {
2279             discard_length = end - one;
2280         } else {
2281             discard_length = zero - one;
2282         }
2283         postcopy_discard_send_range(ms, one, discard_length);
2284         current = one + discard_length;
2285     }
2286
2287     return 0;
2288 }
2289
2290 /**
2291  * postcopy_each_ram_send_discard: discard all RAMBlocks
2292  *
2293  * Returns 0 for success or negative for error
2294  *
2295  * Utility for the outgoing postcopy code.
2296  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2297  *   passing it bitmap indexes and name.
2298  * (qemu_ram_foreach_block ends up passing unscaled lengths
2299  *  which would mean postcopy code would have to deal with target page)
2300  *
2301  * @ms: current migration state
2302  */
2303 static int postcopy_each_ram_send_discard(MigrationState *ms)
2304 {
2305     struct RAMBlock *block;
2306     int ret;
2307
2308     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2309         postcopy_discard_send_init(ms, block->idstr);
2310
2311         /*
2312          * Postcopy sends chunks of bitmap over the wire, but it
2313          * just needs indexes at this point, avoids it having
2314          * target page specific code.
2315          */
2316         ret = postcopy_send_discard_bm_ram(ms, block);
2317         postcopy_discard_send_finish(ms);
2318         if (ret) {
2319             return ret;
2320         }
2321     }
2322
2323     return 0;
2324 }
2325
2326 /**
2327  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2328  *
2329  * Helper for postcopy_chunk_hostpages; it's called twice to
2330  * canonicalize the two bitmaps, that are similar, but one is
2331  * inverted.
2332  *
2333  * Postcopy requires that all target pages in a hostpage are dirty or
2334  * clean, not a mix.  This function canonicalizes the bitmaps.
2335  *
2336  * @ms: current migration state
2337  * @block: block that contains the page we want to canonicalize
2338  */
2339 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2340 {
2341     RAMState *rs = ram_state;
2342     unsigned long *bitmap = block->bmap;
2343     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2344     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2345     unsigned long run_start;
2346
2347     if (block->page_size == TARGET_PAGE_SIZE) {
2348         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2349         return;
2350     }
2351
2352     /* Find a dirty page */
2353     run_start = find_next_bit(bitmap, pages, 0);
2354
2355     while (run_start < pages) {
2356
2357         /*
2358          * If the start of this run of pages is in the middle of a host
2359          * page, then we need to fixup this host page.
2360          */
2361         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2362             /* Find the end of this run */
2363             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2364             /*
2365              * If the end isn't at the start of a host page, then the
2366              * run doesn't finish at the end of a host page
2367              * and we need to discard.
2368              */
2369         }
2370
2371         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2372             unsigned long page;
2373             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2374                                                              host_ratio);
2375             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2376
2377             /* Clean up the bitmap */
2378             for (page = fixup_start_addr;
2379                  page < fixup_start_addr + host_ratio; page++) {
2380                 /*
2381                  * Remark them as dirty, updating the count for any pages
2382                  * that weren't previously dirty.
2383                  */
2384                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2385             }
2386         }
2387
2388         /* Find the next dirty page for the next iteration */
2389         run_start = find_next_bit(bitmap, pages, run_start);
2390     }
2391 }
2392
2393 /**
2394  * postcopy_chunk_hostpages: discard any partially sent host page
2395  *
2396  * Utility for the outgoing postcopy code.
2397  *
2398  * Discard any partially sent host-page size chunks, mark any partially
2399  * dirty host-page size chunks as all dirty.  In this case the host-page
2400  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2401  *
2402  * Returns zero on success
2403  *
2404  * @ms: current migration state
2405  * @block: block we want to work with
2406  */
2407 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2408 {
2409     postcopy_discard_send_init(ms, block->idstr);
2410
2411     /*
2412      * Ensure that all partially dirty host pages are made fully dirty.
2413      */
2414     postcopy_chunk_hostpages_pass(ms, block);
2415
2416     postcopy_discard_send_finish(ms);
2417     return 0;
2418 }
2419
2420 /**
2421  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2422  *
2423  * Returns zero on success
2424  *
2425  * Transmit the set of pages to be discarded after precopy to the target
2426  * these are pages that:
2427  *     a) Have been previously transmitted but are now dirty again
2428  *     b) Pages that have never been transmitted, this ensures that
2429  *        any pages on the destination that have been mapped by background
2430  *        tasks get discarded (transparent huge pages is the specific concern)
2431  * Hopefully this is pretty sparse
2432  *
2433  * @ms: current migration state
2434  */
2435 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2436 {
2437     RAMState *rs = ram_state;
2438     RAMBlock *block;
2439     int ret;
2440
2441     RCU_READ_LOCK_GUARD();
2442
2443     /* This should be our last sync, the src is now paused */
2444     migration_bitmap_sync(rs);
2445
2446     /* Easiest way to make sure we don't resume in the middle of a host-page */
2447     rs->last_seen_block = NULL;
2448     rs->last_sent_block = NULL;
2449     rs->last_page = 0;
2450
2451     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2452         /* Deal with TPS != HPS and huge pages */
2453         ret = postcopy_chunk_hostpages(ms, block);
2454         if (ret) {
2455             return ret;
2456         }
2457
2458 #ifdef DEBUG_POSTCOPY
2459         ram_debug_dump_bitmap(block->bmap, true,
2460                               block->used_length >> TARGET_PAGE_BITS);
2461 #endif
2462     }
2463     trace_ram_postcopy_send_discard_bitmap();
2464
2465     return postcopy_each_ram_send_discard(ms);
2466 }
2467
2468 /**
2469  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2470  *
2471  * Returns zero on success
2472  *
2473  * @rbname: name of the RAMBlock of the request. NULL means the
2474  *          same that last one.
2475  * @start: RAMBlock starting page
2476  * @length: RAMBlock size
2477  */
2478 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2479 {
2480     trace_ram_discard_range(rbname, start, length);
2481
2482     RCU_READ_LOCK_GUARD();
2483     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2484
2485     if (!rb) {
2486         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2487         return -1;
2488     }
2489
2490     /*
2491      * On source VM, we don't need to update the received bitmap since
2492      * we don't even have one.
2493      */
2494     if (rb->receivedmap) {
2495         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2496                      length >> qemu_target_page_bits());
2497     }
2498
2499     return ram_block_discard_range(rb, start, length);
2500 }
2501
2502 /*
2503  * For every allocation, we will try not to crash the VM if the
2504  * allocation failed.
2505  */
2506 static int xbzrle_init(void)
2507 {
2508     Error *local_err = NULL;
2509
2510     if (!migrate_use_xbzrle()) {
2511         return 0;
2512     }
2513
2514     XBZRLE_cache_lock();
2515
2516     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2517     if (!XBZRLE.zero_target_page) {
2518         error_report("%s: Error allocating zero page", __func__);
2519         goto err_out;
2520     }
2521
2522     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2523                               TARGET_PAGE_SIZE, &local_err);
2524     if (!XBZRLE.cache) {
2525         error_report_err(local_err);
2526         goto free_zero_page;
2527     }
2528
2529     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2530     if (!XBZRLE.encoded_buf) {
2531         error_report("%s: Error allocating encoded_buf", __func__);
2532         goto free_cache;
2533     }
2534
2535     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2536     if (!XBZRLE.current_buf) {
2537         error_report("%s: Error allocating current_buf", __func__);
2538         goto free_encoded_buf;
2539     }
2540
2541     /* We are all good */
2542     XBZRLE_cache_unlock();
2543     return 0;
2544
2545 free_encoded_buf:
2546     g_free(XBZRLE.encoded_buf);
2547     XBZRLE.encoded_buf = NULL;
2548 free_cache:
2549     cache_fini(XBZRLE.cache);
2550     XBZRLE.cache = NULL;
2551 free_zero_page:
2552     g_free(XBZRLE.zero_target_page);
2553     XBZRLE.zero_target_page = NULL;
2554 err_out:
2555     XBZRLE_cache_unlock();
2556     return -ENOMEM;
2557 }
2558
2559 static int ram_state_init(RAMState **rsp)
2560 {
2561     *rsp = g_try_new0(RAMState, 1);
2562
2563     if (!*rsp) {
2564         error_report("%s: Init ramstate fail", __func__);
2565         return -1;
2566     }
2567
2568     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2569     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2570     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2571
2572     /*
2573      * Count the total number of pages used by ram blocks not including any
2574      * gaps due to alignment or unplugs.
2575      * This must match with the initial values of dirty bitmap.
2576      */
2577     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2578     ram_state_reset(*rsp);
2579
2580     return 0;
2581 }
2582
2583 static void ram_list_init_bitmaps(void)
2584 {
2585     MigrationState *ms = migrate_get_current();
2586     RAMBlock *block;
2587     unsigned long pages;
2588     uint8_t shift;
2589
2590     /* Skip setting bitmap if there is no RAM */
2591     if (ram_bytes_total()) {
2592         shift = ms->clear_bitmap_shift;
2593         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2594             error_report("clear_bitmap_shift (%u) too big, using "
2595                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2596             shift = CLEAR_BITMAP_SHIFT_MAX;
2597         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2598             error_report("clear_bitmap_shift (%u) too small, using "
2599                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2600             shift = CLEAR_BITMAP_SHIFT_MIN;
2601         }
2602
2603         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2604             pages = block->max_length >> TARGET_PAGE_BITS;
2605             /*
2606              * The initial dirty bitmap for migration must be set with all
2607              * ones to make sure we'll migrate every guest RAM page to
2608              * destination.
2609              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2610              * new migration after a failed migration, ram_list.
2611              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2612              * guest memory.
2613              */
2614             block->bmap = bitmap_new(pages);
2615             bitmap_set(block->bmap, 0, pages);
2616             block->clear_bmap_shift = shift;
2617             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2618         }
2619     }
2620 }
2621
2622 static void ram_init_bitmaps(RAMState *rs)
2623 {
2624     /* For memory_global_dirty_log_start below.  */
2625     qemu_mutex_lock_iothread();
2626     qemu_mutex_lock_ramlist();
2627
2628     WITH_RCU_READ_LOCK_GUARD() {
2629         ram_list_init_bitmaps();
2630         /* We don't use dirty log with background snapshots */
2631         if (!migrate_background_snapshot()) {
2632             memory_global_dirty_log_start();
2633             migration_bitmap_sync_precopy(rs);
2634         }
2635     }
2636     qemu_mutex_unlock_ramlist();
2637     qemu_mutex_unlock_iothread();
2638 }
2639
2640 static int ram_init_all(RAMState **rsp)
2641 {
2642     if (ram_state_init(rsp)) {
2643         return -1;
2644     }
2645
2646     if (xbzrle_init()) {
2647         ram_state_cleanup(rsp);
2648         return -1;
2649     }
2650
2651     ram_init_bitmaps(*rsp);
2652
2653     return 0;
2654 }
2655
2656 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2657 {
2658     RAMBlock *block;
2659     uint64_t pages = 0;
2660
2661     /*
2662      * Postcopy is not using xbzrle/compression, so no need for that.
2663      * Also, since source are already halted, we don't need to care
2664      * about dirty page logging as well.
2665      */
2666
2667     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2668         pages += bitmap_count_one(block->bmap,
2669                                   block->used_length >> TARGET_PAGE_BITS);
2670     }
2671
2672     /* This may not be aligned with current bitmaps. Recalculate. */
2673     rs->migration_dirty_pages = pages;
2674
2675     rs->last_seen_block = NULL;
2676     rs->last_sent_block = NULL;
2677     rs->last_page = 0;
2678     rs->last_version = ram_list.version;
2679     /*
2680      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2681      * matter what we have sent.
2682      */
2683     rs->ram_bulk_stage = false;
2684
2685     /* Update RAMState cache of output QEMUFile */
2686     rs->f = out;
2687
2688     trace_ram_state_resume_prepare(pages);
2689 }
2690
2691 /*
2692  * This function clears bits of the free pages reported by the caller from the
2693  * migration dirty bitmap. @addr is the host address corresponding to the
2694  * start of the continuous guest free pages, and @len is the total bytes of
2695  * those pages.
2696  */
2697 void qemu_guest_free_page_hint(void *addr, size_t len)
2698 {
2699     RAMBlock *block;
2700     ram_addr_t offset;
2701     size_t used_len, start, npages;
2702     MigrationState *s = migrate_get_current();
2703
2704     /* This function is currently expected to be used during live migration */
2705     if (!migration_is_setup_or_active(s->state)) {
2706         return;
2707     }
2708
2709     for (; len > 0; len -= used_len, addr += used_len) {
2710         block = qemu_ram_block_from_host(addr, false, &offset);
2711         if (unlikely(!block || offset >= block->used_length)) {
2712             /*
2713              * The implementation might not support RAMBlock resize during
2714              * live migration, but it could happen in theory with future
2715              * updates. So we add a check here to capture that case.
2716              */
2717             error_report_once("%s unexpected error", __func__);
2718             return;
2719         }
2720
2721         if (len <= block->used_length - offset) {
2722             used_len = len;
2723         } else {
2724             used_len = block->used_length - offset;
2725         }
2726
2727         start = offset >> TARGET_PAGE_BITS;
2728         npages = used_len >> TARGET_PAGE_BITS;
2729
2730         qemu_mutex_lock(&ram_state->bitmap_mutex);
2731         ram_state->migration_dirty_pages -=
2732                       bitmap_count_one_with_offset(block->bmap, start, npages);
2733         bitmap_clear(block->bmap, start, npages);
2734         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2735     }
2736 }
2737
2738 /*
2739  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2740  * long-running RCU critical section.  When rcu-reclaims in the code
2741  * start to become numerous it will be necessary to reduce the
2742  * granularity of these critical sections.
2743  */
2744
2745 /**
2746  * ram_save_setup: Setup RAM for migration
2747  *
2748  * Returns zero to indicate success and negative for error
2749  *
2750  * @f: QEMUFile where to send the data
2751  * @opaque: RAMState pointer
2752  */
2753 static int ram_save_setup(QEMUFile *f, void *opaque)
2754 {
2755     RAMState **rsp = opaque;
2756     RAMBlock *block;
2757
2758     if (compress_threads_save_setup()) {
2759         return -1;
2760     }
2761
2762     /* migration has already setup the bitmap, reuse it. */
2763     if (!migration_in_colo_state()) {
2764         if (ram_init_all(rsp) != 0) {
2765             compress_threads_save_cleanup();
2766             return -1;
2767         }
2768     }
2769     (*rsp)->f = f;
2770
2771     WITH_RCU_READ_LOCK_GUARD() {
2772         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2773
2774         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2775             qemu_put_byte(f, strlen(block->idstr));
2776             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2777             qemu_put_be64(f, block->used_length);
2778             if (migrate_postcopy_ram() && block->page_size !=
2779                                           qemu_host_page_size) {
2780                 qemu_put_be64(f, block->page_size);
2781             }
2782             if (migrate_ignore_shared()) {
2783                 qemu_put_be64(f, block->mr->addr);
2784             }
2785         }
2786     }
2787
2788     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2789     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2790
2791     multifd_send_sync_main(f);
2792     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2793     qemu_fflush(f);
2794
2795     return 0;
2796 }
2797
2798 /**
2799  * ram_save_iterate: iterative stage for migration
2800  *
2801  * Returns zero to indicate success and negative for error
2802  *
2803  * @f: QEMUFile where to send the data
2804  * @opaque: RAMState pointer
2805  */
2806 static int ram_save_iterate(QEMUFile *f, void *opaque)
2807 {
2808     RAMState **temp = opaque;
2809     RAMState *rs = *temp;
2810     int ret = 0;
2811     int i;
2812     int64_t t0;
2813     int done = 0;
2814
2815     if (blk_mig_bulk_active()) {
2816         /* Avoid transferring ram during bulk phase of block migration as
2817          * the bulk phase will usually take a long time and transferring
2818          * ram updates during that time is pointless. */
2819         goto out;
2820     }
2821
2822     WITH_RCU_READ_LOCK_GUARD() {
2823         if (ram_list.version != rs->last_version) {
2824             ram_state_reset(rs);
2825         }
2826
2827         /* Read version before ram_list.blocks */
2828         smp_rmb();
2829
2830         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2831
2832         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2833         i = 0;
2834         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2835                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2836             int pages;
2837
2838             if (qemu_file_get_error(f)) {
2839                 break;
2840             }
2841
2842             pages = ram_find_and_save_block(rs, false);
2843             /* no more pages to sent */
2844             if (pages == 0) {
2845                 done = 1;
2846                 break;
2847             }
2848
2849             if (pages < 0) {
2850                 qemu_file_set_error(f, pages);
2851                 break;
2852             }
2853
2854             rs->target_page_count += pages;
2855
2856             /*
2857              * During postcopy, it is necessary to make sure one whole host
2858              * page is sent in one chunk.
2859              */
2860             if (migrate_postcopy_ram()) {
2861                 flush_compressed_data(rs);
2862             }
2863
2864             /*
2865              * we want to check in the 1st loop, just in case it was the 1st
2866              * time and we had to sync the dirty bitmap.
2867              * qemu_clock_get_ns() is a bit expensive, so we only check each
2868              * some iterations
2869              */
2870             if ((i & 63) == 0) {
2871                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2872                               1000000;
2873                 if (t1 > MAX_WAIT) {
2874                     trace_ram_save_iterate_big_wait(t1, i);
2875                     break;
2876                 }
2877             }
2878             i++;
2879         }
2880     }
2881
2882     /*
2883      * Must occur before EOS (or any QEMUFile operation)
2884      * because of RDMA protocol.
2885      */
2886     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2887
2888 out:
2889     if (ret >= 0
2890         && migration_is_setup_or_active(migrate_get_current()->state)) {
2891         multifd_send_sync_main(rs->f);
2892         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2893         qemu_fflush(f);
2894         ram_counters.transferred += 8;
2895
2896         ret = qemu_file_get_error(f);
2897     }
2898     if (ret < 0) {
2899         return ret;
2900     }
2901
2902     return done;
2903 }
2904
2905 /**
2906  * ram_save_complete: function called to send the remaining amount of ram
2907  *
2908  * Returns zero to indicate success or negative on error
2909  *
2910  * Called with iothread lock
2911  *
2912  * @f: QEMUFile where to send the data
2913  * @opaque: RAMState pointer
2914  */
2915 static int ram_save_complete(QEMUFile *f, void *opaque)
2916 {
2917     RAMState **temp = opaque;
2918     RAMState *rs = *temp;
2919     int ret = 0;
2920
2921     WITH_RCU_READ_LOCK_GUARD() {
2922         if (!migration_in_postcopy()) {
2923             migration_bitmap_sync_precopy(rs);
2924         }
2925
2926         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2927
2928         /* try transferring iterative blocks of memory */
2929
2930         /* flush all remaining blocks regardless of rate limiting */
2931         while (true) {
2932             int pages;
2933
2934             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2935             /* no more blocks to sent */
2936             if (pages == 0) {
2937                 break;
2938             }
2939             if (pages < 0) {
2940                 ret = pages;
2941                 break;
2942             }
2943         }
2944
2945         flush_compressed_data(rs);
2946         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2947     }
2948
2949     if (ret >= 0) {
2950         multifd_send_sync_main(rs->f);
2951         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2952         qemu_fflush(f);
2953     }
2954
2955     return ret;
2956 }
2957
2958 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2959                              uint64_t *res_precopy_only,
2960                              uint64_t *res_compatible,
2961                              uint64_t *res_postcopy_only)
2962 {
2963     RAMState **temp = opaque;
2964     RAMState *rs = *temp;
2965     uint64_t remaining_size;
2966
2967     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2968
2969     if (!migration_in_postcopy() &&
2970         remaining_size < max_size) {
2971         qemu_mutex_lock_iothread();
2972         WITH_RCU_READ_LOCK_GUARD() {
2973             migration_bitmap_sync_precopy(rs);
2974         }
2975         qemu_mutex_unlock_iothread();
2976         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2977     }
2978
2979     if (migrate_postcopy_ram()) {
2980         /* We can do postcopy, and all the data is postcopiable */
2981         *res_compatible += remaining_size;
2982     } else {
2983         *res_precopy_only += remaining_size;
2984     }
2985 }
2986
2987 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2988 {
2989     unsigned int xh_len;
2990     int xh_flags;
2991     uint8_t *loaded_data;
2992
2993     /* extract RLE header */
2994     xh_flags = qemu_get_byte(f);
2995     xh_len = qemu_get_be16(f);
2996
2997     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2998         error_report("Failed to load XBZRLE page - wrong compression!");
2999         return -1;
3000     }
3001
3002     if (xh_len > TARGET_PAGE_SIZE) {
3003         error_report("Failed to load XBZRLE page - len overflow!");
3004         return -1;
3005     }
3006     loaded_data = XBZRLE.decoded_buf;
3007     /* load data and decode */
3008     /* it can change loaded_data to point to an internal buffer */
3009     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3010
3011     /* decode RLE */
3012     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3013                              TARGET_PAGE_SIZE) == -1) {
3014         error_report("Failed to load XBZRLE page - decode error!");
3015         return -1;
3016     }
3017
3018     return 0;
3019 }
3020
3021 /**
3022  * ram_block_from_stream: read a RAMBlock id from the migration stream
3023  *
3024  * Must be called from within a rcu critical section.
3025  *
3026  * Returns a pointer from within the RCU-protected ram_list.
3027  *
3028  * @f: QEMUFile where to read the data from
3029  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3030  */
3031 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3032 {
3033     static RAMBlock *block;
3034     char id[256];
3035     uint8_t len;
3036
3037     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3038         if (!block) {
3039             error_report("Ack, bad migration stream!");
3040             return NULL;
3041         }
3042         return block;
3043     }
3044
3045     len = qemu_get_byte(f);
3046     qemu_get_buffer(f, (uint8_t *)id, len);
3047     id[len] = 0;
3048
3049     block = qemu_ram_block_by_name(id);
3050     if (!block) {
3051         error_report("Can't find block %s", id);
3052         return NULL;
3053     }
3054
3055     if (ramblock_is_ignored(block)) {
3056         error_report("block %s should not be migrated !", id);
3057         return NULL;
3058     }
3059
3060     return block;
3061 }
3062
3063 static inline void *host_from_ram_block_offset(RAMBlock *block,
3064                                                ram_addr_t offset)
3065 {
3066     if (!offset_in_ramblock(block, offset)) {
3067         return NULL;
3068     }
3069
3070     return block->host + offset;
3071 }
3072
3073 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3074                              ram_addr_t offset, bool record_bitmap)
3075 {
3076     if (!offset_in_ramblock(block, offset)) {
3077         return NULL;
3078     }
3079     if (!block->colo_cache) {
3080         error_report("%s: colo_cache is NULL in block :%s",
3081                      __func__, block->idstr);
3082         return NULL;
3083     }
3084
3085     /*
3086     * During colo checkpoint, we need bitmap of these migrated pages.
3087     * It help us to decide which pages in ram cache should be flushed
3088     * into VM's RAM later.
3089     */
3090     if (record_bitmap &&
3091         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3092         ram_state->migration_dirty_pages++;
3093     }
3094     return block->colo_cache + offset;
3095 }
3096
3097 /**
3098  * ram_handle_compressed: handle the zero page case
3099  *
3100  * If a page (or a whole RDMA chunk) has been
3101  * determined to be zero, then zap it.
3102  *
3103  * @host: host address for the zero page
3104  * @ch: what the page is filled from.  We only support zero
3105  * @size: size of the zero page
3106  */
3107 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3108 {
3109     if (ch != 0 || !is_zero_range(host, size)) {
3110         memset(host, ch, size);
3111     }
3112 }
3113
3114 /* return the size after decompression, or negative value on error */
3115 static int
3116 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3117                      const uint8_t *source, size_t source_len)
3118 {
3119     int err;
3120
3121     err = inflateReset(stream);
3122     if (err != Z_OK) {
3123         return -1;
3124     }
3125
3126     stream->avail_in = source_len;
3127     stream->next_in = (uint8_t *)source;
3128     stream->avail_out = dest_len;
3129     stream->next_out = dest;
3130
3131     err = inflate(stream, Z_NO_FLUSH);
3132     if (err != Z_STREAM_END) {
3133         return -1;
3134     }
3135
3136     return stream->total_out;
3137 }
3138
3139 static void *do_data_decompress(void *opaque)
3140 {
3141     DecompressParam *param = opaque;
3142     unsigned long pagesize;
3143     uint8_t *des;
3144     int len, ret;
3145
3146     qemu_mutex_lock(&param->mutex);
3147     while (!param->quit) {
3148         if (param->des) {
3149             des = param->des;
3150             len = param->len;
3151             param->des = 0;
3152             qemu_mutex_unlock(&param->mutex);
3153
3154             pagesize = TARGET_PAGE_SIZE;
3155
3156             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3157                                        param->compbuf, len);
3158             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3159                 error_report("decompress data failed");
3160                 qemu_file_set_error(decomp_file, ret);
3161             }
3162
3163             qemu_mutex_lock(&decomp_done_lock);
3164             param->done = true;
3165             qemu_cond_signal(&decomp_done_cond);
3166             qemu_mutex_unlock(&decomp_done_lock);
3167
3168             qemu_mutex_lock(&param->mutex);
3169         } else {
3170             qemu_cond_wait(&param->cond, &param->mutex);
3171         }
3172     }
3173     qemu_mutex_unlock(&param->mutex);
3174
3175     return NULL;
3176 }
3177
3178 static int wait_for_decompress_done(void)
3179 {
3180     int idx, thread_count;
3181
3182     if (!migrate_use_compression()) {
3183         return 0;
3184     }
3185
3186     thread_count = migrate_decompress_threads();
3187     qemu_mutex_lock(&decomp_done_lock);
3188     for (idx = 0; idx < thread_count; idx++) {
3189         while (!decomp_param[idx].done) {
3190             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3191         }
3192     }
3193     qemu_mutex_unlock(&decomp_done_lock);
3194     return qemu_file_get_error(decomp_file);
3195 }
3196
3197 static void compress_threads_load_cleanup(void)
3198 {
3199     int i, thread_count;
3200
3201     if (!migrate_use_compression()) {
3202         return;
3203     }
3204     thread_count = migrate_decompress_threads();
3205     for (i = 0; i < thread_count; i++) {
3206         /*
3207          * we use it as a indicator which shows if the thread is
3208          * properly init'd or not
3209          */
3210         if (!decomp_param[i].compbuf) {
3211             break;
3212         }
3213
3214         qemu_mutex_lock(&decomp_param[i].mutex);
3215         decomp_param[i].quit = true;
3216         qemu_cond_signal(&decomp_param[i].cond);
3217         qemu_mutex_unlock(&decomp_param[i].mutex);
3218     }
3219     for (i = 0; i < thread_count; i++) {
3220         if (!decomp_param[i].compbuf) {
3221             break;
3222         }
3223
3224         qemu_thread_join(decompress_threads + i);
3225         qemu_mutex_destroy(&decomp_param[i].mutex);
3226         qemu_cond_destroy(&decomp_param[i].cond);
3227         inflateEnd(&decomp_param[i].stream);
3228         g_free(decomp_param[i].compbuf);
3229         decomp_param[i].compbuf = NULL;
3230     }
3231     g_free(decompress_threads);
3232     g_free(decomp_param);
3233     decompress_threads = NULL;
3234     decomp_param = NULL;
3235     decomp_file = NULL;
3236 }
3237
3238 static int compress_threads_load_setup(QEMUFile *f)
3239 {
3240     int i, thread_count;
3241
3242     if (!migrate_use_compression()) {
3243         return 0;
3244     }
3245
3246     thread_count = migrate_decompress_threads();
3247     decompress_threads = g_new0(QemuThread, thread_count);
3248     decomp_param = g_new0(DecompressParam, thread_count);
3249     qemu_mutex_init(&decomp_done_lock);
3250     qemu_cond_init(&decomp_done_cond);
3251     decomp_file = f;
3252     for (i = 0; i < thread_count; i++) {
3253         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3254             goto exit;
3255         }
3256
3257         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3258         qemu_mutex_init(&decomp_param[i].mutex);
3259         qemu_cond_init(&decomp_param[i].cond);
3260         decomp_param[i].done = true;
3261         decomp_param[i].quit = false;
3262         qemu_thread_create(decompress_threads + i, "decompress",
3263                            do_data_decompress, decomp_param + i,
3264                            QEMU_THREAD_JOINABLE);
3265     }
3266     return 0;
3267 exit:
3268     compress_threads_load_cleanup();
3269     return -1;
3270 }
3271
3272 static void decompress_data_with_multi_threads(QEMUFile *f,
3273                                                void *host, int len)
3274 {
3275     int idx, thread_count;
3276
3277     thread_count = migrate_decompress_threads();
3278     qemu_mutex_lock(&decomp_done_lock);
3279     while (true) {
3280         for (idx = 0; idx < thread_count; idx++) {
3281             if (decomp_param[idx].done) {
3282                 decomp_param[idx].done = false;
3283                 qemu_mutex_lock(&decomp_param[idx].mutex);
3284                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3285                 decomp_param[idx].des = host;
3286                 decomp_param[idx].len = len;
3287                 qemu_cond_signal(&decomp_param[idx].cond);
3288                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3289                 break;
3290             }
3291         }
3292         if (idx < thread_count) {
3293             break;
3294         } else {
3295             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3296         }
3297     }
3298     qemu_mutex_unlock(&decomp_done_lock);
3299 }
3300
3301  /*
3302   * we must set ram_bulk_stage to false, otherwise in
3303   * migation_bitmap_find_dirty the bitmap will be unused and
3304   * all the pages in ram cache wil be flushed to the ram of
3305   * secondary VM.
3306   */
3307 static void colo_init_ram_state(void)
3308 {
3309     ram_state_init(&ram_state);
3310     ram_state->ram_bulk_stage = false;
3311 }
3312
3313 /*
3314  * colo cache: this is for secondary VM, we cache the whole
3315  * memory of the secondary VM, it is need to hold the global lock
3316  * to call this helper.
3317  */
3318 int colo_init_ram_cache(void)
3319 {
3320     RAMBlock *block;
3321
3322     WITH_RCU_READ_LOCK_GUARD() {
3323         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3324             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3325                                                     NULL,
3326                                                     false);
3327             if (!block->colo_cache) {
3328                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3329                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3330                              block->used_length);
3331                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3332                     if (block->colo_cache) {
3333                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3334                         block->colo_cache = NULL;
3335                     }
3336                 }
3337                 return -errno;
3338             }
3339         }
3340     }
3341
3342     /*
3343     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3344     * with to decide which page in cache should be flushed into SVM's RAM. Here
3345     * we use the same name 'ram_bitmap' as for migration.
3346     */
3347     if (ram_bytes_total()) {
3348         RAMBlock *block;
3349
3350         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3351             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3352             block->bmap = bitmap_new(pages);
3353         }
3354     }
3355
3356     colo_init_ram_state();
3357     return 0;
3358 }
3359
3360 /* TODO: duplicated with ram_init_bitmaps */
3361 void colo_incoming_start_dirty_log(void)
3362 {
3363     RAMBlock *block = NULL;
3364     /* For memory_global_dirty_log_start below. */
3365     qemu_mutex_lock_iothread();
3366     qemu_mutex_lock_ramlist();
3367
3368     memory_global_dirty_log_sync();
3369     WITH_RCU_READ_LOCK_GUARD() {
3370         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371             ramblock_sync_dirty_bitmap(ram_state, block);
3372             /* Discard this dirty bitmap record */
3373             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3374         }
3375         memory_global_dirty_log_start();
3376     }
3377     ram_state->migration_dirty_pages = 0;
3378     qemu_mutex_unlock_ramlist();
3379     qemu_mutex_unlock_iothread();
3380 }
3381
3382 /* It is need to hold the global lock to call this helper */
3383 void colo_release_ram_cache(void)
3384 {
3385     RAMBlock *block;
3386
3387     memory_global_dirty_log_stop();
3388     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3389         g_free(block->bmap);
3390         block->bmap = NULL;
3391     }
3392
3393     WITH_RCU_READ_LOCK_GUARD() {
3394         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3395             if (block->colo_cache) {
3396                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3397                 block->colo_cache = NULL;
3398             }
3399         }
3400     }
3401     ram_state_cleanup(&ram_state);
3402 }
3403
3404 /**
3405  * ram_load_setup: Setup RAM for migration incoming side
3406  *
3407  * Returns zero to indicate success and negative for error
3408  *
3409  * @f: QEMUFile where to receive the data
3410  * @opaque: RAMState pointer
3411  */
3412 static int ram_load_setup(QEMUFile *f, void *opaque)
3413 {
3414     if (compress_threads_load_setup(f)) {
3415         return -1;
3416     }
3417
3418     xbzrle_load_setup();
3419     ramblock_recv_map_init();
3420
3421     return 0;
3422 }
3423
3424 static int ram_load_cleanup(void *opaque)
3425 {
3426     RAMBlock *rb;
3427
3428     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3429         qemu_ram_block_writeback(rb);
3430     }
3431
3432     xbzrle_load_cleanup();
3433     compress_threads_load_cleanup();
3434
3435     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3436         g_free(rb->receivedmap);
3437         rb->receivedmap = NULL;
3438     }
3439
3440     return 0;
3441 }
3442
3443 /**
3444  * ram_postcopy_incoming_init: allocate postcopy data structures
3445  *
3446  * Returns 0 for success and negative if there was one error
3447  *
3448  * @mis: current migration incoming state
3449  *
3450  * Allocate data structures etc needed by incoming migration with
3451  * postcopy-ram. postcopy-ram's similarly names
3452  * postcopy_ram_incoming_init does the work.
3453  */
3454 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3455 {
3456     return postcopy_ram_incoming_init(mis);
3457 }
3458
3459 /**
3460  * ram_load_postcopy: load a page in postcopy case
3461  *
3462  * Returns 0 for success or -errno in case of error
3463  *
3464  * Called in postcopy mode by ram_load().
3465  * rcu_read_lock is taken prior to this being called.
3466  *
3467  * @f: QEMUFile where to send the data
3468  */
3469 static int ram_load_postcopy(QEMUFile *f)
3470 {
3471     int flags = 0, ret = 0;
3472     bool place_needed = false;
3473     bool matches_target_page_size = false;
3474     MigrationIncomingState *mis = migration_incoming_get_current();
3475     /* Temporary page that is later 'placed' */
3476     void *postcopy_host_page = mis->postcopy_tmp_page;
3477     void *this_host = NULL;
3478     bool all_zero = true;
3479     int target_pages = 0;
3480
3481     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3482         ram_addr_t addr;
3483         void *host = NULL;
3484         void *page_buffer = NULL;
3485         void *place_source = NULL;
3486         RAMBlock *block = NULL;
3487         uint8_t ch;
3488         int len;
3489
3490         addr = qemu_get_be64(f);
3491
3492         /*
3493          * If qemu file error, we should stop here, and then "addr"
3494          * may be invalid
3495          */
3496         ret = qemu_file_get_error(f);
3497         if (ret) {
3498             break;
3499         }
3500
3501         flags = addr & ~TARGET_PAGE_MASK;
3502         addr &= TARGET_PAGE_MASK;
3503
3504         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3505         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3506                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3507             block = ram_block_from_stream(f, flags);
3508
3509             host = host_from_ram_block_offset(block, addr);
3510             if (!host) {
3511                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3512                 ret = -EINVAL;
3513                 break;
3514             }
3515             target_pages++;
3516             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3517             /*
3518              * Postcopy requires that we place whole host pages atomically;
3519              * these may be huge pages for RAMBlocks that are backed by
3520              * hugetlbfs.
3521              * To make it atomic, the data is read into a temporary page
3522              * that's moved into place later.
3523              * The migration protocol uses,  possibly smaller, target-pages
3524              * however the source ensures it always sends all the components
3525              * of a host page in one chunk.
3526              */
3527             page_buffer = postcopy_host_page +
3528                           ((uintptr_t)host & (block->page_size - 1));
3529             if (target_pages == 1) {
3530                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3531                                                     block->page_size);
3532             } else {
3533                 /* not the 1st TP within the HP */
3534                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3535                     (uintptr_t)this_host) {
3536                     error_report("Non-same host page %p/%p",
3537                                   host, this_host);
3538                     ret = -EINVAL;
3539                     break;
3540                 }
3541             }
3542
3543             /*
3544              * If it's the last part of a host page then we place the host
3545              * page
3546              */
3547             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3548                 place_needed = true;
3549             }
3550             place_source = postcopy_host_page;
3551         }
3552
3553         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3554         case RAM_SAVE_FLAG_ZERO:
3555             ch = qemu_get_byte(f);
3556             /*
3557              * Can skip to set page_buffer when
3558              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3559              */
3560             if (ch || !matches_target_page_size) {
3561                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3562             }
3563             if (ch) {
3564                 all_zero = false;
3565             }
3566             break;
3567
3568         case RAM_SAVE_FLAG_PAGE:
3569             all_zero = false;
3570             if (!matches_target_page_size) {
3571                 /* For huge pages, we always use temporary buffer */
3572                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3573             } else {
3574                 /*
3575                  * For small pages that matches target page size, we
3576                  * avoid the qemu_file copy.  Instead we directly use
3577                  * the buffer of QEMUFile to place the page.  Note: we
3578                  * cannot do any QEMUFile operation before using that
3579                  * buffer to make sure the buffer is valid when
3580                  * placing the page.
3581                  */
3582                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3583                                          TARGET_PAGE_SIZE);
3584             }
3585             break;
3586         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3587             all_zero = false;
3588             len = qemu_get_be32(f);
3589             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3590                 error_report("Invalid compressed data length: %d", len);
3591                 ret = -EINVAL;
3592                 break;
3593             }
3594             decompress_data_with_multi_threads(f, page_buffer, len);
3595             break;
3596
3597         case RAM_SAVE_FLAG_EOS:
3598             /* normal exit */
3599             multifd_recv_sync_main();
3600             break;
3601         default:
3602             error_report("Unknown combination of migration flags: 0x%x"
3603                          " (postcopy mode)", flags);
3604             ret = -EINVAL;
3605             break;
3606         }
3607
3608         /* Got the whole host page, wait for decompress before placing. */
3609         if (place_needed) {
3610             ret |= wait_for_decompress_done();
3611         }
3612
3613         /* Detect for any possible file errors */
3614         if (!ret && qemu_file_get_error(f)) {
3615             ret = qemu_file_get_error(f);
3616         }
3617
3618         if (!ret && place_needed) {
3619             /* This gets called at the last target page in the host page */
3620             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3621                                                        block->page_size);
3622
3623             if (all_zero) {
3624                 ret = postcopy_place_page_zero(mis, place_dest,
3625                                                block);
3626             } else {
3627                 ret = postcopy_place_page(mis, place_dest,
3628                                           place_source, block);
3629             }
3630             place_needed = false;
3631             target_pages = 0;
3632             /* Assume we have a zero page until we detect something different */
3633             all_zero = true;
3634         }
3635     }
3636
3637     return ret;
3638 }
3639
3640 static bool postcopy_is_advised(void)
3641 {
3642     PostcopyState ps = postcopy_state_get();
3643     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3644 }
3645
3646 static bool postcopy_is_running(void)
3647 {
3648     PostcopyState ps = postcopy_state_get();
3649     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3650 }
3651
3652 /*
3653  * Flush content of RAM cache into SVM's memory.
3654  * Only flush the pages that be dirtied by PVM or SVM or both.
3655  */
3656 void colo_flush_ram_cache(void)
3657 {
3658     RAMBlock *block = NULL;
3659     void *dst_host;
3660     void *src_host;
3661     unsigned long offset = 0;
3662
3663     memory_global_dirty_log_sync();
3664     WITH_RCU_READ_LOCK_GUARD() {
3665         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3666             ramblock_sync_dirty_bitmap(ram_state, block);
3667         }
3668     }
3669
3670     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3671     WITH_RCU_READ_LOCK_GUARD() {
3672         block = QLIST_FIRST_RCU(&ram_list.blocks);
3673
3674         while (block) {
3675             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3676
3677             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3678                 >= block->used_length) {
3679                 offset = 0;
3680                 block = QLIST_NEXT_RCU(block, next);
3681             } else {
3682                 migration_bitmap_clear_dirty(ram_state, block, offset);
3683                 dst_host = block->host
3684                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3685                 src_host = block->colo_cache
3686                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3687                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3688             }
3689         }
3690     }
3691     trace_colo_flush_ram_cache_end();
3692 }
3693
3694 /**
3695  * ram_load_precopy: load pages in precopy case
3696  *
3697  * Returns 0 for success or -errno in case of error
3698  *
3699  * Called in precopy mode by ram_load().
3700  * rcu_read_lock is taken prior to this being called.
3701  *
3702  * @f: QEMUFile where to send the data
3703  */
3704 static int ram_load_precopy(QEMUFile *f)
3705 {
3706     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3707     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3708     bool postcopy_advised = postcopy_is_advised();
3709     if (!migrate_use_compression()) {
3710         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3711     }
3712
3713     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3714         ram_addr_t addr, total_ram_bytes;
3715         void *host = NULL, *host_bak = NULL;
3716         uint8_t ch;
3717
3718         /*
3719          * Yield periodically to let main loop run, but an iteration of
3720          * the main loop is expensive, so do it each some iterations
3721          */
3722         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3723             aio_co_schedule(qemu_get_current_aio_context(),
3724                             qemu_coroutine_self());
3725             qemu_coroutine_yield();
3726         }
3727         i++;
3728
3729         addr = qemu_get_be64(f);
3730         flags = addr & ~TARGET_PAGE_MASK;
3731         addr &= TARGET_PAGE_MASK;
3732
3733         if (flags & invalid_flags) {
3734             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3735                 error_report("Received an unexpected compressed page");
3736             }
3737
3738             ret = -EINVAL;
3739             break;
3740         }
3741
3742         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3743                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3744             RAMBlock *block = ram_block_from_stream(f, flags);
3745
3746             host = host_from_ram_block_offset(block, addr);
3747             /*
3748              * After going into COLO stage, we should not load the page
3749              * into SVM's memory directly, we put them into colo_cache firstly.
3750              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3751              * Previously, we copied all these memory in preparing stage of COLO
3752              * while we need to stop VM, which is a time-consuming process.
3753              * Here we optimize it by a trick, back-up every page while in
3754              * migration process while COLO is enabled, though it affects the
3755              * speed of the migration, but it obviously reduce the downtime of
3756              * back-up all SVM'S memory in COLO preparing stage.
3757              */
3758             if (migration_incoming_colo_enabled()) {
3759                 if (migration_incoming_in_colo_state()) {
3760                     /* In COLO stage, put all pages into cache temporarily */
3761                     host = colo_cache_from_block_offset(block, addr, true);
3762                 } else {
3763                    /*
3764                     * In migration stage but before COLO stage,
3765                     * Put all pages into both cache and SVM's memory.
3766                     */
3767                     host_bak = colo_cache_from_block_offset(block, addr, false);
3768                 }
3769             }
3770             if (!host) {
3771                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3772                 ret = -EINVAL;
3773                 break;
3774             }
3775             if (!migration_incoming_in_colo_state()) {
3776                 ramblock_recv_bitmap_set(block, host);
3777             }
3778
3779             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3780         }
3781
3782         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3783         case RAM_SAVE_FLAG_MEM_SIZE:
3784             /* Synchronize RAM block list */
3785             total_ram_bytes = addr;
3786             while (!ret && total_ram_bytes) {
3787                 RAMBlock *block;
3788                 char id[256];
3789                 ram_addr_t length;
3790
3791                 len = qemu_get_byte(f);
3792                 qemu_get_buffer(f, (uint8_t *)id, len);
3793                 id[len] = 0;
3794                 length = qemu_get_be64(f);
3795
3796                 block = qemu_ram_block_by_name(id);
3797                 if (block && !qemu_ram_is_migratable(block)) {
3798                     error_report("block %s should not be migrated !", id);
3799                     ret = -EINVAL;
3800                 } else if (block) {
3801                     if (length != block->used_length) {
3802                         Error *local_err = NULL;
3803
3804                         ret = qemu_ram_resize(block, length,
3805                                               &local_err);
3806                         if (local_err) {
3807                             error_report_err(local_err);
3808                         }
3809                     }
3810                     /* For postcopy we need to check hugepage sizes match */
3811                     if (postcopy_advised && migrate_postcopy_ram() &&
3812                         block->page_size != qemu_host_page_size) {
3813                         uint64_t remote_page_size = qemu_get_be64(f);
3814                         if (remote_page_size != block->page_size) {
3815                             error_report("Mismatched RAM page size %s "
3816                                          "(local) %zd != %" PRId64,
3817                                          id, block->page_size,
3818                                          remote_page_size);
3819                             ret = -EINVAL;
3820                         }
3821                     }
3822                     if (migrate_ignore_shared()) {
3823                         hwaddr addr = qemu_get_be64(f);
3824                         if (ramblock_is_ignored(block) &&
3825                             block->mr->addr != addr) {
3826                             error_report("Mismatched GPAs for block %s "
3827                                          "%" PRId64 "!= %" PRId64,
3828                                          id, (uint64_t)addr,
3829                                          (uint64_t)block->mr->addr);
3830                             ret = -EINVAL;
3831                         }
3832                     }
3833                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3834                                           block->idstr);
3835                 } else {
3836                     error_report("Unknown ramblock \"%s\", cannot "
3837                                  "accept migration", id);
3838                     ret = -EINVAL;
3839                 }
3840
3841                 total_ram_bytes -= length;
3842             }
3843             break;
3844
3845         case RAM_SAVE_FLAG_ZERO:
3846             ch = qemu_get_byte(f);
3847             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3848             break;
3849
3850         case RAM_SAVE_FLAG_PAGE:
3851             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3852             break;
3853
3854         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3855             len = qemu_get_be32(f);
3856             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3857                 error_report("Invalid compressed data length: %d", len);
3858                 ret = -EINVAL;
3859                 break;
3860             }
3861             decompress_data_with_multi_threads(f, host, len);
3862             break;
3863
3864         case RAM_SAVE_FLAG_XBZRLE:
3865             if (load_xbzrle(f, addr, host) < 0) {
3866                 error_report("Failed to decompress XBZRLE page at "
3867                              RAM_ADDR_FMT, addr);
3868                 ret = -EINVAL;
3869                 break;
3870             }
3871             break;
3872         case RAM_SAVE_FLAG_EOS:
3873             /* normal exit */
3874             multifd_recv_sync_main();
3875             break;
3876         default:
3877             if (flags & RAM_SAVE_FLAG_HOOK) {
3878                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3879             } else {
3880                 error_report("Unknown combination of migration flags: 0x%x",
3881                              flags);
3882                 ret = -EINVAL;
3883             }
3884         }
3885         if (!ret) {
3886             ret = qemu_file_get_error(f);
3887         }
3888         if (!ret && host_bak) {
3889             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3890         }
3891     }
3892
3893     ret |= wait_for_decompress_done();
3894     return ret;
3895 }
3896
3897 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3898 {
3899     int ret = 0;
3900     static uint64_t seq_iter;
3901     /*
3902      * If system is running in postcopy mode, page inserts to host memory must
3903      * be atomic
3904      */
3905     bool postcopy_running = postcopy_is_running();
3906
3907     seq_iter++;
3908
3909     if (version_id != 4) {
3910         return -EINVAL;
3911     }
3912
3913     /*
3914      * This RCU critical section can be very long running.
3915      * When RCU reclaims in the code start to become numerous,
3916      * it will be necessary to reduce the granularity of this
3917      * critical section.
3918      */
3919     WITH_RCU_READ_LOCK_GUARD() {
3920         if (postcopy_running) {
3921             ret = ram_load_postcopy(f);
3922         } else {
3923             ret = ram_load_precopy(f);
3924         }
3925     }
3926     trace_ram_load_complete(ret, seq_iter);
3927
3928     return ret;
3929 }
3930
3931 static bool ram_has_postcopy(void *opaque)
3932 {
3933     RAMBlock *rb;
3934     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3935         if (ramblock_is_pmem(rb)) {
3936             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3937                          "is not supported now!", rb->idstr, rb->host);
3938             return false;
3939         }
3940     }
3941
3942     return migrate_postcopy_ram();
3943 }
3944
3945 /* Sync all the dirty bitmap with destination VM.  */
3946 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3947 {
3948     RAMBlock *block;
3949     QEMUFile *file = s->to_dst_file;
3950     int ramblock_count = 0;
3951
3952     trace_ram_dirty_bitmap_sync_start();
3953
3954     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3955         qemu_savevm_send_recv_bitmap(file, block->idstr);
3956         trace_ram_dirty_bitmap_request(block->idstr);
3957         ramblock_count++;
3958     }
3959
3960     trace_ram_dirty_bitmap_sync_wait();
3961
3962     /* Wait until all the ramblocks' dirty bitmap synced */
3963     while (ramblock_count--) {
3964         qemu_sem_wait(&s->rp_state.rp_sem);
3965     }
3966
3967     trace_ram_dirty_bitmap_sync_complete();
3968
3969     return 0;
3970 }
3971
3972 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3973 {
3974     qemu_sem_post(&s->rp_state.rp_sem);
3975 }
3976
3977 /*
3978  * Read the received bitmap, revert it as the initial dirty bitmap.
3979  * This is only used when the postcopy migration is paused but wants
3980  * to resume from a middle point.
3981  */
3982 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3983 {
3984     int ret = -EINVAL;
3985     QEMUFile *file = s->rp_state.from_dst_file;
3986     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3987     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3988     uint64_t size, end_mark;
3989
3990     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3991
3992     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3993         error_report("%s: incorrect state %s", __func__,
3994                      MigrationStatus_str(s->state));
3995         return -EINVAL;
3996     }
3997
3998     /*
3999      * Note: see comments in ramblock_recv_bitmap_send() on why we
4000      * need the endianness conversion, and the paddings.
4001      */
4002     local_size = ROUND_UP(local_size, 8);
4003
4004     /* Add paddings */
4005     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4006
4007     size = qemu_get_be64(file);
4008
4009     /* The size of the bitmap should match with our ramblock */
4010     if (size != local_size) {
4011         error_report("%s: ramblock '%s' bitmap size mismatch "
4012                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4013                      block->idstr, size, local_size);
4014         ret = -EINVAL;
4015         goto out;
4016     }
4017
4018     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4019     end_mark = qemu_get_be64(file);
4020
4021     ret = qemu_file_get_error(file);
4022     if (ret || size != local_size) {
4023         error_report("%s: read bitmap failed for ramblock '%s': %d"
4024                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4025                      __func__, block->idstr, ret, local_size, size);
4026         ret = -EIO;
4027         goto out;
4028     }
4029
4030     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4031         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4032                      __func__, block->idstr, end_mark);
4033         ret = -EINVAL;
4034         goto out;
4035     }
4036
4037     /*
4038      * Endianness conversion. We are during postcopy (though paused).
4039      * The dirty bitmap won't change. We can directly modify it.
4040      */
4041     bitmap_from_le(block->bmap, le_bitmap, nbits);
4042
4043     /*
4044      * What we received is "received bitmap". Revert it as the initial
4045      * dirty bitmap for this ramblock.
4046      */
4047     bitmap_complement(block->bmap, block->bmap, nbits);
4048
4049     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4050
4051     /*
4052      * We succeeded to sync bitmap for current ramblock. If this is
4053      * the last one to sync, we need to notify the main send thread.
4054      */
4055     ram_dirty_bitmap_reload_notify(s);
4056
4057     ret = 0;
4058 out:
4059     g_free(le_bitmap);
4060     return ret;
4061 }
4062
4063 static int ram_resume_prepare(MigrationState *s, void *opaque)
4064 {
4065     RAMState *rs = *(RAMState **)opaque;
4066     int ret;
4067
4068     ret = ram_dirty_bitmap_sync_all(s, rs);
4069     if (ret) {
4070         return ret;
4071     }
4072
4073     ram_state_resume_prepare(rs, s->to_dst_file);
4074
4075     return 0;
4076 }
4077
4078 static SaveVMHandlers savevm_ram_handlers = {
4079     .save_setup = ram_save_setup,
4080     .save_live_iterate = ram_save_iterate,
4081     .save_live_complete_postcopy = ram_save_complete,
4082     .save_live_complete_precopy = ram_save_complete,
4083     .has_postcopy = ram_has_postcopy,
4084     .save_live_pending = ram_save_pending,
4085     .load_state = ram_load,
4086     .save_cleanup = ram_save_cleanup,
4087     .load_setup = ram_load_setup,
4088     .load_cleanup = ram_load_cleanup,
4089     .resume_prepare = ram_resume_prepare,
4090 };
4091
4092 void ram_mig_init(void)
4093 {
4094     qemu_mutex_init(&XBZRLE.lock);
4095     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4096 }