migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  67
/*
 * Flag bits carried in the low bits of the 64-bit offset word that
 * save_page_header() writes in front of each page record.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
/* Formerly RAM_SAVE_FLAG_COMPRESS; now only denotes the zero value */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Set by save_page_header() when the block equals the last sent block,
 * in which case the block id string is not repeated on the wire */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* OR'ed into the offset by save_xbzrle_page() for XBZRLE encoded pages */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  77
  78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  79 {
  80     return buffer_is_zero(p, size);
  81 }
  82
/*
 * XBZRLE statistics.  Within this file the cache_miss, pages, overflow,
 * bytes, cache_miss_rate and encoding_rate fields are updated.
 */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding: xbzrle_encode_buffer() writes the
     * encoded page here before it is put on the wire */
    uint8_t *encoded_buf;
    /* buffer for storing page content: save_xbzrle_page() copies the
     * page here before encoding it */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock.  Replaced as a whole by
     * xbzrle_cache_resize(). */
    PageCache *cache;
    /* Taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock(), which
     * only touch it when migrate_use_xbzrle() is true */
    QemuMutex lock;
    /* it will store a page full of zeros; xbzrle_cache_zero_page()
     * inserts it into the cache */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 100
 101 static void XBZRLE_cache_lock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_lock(&XBZRLE.lock);
 105 }
 106
 107 static void XBZRLE_cache_unlock(void)
 108 {
 109     if (migrate_use_xbzrle())
 110         qemu_mutex_unlock(&XBZRLE.lock);
 111 }
 112
 113 /**
 114  * xbzrle_cache_resize: resize the xbzrle cache
 115  *
 116  * This function is called from qmp_migrate_set_cache_size in main
 117  * thread, possibly while a migration is in progress.  A running
 118  * migration may be using the cache and might finish during this call,
 119  * hence changes to the cache are protected by XBZRLE.lock().
 120  *
 121  * Returns 0 for success or -1 for error
 122  *
 123  * @new_size: new cache size
 124  * @errp: set *errp if the check failed, with reason
 125  */
 126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127 {
 128     PageCache *new_cache;
 129     int64_t ret = 0;
 130
 131     /* Check for truncation */
 132     if (new_size != (size_t)new_size) {
 133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                    "exceeding address space");
 135         return -1;
 136     }
 137
 138     if (new_size == migrate_xbzrle_cache_size()) {
 139         /* nothing to do */
 140         return 0;
 141     }
 142
 143     XBZRLE_cache_lock();
 144
 145     if (XBZRLE.cache != NULL) {
 146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147         if (!new_cache) {
 148             ret = -1;
 149             goto out;
 150         }
 151
 152         cache_fini(XBZRLE.cache);
 153         XBZRLE.cache = new_cache;
 154     }
 155 out:
 156     XBZRLE_cache_unlock();
 157     return ret;
 158 }
 159
 160 static bool ramblock_is_ignored(RAMBlock *block)
 161 {
 162     return !qemu_ram_is_migratable(block) ||
 163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 164 }
 165
/*
 * Iterators over the RAMBlock list, built on INTERNAL_RAMBLOCK_FOREACH.
 * Should be holding either ram_list.mutex, or the RCU lock.
 *
 * The trailing "if (...) {} else" makes the statement that follows the
 * macro the loop body, executed only for blocks that pass the filter.
 */

/* Visit every block for which ramblock_is_ignored() is false */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Visit every block for which qemu_ram_is_migratable() is true */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/* Drop any unfiltered RAMBLOCK_FOREACH macro for the rest of this file */
#undef RAMBLOCK_FOREACH
 176
 177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 178 {
 179     RAMBlock *block;
 180     int ret = 0;
 181
 182     RCU_READ_LOCK_GUARD();
 183
 184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 185         ret = func(block, opaque);
 186         if (ret) {
 187             break;
 188         }
 189     }
 190     return ret;
 191 }
 192
 193 static void ramblock_recv_map_init(void)
 194 {
 195     RAMBlock *rb;
 196
 197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 198         assert(!rb->receivedmap);
 199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 200     }
 201 }
 202
 203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 204 {
 205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 206                     rb->receivedmap);
 207 }
 208
 209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 210 {
 211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 212 }
 213
 214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 215 {
 216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 217 }
 218
 219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 220                                     size_t nr)
 221 {
 222     bitmap_set_atomic(rb->receivedmap,
 223                       ramblock_recv_bitmap_offset(host_addr, rb),
 224                       nr);
 225 }
 226
 227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 228
 229 /*
 230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 231  *
 232  * Returns >0 if success with sent bytes, or <0 if error.
 233  */
 234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 235                                   const char *block_name)
 236 {
 237     RAMBlock *block = qemu_ram_block_by_name(block_name);
 238     unsigned long *le_bitmap, nbits;
 239     uint64_t size;
 240
 241     if (!block) {
 242         error_report("%s: invalid block name: %s", __func__, block_name);
 243         return -1;
 244     }
 245
 246     nbits = block->used_length >> TARGET_PAGE_BITS;
 247
 248     /*
 249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 250      * machines we may need 4 more bytes for padding (see below
 251      * comment). So extend it a bit before hand.
 252      */
 253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 254
 255     /*
 256      * Always use little endian when sending the bitmap. This is
 257      * required that when source and destination VMs are not using the
 258      * same endianess. (Note: big endian won't work.)
 259      */
 260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 261
 262     /* Size of the bitmap, in bytes */
 263     size = DIV_ROUND_UP(nbits, 8);
 264
 265     /*
 266      * size is always aligned to 8 bytes for 64bit machines, but it
 267      * may not be true for 32bit machines. We need this padding to
 268      * make sure the migration can survive even between 32bit and
 269      * 64bit machines.
 270      */
 271     size = ROUND_UP(size, 8);
 272
 273     qemu_put_be64(file, size);
 274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 275     /*
 276      * Mark as an end, in case the middle part is screwed up due to
 277      * some "misterious" reason.
 278      */
 279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 280     qemu_fflush(file);
 281
 282     g_free(le_bitmap);
 283
 284     if (qemu_file_get_error(file)) {
 285         return qemu_file_get_error(file);
 286     }
 287
 288     return size + sizeof(size);
 289 }
 290
 291 /*
 292  * An outstanding page request, on the source, having been received
 293  * and queued
 294  */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the byte offset of the request within
     * rb and its length in bytes - confirm against the queueing code */
    hwaddr    offset;
    hwaddr    len;

    /* Linkage for the RAMState src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 302
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data; save_page_header() uses
     * it to decide whether the block id has to be sent again */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have seen too many dirty pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 358
/* Current RAM migration state; readers in this file treat NULL as
 * "no state available" */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 362
 363 void precopy_infrastructure_init(void)
 364 {
 365     notifier_with_return_list_init(&precopy_notifier_list);
 366 }
 367
 368 void precopy_add_notifier(NotifierWithReturn *n)
 369 {
 370     notifier_with_return_list_add(&precopy_notifier_list, n);
 371 }
 372
 373 void precopy_remove_notifier(NotifierWithReturn *n)
 374 {
 375     notifier_with_return_remove(n);
 376 }
 377
 378 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 379 {
 380     PrecopyNotifyData pnd;
 381     pnd.reason = reason;
 382     pnd.errp = errp;
 383
 384     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 385 }
 386
 387 void precopy_enable_free_page_optimization(void)
 388 {
 389     if (!ram_state) {
 390         return;
 391     }
 392
 393     ram_state->fpo_enabled = true;
 394 }
 395
 396 uint64_t ram_bytes_remaining(void)
 397 {
 398     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 399                        0;
 400 }
 401
/*
 * RAM migration statistics.  This file reads or updates, among others,
 * the dirty_sync_count, transferred, normal, duplicate and
 * dirty_pages_rate fields.
 */
MigrationStats ram_counters;
 403
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (a page index, not a byte offset -
     * NOTE(review): inferred from the name and type, confirm at users) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 414
/*
 * Compression statistics.  This file reads or updates, among others,
 * the pages, busy, busy_rate and compressed_size fields.
 */
CompressionStats compression_counters;
 416
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, once a page is compressed;
     * initialised to true at setup */
    bool done;
    /* Set under mutex to make the worker leave its loop */
    bool quit;
    /* Result of do_compress_ram_page() for the last page handled */
    bool zero_page;
    /* Dummy QEMUFile (empty ops) used as a buffer for the compressed
     * data; also serves as the "thread was initialised" indicator */
    QEMUFile *file;
    /* mutex and cond guard the hand-over of block/offset and quit */
    QemuMutex mutex;
    QemuCond cond;
    /* Page to compress: non-NULL block means work is pending */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    /* zlib stream set up with deflateInit() at setup */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 432
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using this struct is outside this part of the
 * file; the field notes below are inferred from names and from the
 * parallel CompressParam - confirm against the decompression code.
 */
struct DecompressParam {
    /* presumably: worker has finished its current job */
    bool done;
    /* presumably: worker is asked to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination for the decompressed data */
    void *des;
    /* presumably: buffer holding len bytes of compressed input */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 444
/* Arrays with one entry per compression thread, allocated by
 * compress_threads_save_setup() and released by
 * compress_threads_save_cleanup() */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression side counterparts; their users are not in this part
 * of the file */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: called by do_data_compress(); its boolean result
 * is stored in CompressParam.zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 464
/*
 * do_data_compress: body of a compression worker thread
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL
 * the worker takes the (block, offset) pair, compresses that page with
 * do_compress_ram_page() and reports completion through param->done
 * and comp_done_cond; otherwise it sleeps on param->cond.
 *
 * @opaque: the CompressParam of this thread
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Claim the work item while still holding param->mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            /* Compress without holding the per-thread mutex */
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result under comp_done_lock and wake a waiter
             * on comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit and block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing to do: wait for new work or a quit request */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 498
 499 static void compress_threads_save_cleanup(void)
 500 {
 501     int i, thread_count;
 502
 503     if (!migrate_use_compression() || !comp_param) {
 504         return;
 505     }
 506
 507     thread_count = migrate_compress_threads();
 508     for (i = 0; i < thread_count; i++) {
 509         /*
 510          * we use it as a indicator which shows if the thread is
 511          * properly init'd or not
 512          */
 513         if (!comp_param[i].file) {
 514             break;
 515         }
 516
 517         qemu_mutex_lock(&comp_param[i].mutex);
 518         comp_param[i].quit = true;
 519         qemu_cond_signal(&comp_param[i].cond);
 520         qemu_mutex_unlock(&comp_param[i].mutex);
 521
 522         qemu_thread_join(compress_threads + i);
 523         qemu_mutex_destroy(&comp_param[i].mutex);
 524         qemu_cond_destroy(&comp_param[i].cond);
 525         deflateEnd(&comp_param[i].stream);
 526         g_free(comp_param[i].originbuf);
 527         qemu_fclose(comp_param[i].file);
 528         comp_param[i].file = NULL;
 529     }
 530     qemu_mutex_destroy(&comp_done_lock);
 531     qemu_cond_destroy(&comp_done_cond);
 532     g_free(compress_threads);
 533     g_free(comp_param);
 534     compress_threads = NULL;
 535     comp_param = NULL;
 536 }
 537
 538 static int compress_threads_save_setup(void)
 539 {
 540     int i, thread_count;
 541
 542     if (!migrate_use_compression()) {
 543         return 0;
 544     }
 545     thread_count = migrate_compress_threads();
 546     compress_threads = g_new0(QemuThread, thread_count);
 547     comp_param = g_new0(CompressParam, thread_count);
 548     qemu_cond_init(&comp_done_cond);
 549     qemu_mutex_init(&comp_done_lock);
 550     for (i = 0; i < thread_count; i++) {
 551         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 552         if (!comp_param[i].originbuf) {
 553             goto exit;
 554         }
 555
 556         if (deflateInit(&comp_param[i].stream,
 557                         migrate_compress_level()) != Z_OK) {
 558             g_free(comp_param[i].originbuf);
 559             goto exit;
 560         }
 561
 562         /* comp_param[i].file is just used as a dummy buffer to save data,
 563          * set its ops to empty.
 564          */
 565         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 566         comp_param[i].done = true;
 567         comp_param[i].quit = false;
 568         qemu_mutex_init(&comp_param[i].mutex);
 569         qemu_cond_init(&comp_param[i].cond);
 570         qemu_thread_create(compress_threads + i, "compress",
 571                            do_data_compress, comp_param + i,
 572                            QEMU_THREAD_JOINABLE);
 573     }
 574     return 0;
 575
 576 exit:
 577     compress_threads_save_cleanup();
 578     return -1;
 579 }
 580
 581 /**
 582  * save_page_header: write page header to wire
 583  *
 584  * If this is the 1st block, it also writes the block identification
 585  *
 586  * Returns the number of bytes written
 587  *
 588  * @f: QEMUFile where to send the data
 589  * @block: block that contains the page we want to send
 590  * @offset: offset inside the block for the page
 591  *          in the lower bits, it contains flags
 592  */
 593 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 594                                ram_addr_t offset)
 595 {
 596     size_t size, len;
 597
 598     if (block == rs->last_sent_block) {
 599         offset |= RAM_SAVE_FLAG_CONTINUE;
 600     }
 601     qemu_put_be64(f, offset);
 602     size = 8;
 603
 604     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 605         len = strlen(block->idstr);
 606         qemu_put_byte(f, len);
 607         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 608         size += 1 + len;
 609         rs->last_sent_block = block;
 610     }
 611     return size;
 612 }
 613
 614 /**
 615  * mig_throttle_guest_down: throotle down the guest
 616  *
 617  * Reduce amount of guest cpu execution to hopefully slow down memory
 618  * writes. If guest dirty memory rate is reduced below the rate at
 619  * which we can transfer pages to the destination then we should be
 620  * able to complete migration. Some workloads dirty memory way too
 621  * fast and will not effectively converge, even with auto-converge.
 622  */
 623 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 624                                     uint64_t bytes_dirty_threshold)
 625 {
 626     MigrationState *s = migrate_get_current();
 627     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 628     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 629     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 630     int pct_max = s->parameters.max_cpu_throttle;
 631
 632     uint64_t throttle_now = cpu_throttle_get_percentage();
 633     uint64_t cpu_now, cpu_ideal, throttle_inc;
 634
 635     /* We have not started throttling yet. Let's start it. */
 636     if (!cpu_throttle_active()) {
 637         cpu_throttle_set(pct_initial);
 638     } else {
 639         /* Throttling already on, just increase the rate */
 640         if (!pct_tailslow) {
 641             throttle_inc = pct_increment;
 642         } else {
 643             /* Compute the ideal CPU percentage used by Guest, which may
 644              * make the dirty rate match the dirty rate threshold. */
 645             cpu_now = 100 - throttle_now;
 646             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 647                         bytes_dirty_period);
 648             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 649         }
 650         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 651     }
 652 }
 653
 654 /**
 655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 656  *
 657  * @rs: current RAM state
 658  * @current_addr: address for the zero page
 659  *
 660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 661  * The important thing is that a stale (not-yet-0'd) page be replaced
 662  * by the new data.
 663  * As a bonus, if the page wasn't in the cache it gets added so that
 664  * when a small write is made into the 0'd page it gets XBZRLE sent.
 665  */
 666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 667 {
 668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 669         return;
 670     }
 671
 672     /* We don't care if this fails to allocate a new cache page
 673      * as long as it updated an old one */
 674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 675                  ram_counters.dirty_sync_count);
 676 }
 677
 678 #define ENCODING_FLAG_XBZRLE 0x1
 679
 680 /**
 681  * save_xbzrle_page: compress and send current page
 682  *
 683  * Returns: 1 means that we wrote the page
 684  *          0 means that page is identical to the one already sent
 685  *          -1 means that xbzrle would be longer than normal
 686  *
 687  * @rs: current RAM state
 688  * @current_data: pointer to the address of the page contents
 689  * @current_addr: addr of the page
 690  * @block: block that contains the page we want to send
 691  * @offset: offset inside the block for the page
 692  * @last_stage: if we are at the completion stage
 693  */
 694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 695                             ram_addr_t current_addr, RAMBlock *block,
 696                             ram_addr_t offset, bool last_stage)
 697 {
 698     int encoded_len = 0, bytes_xbzrle;
 699     uint8_t *prev_cached_page;
 700
 701     if (!cache_is_cached(XBZRLE.cache, current_addr,
 702                          ram_counters.dirty_sync_count)) {
 703         xbzrle_counters.cache_miss++;
 704         if (!last_stage) {
 705             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 706                              ram_counters.dirty_sync_count) == -1) {
 707                 return -1;
 708             } else {
 709                 /* update *current_data when the page has been
 710                    inserted into cache */
 711                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 712             }
 713         }
 714         return -1;
 715     }
 716
 717     /*
 718      * Reaching here means the page has hit the xbzrle cache, no matter what
 719      * encoding result it is (normal encoding, overflow or skipping the page),
 720      * count the page as encoded. This is used to caculate the encoding rate.
 721      *
 722      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 723      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 724      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 725      * skipped page included. In this way, the encoding rate can tell if the
 726      * guest page is good for xbzrle encoding.
 727      */
 728     xbzrle_counters.pages++;
 729     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 730
 731     /* save current buffer into memory */
 732     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 733
 734     /* XBZRLE encoding (if there is no overflow) */
 735     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 736                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 737                                        TARGET_PAGE_SIZE);
 738
 739     /*
 740      * Update the cache contents, so that it corresponds to the data
 741      * sent, in all cases except where we skip the page.
 742      */
 743     if (!last_stage && encoded_len != 0) {
 744         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 745         /*
 746          * In the case where we couldn't compress, ensure that the caller
 747          * sends the data from the cache, since the guest might have
 748          * changed the RAM since we copied it.
 749          */
 750         *current_data = prev_cached_page;
 751     }
 752
 753     if (encoded_len == 0) {
 754         trace_save_xbzrle_page_skipping();
 755         return 0;
 756     } else if (encoded_len == -1) {
 757         trace_save_xbzrle_page_overflow();
 758         xbzrle_counters.overflow++;
 759         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 760         return -1;
 761     }
 762
 763     /* Send XBZRLE based compressed page */
 764     bytes_xbzrle = save_page_header(rs, rs->f, block,
 765                                     offset | RAM_SAVE_FLAG_XBZRLE);
 766     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 767     qemu_put_be16(rs->f, encoded_len);
 768     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 769     bytes_xbzrle += encoded_len + 1 + 2;
 770     /*
 771      * Like compressed_size (please see update_compress_thread_counts),
 772      * the xbzrle encoded bytes don't count the 8 byte header with
 773      * RAM_SAVE_FLAG_CONTINUE.
 774      */
 775     xbzrle_counters.bytes += bytes_xbzrle - 8;
 776     ram_counters.transferred += bytes_xbzrle;
 777
 778     return 1;
 779 }
 780
 781 /**
 782  * migration_bitmap_find_dirty: find the next dirty page from start
 783  *
 784  * Returns the page offset within memory region of the start of a dirty page
 785  *
 786  * @rs: current RAM state
 787  * @rb: RAMBlock where to search for dirty pages
 788  * @start: page where we start the search
 789  */
 790 static inline
 791 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 792                                           unsigned long start)
 793 {
 794     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 795     unsigned long *bitmap = rb->bmap;
 796     unsigned long next;
 797
 798     if (ramblock_is_ignored(rb)) {
 799         return size;
 800     }
 801
 802     /*
 803      * When the free page optimization is enabled, we need to check the bitmap
 804      * to send the non-free pages rather than all the pages in the bulk stage.
 805      */
 806     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 807         next = start + 1;
 808     } else {
 809         next = find_next_bit(bitmap, size, start);
 810     }
 811
 812     return next;
 813 }
 814
 815 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 816                                                 RAMBlock *rb,
 817                                                 unsigned long page)
 818 {
 819     bool ret;
 820
 821     qemu_mutex_lock(&rs->bitmap_mutex);
 822
 823     /*
 824      * Clear dirty bitmap if needed.  This _must_ be called before we
 825      * send any of the page in the chunk because we need to make sure
 826      * we can capture further page content changes when we sync dirty
 827      * log the next time.  So as long as we are going to send any of
 828      * the page in the chunk we clear the remote dirty bitmap for all.
 829      * Clearing it earlier won't be a problem, but too late will.
 830      */
 831     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 832         uint8_t shift = rb->clear_bmap_shift;
 833         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 834         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 835
 836         /*
 837          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 838          * can make things easier sometimes since then start address
 839          * of the small chunk will always be 64 pages aligned so the
 840          * bitmap will always be aligned to unsigned long.  We should
 841          * even be able to remove this restriction but I'm simply
 842          * keeping it.
 843          */
 844         assert(shift >= 6);
 845         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 846         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 847     }
 848
 849     ret = test_and_clear_bit(page, rb->bmap);
 850
 851     if (ret) {
 852         rs->migration_dirty_pages--;
 853     }
 854     qemu_mutex_unlock(&rs->bitmap_mutex);
 855
 856     return ret;
 857 }
 858
 859 /* Called with RCU critical section */
 860 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 861 {
 862     uint64_t new_dirty_pages =
 863         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 864
 865     rs->migration_dirty_pages += new_dirty_pages;
 866     rs->num_dirty_pages_period += new_dirty_pages;
 867 }
 868
 869 /**
 870  * ram_pagesize_summary: calculate all the pagesizes of a VM
 871  *
 872  * Returns a summary bitmap of the page sizes of all RAMBlocks
 873  *
 874  * For VMs with just normal pages this is equivalent to the host page
 875  * size. If it's got some huge pages then it's the OR of all the
 876  * different page sizes.
 877  */
 878 uint64_t ram_pagesize_summary(void)
 879 {
 880     RAMBlock *block;
 881     uint64_t summary = 0;
 882
 883     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 884         summary |= block->page_size;
 885     }
 886
 887     return summary;
 888 }
 889
 890 uint64_t ram_get_total_transferred_pages(void)
 891 {
 892     return  ram_counters.normal + ram_counters.duplicate +
 893                 compression_counters.pages + xbzrle_counters.pages;
 894 }
 895
 896 static void migration_update_rates(RAMState *rs, int64_t end_time)
 897 {
 898     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 899     double compressed_size;
 900
 901     /* calculate period counters */
 902     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 903                 / (end_time - rs->time_last_bitmap_sync);
 904
 905     if (!page_count) {
 906         return;
 907     }
 908
 909     if (migrate_use_xbzrle()) {
 910         double encoded_size, unencoded_size;
 911
 912         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 913             rs->xbzrle_cache_miss_prev) / page_count;
 914         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 915         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 916                          TARGET_PAGE_SIZE;
 917         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 918         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 919             xbzrle_counters.encoding_rate = 0;
 920         } else {
 921             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 922         }
 923         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 924         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 925     }
 926
 927     if (migrate_use_compression()) {
 928         compression_counters.busy_rate = (double)(compression_counters.busy -
 929             rs->compress_thread_busy_prev) / page_count;
 930         rs->compress_thread_busy_prev = compression_counters.busy;
 931
 932         compressed_size = compression_counters.compressed_size -
 933                           rs->compressed_size_prev;
 934         if (compressed_size) {
 935             double uncompressed_size = (compression_counters.pages -
 936                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 937
 938             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 939             compression_counters.compression_rate =
 940                                         uncompressed_size / compressed_size;
 941
 942             rs->compress_pages_prev = compression_counters.pages;
 943             rs->compressed_size_prev = compression_counters.compressed_size;
 944         }
 945     }
 946 }
 947
 948 static void migration_trigger_throttle(RAMState *rs)
 949 {
 950     MigrationState *s = migrate_get_current();
 951     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 952
 953     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 954     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 955     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 956
 957     /* During block migration the auto-converge logic incorrectly detects
 958      * that ram migration makes no progress. Avoid this by disabling the
 959      * throttling logic during the bulk phase of block migration. */
 960     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 961         /* The following detection logic can be refined later. For now:
 962            Check to see if the ratio between dirtied bytes and the approx.
 963            amount of bytes that just got transferred since the last time
 964            we were in this routine reaches the threshold. If that happens
 965            twice, start or increase throttling. */
 966
 967         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 968             (++rs->dirty_rate_high_cnt >= 2)) {
 969             trace_migration_throttle();
 970             rs->dirty_rate_high_cnt = 0;
 971             mig_throttle_guest_down(bytes_dirty_period,
 972                                     bytes_dirty_threshold);
 973         }
 974     }
 975 }
 976
 977 static void migration_bitmap_sync(RAMState *rs)
 978 {
 979     RAMBlock *block;
 980     int64_t end_time;
 981
 982     ram_counters.dirty_sync_count++;
 983
 984     if (!rs->time_last_bitmap_sync) {
 985         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 986     }
 987
 988     trace_migration_bitmap_sync_start();
 989     memory_global_dirty_log_sync();
 990
 991     qemu_mutex_lock(&rs->bitmap_mutex);
 992     WITH_RCU_READ_LOCK_GUARD() {
 993         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 994             ramblock_sync_dirty_bitmap(rs, block);
 995         }
 996         ram_counters.remaining = ram_bytes_remaining();
 997     }
 998     qemu_mutex_unlock(&rs->bitmap_mutex);
 999
1000     memory_global_after_dirty_log_sync();
1001     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1002
1003     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1004
1005     /* more than 1 second = 1000 millisecons */
1006     if (end_time > rs->time_last_bitmap_sync + 1000) {
1007         migration_trigger_throttle(rs);
1008
1009         migration_update_rates(rs, end_time);
1010
1011         rs->target_page_count_prev = rs->target_page_count;
1012
1013         /* reset period counters */
1014         rs->time_last_bitmap_sync = end_time;
1015         rs->num_dirty_pages_period = 0;
1016         rs->bytes_xfer_prev = ram_counters.transferred;
1017     }
1018     if (migrate_use_events()) {
1019         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1020     }
1021 }
1022
1023 static void migration_bitmap_sync_precopy(RAMState *rs)
1024 {
1025     Error *local_err = NULL;
1026
1027     /*
1028      * The current notifier usage is just an optimization to migration, so we
1029      * don't stop the normal migration process in the error case.
1030      */
1031     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1032         error_report_err(local_err);
1033         local_err = NULL;
1034     }
1035
1036     migration_bitmap_sync(rs);
1037
1038     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1039         error_report_err(local_err);
1040     }
1041 }
1042
1043 /**
1044  * save_zero_page_to_file: send the zero page to the file
1045  *
1046  * Returns the size of data written to the file, 0 means the page is not
1047  * a zero page
1048  *
1049  * @rs: current RAM state
1050  * @file: the file where the data is saved
1051  * @block: block that contains the page we want to send
1052  * @offset: offset inside the block for the page
1053  */
1054 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1055                                   RAMBlock *block, ram_addr_t offset)
1056 {
1057     uint8_t *p = block->host + offset;
1058     int len = 0;
1059
1060     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1061         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1062         qemu_put_byte(file, 0);
1063         len += 1;
1064     }
1065     return len;
1066 }
1067
1068 /**
1069  * save_zero_page: send the zero page to the stream
1070  *
1071  * Returns the number of pages written.
1072  *
1073  * @rs: current RAM state
1074  * @block: block that contains the page we want to send
1075  * @offset: offset inside the block for the page
1076  */
1077 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1078 {
1079     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1080
1081     if (len) {
1082         ram_counters.duplicate++;
1083         ram_counters.transferred += len;
1084         return 1;
1085     }
1086     return -1;
1087 }
1088
1089 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1090 {
1091     if (!migrate_release_ram() || !migration_in_postcopy()) {
1092         return;
1093     }
1094
1095     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1096 }
1097
1098 /*
1099  * @pages: the number of pages written by the control path,
1100  *        < 0 - error
1101  *        > 0 - number of pages written
1102  *
1103  * Return true if the pages has been saved, otherwise false is returned.
1104  */
1105 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1106                               int *pages)
1107 {
1108     uint64_t bytes_xmit = 0;
1109     int ret;
1110
1111     *pages = -1;
1112     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1113                                 &bytes_xmit);
1114     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1115         return false;
1116     }
1117
1118     if (bytes_xmit) {
1119         ram_counters.transferred += bytes_xmit;
1120         *pages = 1;
1121     }
1122
1123     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1124         return true;
1125     }
1126
1127     if (bytes_xmit > 0) {
1128         ram_counters.normal++;
1129     } else if (bytes_xmit == 0) {
1130         ram_counters.duplicate++;
1131     }
1132
1133     return true;
1134 }
1135
1136 /*
1137  * directly send the page to the stream
1138  *
1139  * Returns the number of pages written.
1140  *
1141  * @rs: current RAM state
1142  * @block: block that contains the page we want to send
1143  * @offset: offset inside the block for the page
1144  * @buf: the page to be sent
1145  * @async: send to page asyncly
1146  */
1147 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1148                             uint8_t *buf, bool async)
1149 {
1150     ram_counters.transferred += save_page_header(rs, rs->f, block,
1151                                                  offset | RAM_SAVE_FLAG_PAGE);
1152     if (async) {
1153         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1154                               migrate_release_ram() &
1155                               migration_in_postcopy());
1156     } else {
1157         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1158     }
1159     ram_counters.transferred += TARGET_PAGE_SIZE;
1160     ram_counters.normal++;
1161     return 1;
1162 }
1163
1164 /**
1165  * ram_save_page: send the given page to the stream
1166  *
1167  * Returns the number of pages written.
1168  *          < 0 - error
1169  *          >=0 - Number of pages written - this might legally be 0
1170  *                if xbzrle noticed the page was the same.
1171  *
1172  * @rs: current RAM state
1173  * @block: block that contains the page we want to send
1174  * @offset: offset inside the block for the page
1175  * @last_stage: if we are at the completion stage
1176  */
1177 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1178 {
1179     int pages = -1;
1180     uint8_t *p;
1181     bool send_async = true;
1182     RAMBlock *block = pss->block;
1183     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1184     ram_addr_t current_addr = block->offset + offset;
1185
1186     p = block->host + offset;
1187     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1188
1189     XBZRLE_cache_lock();
1190     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1191         migrate_use_xbzrle()) {
1192         pages = save_xbzrle_page(rs, &p, current_addr, block,
1193                                  offset, last_stage);
1194         if (!last_stage) {
1195             /* Can't send this cached data async, since the cache page
1196              * might get updated before it gets to the wire
1197              */
1198             send_async = false;
1199         }
1200     }
1201
1202     /* XBZRLE overflow or normal page */
1203     if (pages == -1) {
1204         pages = save_normal_page(rs, block, offset, p, send_async);
1205     }
1206
1207     XBZRLE_cache_unlock();
1208
1209     return pages;
1210 }
1211
1212 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1213                                  ram_addr_t offset)
1214 {
1215     if (multifd_queue_page(rs->f, block, offset) < 0) {
1216         return -1;
1217     }
1218     ram_counters.normal++;
1219
1220     return 1;
1221 }
1222
1223 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1224                                  ram_addr_t offset, uint8_t *source_buf)
1225 {
1226     RAMState *rs = ram_state;
1227     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1228     bool zero_page = false;
1229     int ret;
1230
1231     if (save_zero_page_to_file(rs, f, block, offset)) {
1232         zero_page = true;
1233         goto exit;
1234     }
1235
1236     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1237
1238     /*
1239      * copy it to a internal buffer to avoid it being modified by VM
1240      * so that we can catch up the error during compression and
1241      * decompression
1242      */
1243     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1244     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1245     if (ret < 0) {
1246         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1247         error_report("compressed data failed!");
1248         return false;
1249     }
1250
1251 exit:
1252     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1253     return zero_page;
1254 }
1255
1256 static void
1257 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1258 {
1259     ram_counters.transferred += bytes_xmit;
1260
1261     if (param->zero_page) {
1262         ram_counters.duplicate++;
1263         return;
1264     }
1265
1266     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1267     compression_counters.compressed_size += bytes_xmit - 8;
1268     compression_counters.pages++;
1269 }
1270
1271 static bool save_page_use_compression(RAMState *rs);
1272
1273 static void flush_compressed_data(RAMState *rs)
1274 {
1275     int idx, len, thread_count;
1276
1277     if (!save_page_use_compression(rs)) {
1278         return;
1279     }
1280     thread_count = migrate_compress_threads();
1281
1282     qemu_mutex_lock(&comp_done_lock);
1283     for (idx = 0; idx < thread_count; idx++) {
1284         while (!comp_param[idx].done) {
1285             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1286         }
1287     }
1288     qemu_mutex_unlock(&comp_done_lock);
1289
1290     for (idx = 0; idx < thread_count; idx++) {
1291         qemu_mutex_lock(&comp_param[idx].mutex);
1292         if (!comp_param[idx].quit) {
1293             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1294             /*
1295              * it's safe to fetch zero_page without holding comp_done_lock
1296              * as there is no further request submitted to the thread,
1297              * i.e, the thread should be waiting for a request at this point.
1298              */
1299             update_compress_thread_counts(&comp_param[idx], len);
1300         }
1301         qemu_mutex_unlock(&comp_param[idx].mutex);
1302     }
1303 }
1304
1305 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1306                                        ram_addr_t offset)
1307 {
1308     param->block = block;
1309     param->offset = offset;
1310 }
1311
1312 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1313                                            ram_addr_t offset)
1314 {
1315     int idx, thread_count, bytes_xmit = -1, pages = -1;
1316     bool wait = migrate_compress_wait_thread();
1317
1318     thread_count = migrate_compress_threads();
1319     qemu_mutex_lock(&comp_done_lock);
1320 retry:
1321     for (idx = 0; idx < thread_count; idx++) {
1322         if (comp_param[idx].done) {
1323             comp_param[idx].done = false;
1324             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1325             qemu_mutex_lock(&comp_param[idx].mutex);
1326             set_compress_params(&comp_param[idx], block, offset);
1327             qemu_cond_signal(&comp_param[idx].cond);
1328             qemu_mutex_unlock(&comp_param[idx].mutex);
1329             pages = 1;
1330             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1331             break;
1332         }
1333     }
1334
1335     /*
1336      * wait for the free thread if the user specifies 'compress-wait-thread',
1337      * otherwise we will post the page out in the main thread as normal page.
1338      */
1339     if (pages < 0 && wait) {
1340         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1341         goto retry;
1342     }
1343     qemu_mutex_unlock(&comp_done_lock);
1344
1345     return pages;
1346 }
1347
1348 /**
1349  * find_dirty_block: find the next dirty page and update any state
1350  * associated with the search process.
1351  *
1352  * Returns true if a page is found
1353  *
1354  * @rs: current RAM state
1355  * @pss: data about the state of the current dirty page scan
1356  * @again: set to false if the search has scanned the whole of RAM
1357  */
1358 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1359 {
1360     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1361     if (pss->complete_round && pss->block == rs->last_seen_block &&
1362         pss->page >= rs->last_page) {
1363         /*
1364          * We've been once around the RAM and haven't found anything.
1365          * Give up.
1366          */
1367         *again = false;
1368         return false;
1369     }
1370     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1371         >= pss->block->used_length) {
1372         /* Didn't find anything in this RAM Block */
1373         pss->page = 0;
1374         pss->block = QLIST_NEXT_RCU(pss->block, next);
1375         if (!pss->block) {
1376             /*
1377              * If memory migration starts over, we will meet a dirtied page
1378              * which may still exists in compression threads's ring, so we
1379              * should flush the compressed data to make sure the new page
1380              * is not overwritten by the old one in the destination.
1381              *
1382              * Also If xbzrle is on, stop using the data compression at this
1383              * point. In theory, xbzrle can do better than compression.
1384              */
1385             flush_compressed_data(rs);
1386
1387             /* Hit the end of the list */
1388             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1389             /* Flag that we've looped */
1390             pss->complete_round = true;
1391             rs->ram_bulk_stage = false;
1392         }
1393         /* Didn't find anything this time, but try again on the new block */
1394         *again = true;
1395         return false;
1396     } else {
1397         /* Can go around again, but... */
1398         *again = true;
1399         /* We've found something so probably don't need to */
1400         return true;
1401     }
1402 }
1403
1404 /**
1405  * unqueue_page: gets a page of the queue
1406  *
1407  * Helper for 'get_queued_page' - gets a page off the queue
1408  *
1409  * Returns the block of the page (or NULL if none available)
1410  *
1411  * @rs: current RAM state
1412  * @offset: used to return the offset within the RAMBlock
1413  */
1414 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1415 {
1416     RAMBlock *block = NULL;
1417
1418     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1419         return NULL;
1420     }
1421
1422     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1423     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1424         struct RAMSrcPageRequest *entry =
1425                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1426         block = entry->rb;
1427         *offset = entry->offset;
1428
1429         if (entry->len > TARGET_PAGE_SIZE) {
1430             entry->len -= TARGET_PAGE_SIZE;
1431             entry->offset += TARGET_PAGE_SIZE;
1432         } else {
1433             memory_region_unref(block->mr);
1434             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435             g_free(entry);
1436             migration_consume_urgent_request();
1437         }
1438     }
1439
1440     return block;
1441 }
1442
1443 /**
1444  * get_queued_page: unqueue a page from the postcopy requests
1445  *
1446  * Skips pages that are already sent (!dirty)
1447  *
1448  * Returns true if a queued page is found
1449  *
1450  * @rs: current RAM state
1451  * @pss: data about the state of the current dirty page scan
1452  */
1453 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1454 {
1455     RAMBlock  *block;
1456     ram_addr_t offset;
1457     bool dirty;
1458
1459     do {
1460         block = unqueue_page(rs, &offset);
1461         /*
1462          * We're sending this page, and since it's postcopy nothing else
1463          * will dirty it, and we must make sure it doesn't get sent again
1464          * even if this queue request was received after the background
1465          * search already sent it.
1466          */
1467         if (block) {
1468             unsigned long page;
1469
1470             page = offset >> TARGET_PAGE_BITS;
1471             dirty = test_bit(page, block->bmap);
1472             if (!dirty) {
1473                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1474                                                 page);
1475             } else {
1476                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1477             }
1478         }
1479
1480     } while (block && !dirty);
1481
1482     if (block) {
1483         /*
1484          * As soon as we start servicing pages out of order, then we have
1485          * to kill the bulk stage, since the bulk stage assumes
1486          * in (migration_bitmap_find_and_reset_dirty) that every page is
1487          * dirty, that's no longer true.
1488          */
1489         rs->ram_bulk_stage = false;
1490
1491         /*
1492          * We want the background search to continue from the queued page
1493          * since the guest is likely to want other pages near to the page
1494          * it just requested.
1495          */
1496         pss->block = block;
1497         pss->page = offset >> TARGET_PAGE_BITS;
1498
1499         /*
1500          * This unqueued page would break the "one round" check, even is
1501          * really rare.
1502          */
1503         pss->complete_round = false;
1504     }
1505
1506     return !!block;
1507 }
1508
1509 /**
1510  * migration_page_queue_free: drop any remaining pages in the ram
1511  * request queue
1512  *
1513  * It should be empty at the end anyway, but in error cases there may
1514  * be some left.  in case that there is any page left, we drop it.
1515  *
1516  */
1517 static void migration_page_queue_free(RAMState *rs)
1518 {
1519     struct RAMSrcPageRequest *mspr, *next_mspr;
1520     /* This queue generally should be empty - but in the case of a failed
1521      * migration might have some droppings in.
1522      */
1523     RCU_READ_LOCK_GUARD();
1524     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1525         memory_region_unref(mspr->rb->mr);
1526         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1527         g_free(mspr);
1528     }
1529 }
1530
1531 /**
1532  * ram_save_queue_pages: queue the page for transmission
1533  *
1534  * A request from postcopy destination for example.
1535  *
1536  * Returns zero on success or negative on error
1537  *
1538  * @rbname: Name of the RAMBLock of the request. NULL means the
1539  *          same that last one.
1540  * @start: starting address from the start of the RAMBlock
1541  * @len: length (in bytes) to send
1542  */
1543 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1544 {
1545     RAMBlock *ramblock;
1546     RAMState *rs = ram_state;
1547
1548     ram_counters.postcopy_requests++;
1549     RCU_READ_LOCK_GUARD();
1550
1551     if (!rbname) {
1552         /* Reuse last RAMBlock */
1553         ramblock = rs->last_req_rb;
1554
1555         if (!ramblock) {
1556             /*
1557              * Shouldn't happen, we can't reuse the last RAMBlock if
1558              * it's the 1st request.
1559              */
1560             error_report("ram_save_queue_pages no previous block");
1561             return -1;
1562         }
1563     } else {
1564         ramblock = qemu_ram_block_by_name(rbname);
1565
1566         if (!ramblock) {
1567             /* We shouldn't be asked for a non-existent RAMBlock */
1568             error_report("ram_save_queue_pages no block '%s'", rbname);
1569             return -1;
1570         }
1571         rs->last_req_rb = ramblock;
1572     }
1573     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1574     if (start+len > ramblock->used_length) {
1575         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1576                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1577                      __func__, start, len, ramblock->used_length);
1578         return -1;
1579     }
1580
1581     struct RAMSrcPageRequest *new_entry =
1582         g_malloc0(sizeof(struct RAMSrcPageRequest));
1583     new_entry->rb = ramblock;
1584     new_entry->offset = start;
1585     new_entry->len = len;
1586
1587     memory_region_ref(ramblock->mr);
1588     qemu_mutex_lock(&rs->src_page_req_mutex);
1589     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1590     migration_make_urgent_request();
1591     qemu_mutex_unlock(&rs->src_page_req_mutex);
1592
1593     return 0;
1594 }
1595
1596 static bool save_page_use_compression(RAMState *rs)
1597 {
1598     if (!migrate_use_compression()) {
1599         return false;
1600     }
1601
1602     /*
1603      * If xbzrle is on, stop using the data compression after first
1604      * round of migration even if compression is enabled. In theory,
1605      * xbzrle can do better than compression.
1606      */
1607     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1608         return true;
1609     }
1610
1611     return false;
1612 }
1613
1614 /*
1615  * try to compress the page before posting it out, return true if the page
1616  * has been properly handled by compression, otherwise needs other
1617  * paths to handle it
1618  */
1619 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1620 {
1621     if (!save_page_use_compression(rs)) {
1622         return false;
1623     }
1624
1625     /*
1626      * When starting the process of a new block, the first page of
1627      * the block should be sent out before other pages in the same
1628      * block, and all the pages in last block should have been sent
1629      * out, keeping this order is important, because the 'cont' flag
1630      * is used to avoid resending the block name.
1631      *
1632      * We post the fist page as normal page as compression will take
1633      * much CPU resource.
1634      */
1635     if (block != rs->last_sent_block) {
1636         flush_compressed_data(rs);
1637         return false;
1638     }
1639
1640     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1641         return true;
1642     }
1643
1644     compression_counters.busy++;
1645     return false;
1646 }
1647
1648 /**
1649  * ram_save_target_page: save one target page
1650  *
1651  * Returns the number of pages written
1652  *
1653  * @rs: current RAM state
1654  * @pss: data about the page we want to send
1655  * @last_stage: if we are at the completion stage
1656  */
1657 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1658                                 bool last_stage)
1659 {
1660     RAMBlock *block = pss->block;
1661     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1662     int res;
1663
1664     if (control_save_page(rs, block, offset, &res)) {
1665         return res;
1666     }
1667
1668     if (save_compress_page(rs, block, offset)) {
1669         return 1;
1670     }
1671
1672     res = save_zero_page(rs, block, offset);
1673     if (res > 0) {
1674         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1675          * page would be stale
1676          */
1677         if (!save_page_use_compression(rs)) {
1678             XBZRLE_cache_lock();
1679             xbzrle_cache_zero_page(rs, block->offset + offset);
1680             XBZRLE_cache_unlock();
1681         }
1682         ram_release_pages(block->idstr, offset, res);
1683         return res;
1684     }
1685
1686     /*
1687      * Do not use multifd for:
1688      * 1. Compression as the first page in the new block should be posted out
1689      *    before sending the compressed page
1690      * 2. In postcopy as one whole host page should be placed
1691      */
1692     if (!save_page_use_compression(rs) && migrate_use_multifd()
1693         && !migration_in_postcopy()) {
1694         return ram_save_multifd_page(rs, block, offset);
1695     }
1696
1697     return ram_save_page(rs, pss, last_stage);
1698 }
1699
1700 /**
1701  * ram_save_host_page: save a whole host page
1702  *
1703  * Starting at *offset send pages up to the end of the current host
1704  * page. It's valid for the initial offset to point into the middle of
1705  * a host page in which case the remainder of the hostpage is sent.
1706  * Only dirty target pages are sent. Note that the host page size may
1707  * be a huge page for this block.
1708  * The saving stops at the boundary of the used_length of the block
1709  * if the RAMBlock isn't a multiple of the host page size.
1710  *
1711  * Returns the number of pages written or negative on error
1712  *
1713  * @rs: current RAM state
1714  * @ms: current migration state
1715  * @pss: data about the page we want to send
1716  * @last_stage: if we are at the completion stage
1717  */
1718 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1719                               bool last_stage)
1720 {
1721     int tmppages, pages = 0;
1722     size_t pagesize_bits =
1723         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1724
1725     if (ramblock_is_ignored(pss->block)) {
1726         error_report("block %s should not be migrated !", pss->block->idstr);
1727         return 0;
1728     }
1729
1730     do {
1731         /* Check the pages is dirty and if it is send it */
1732         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1733             pss->page++;
1734             continue;
1735         }
1736
1737         tmppages = ram_save_target_page(rs, pss, last_stage);
1738         if (tmppages < 0) {
1739             return tmppages;
1740         }
1741
1742         pages += tmppages;
1743         pss->page++;
1744         /* Allow rate limiting to happen in the middle of huge pages */
1745         migration_rate_limit();
1746     } while ((pss->page & (pagesize_bits - 1)) &&
1747              offset_in_ramblock(pss->block,
1748                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1749
1750     /* The offset we leave with is the last one we looked at */
1751     pss->page--;
1752     return pages;
1753 }
1754
1755 /**
1756  * ram_find_and_save_block: finds a dirty page and sends it to f
1757  *
1758  * Called within an RCU critical section.
1759  *
1760  * Returns the number of pages written where zero means no dirty pages,
1761  * or negative on error
1762  *
1763  * @rs: current RAM state
1764  * @last_stage: if we are at the completion stage
1765  *
1766  * On systems where host-page-size > target-page-size it will send all the
1767  * pages in a host page that are dirty.
1768  */
1769
1770 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1771 {
1772     PageSearchStatus pss;
1773     int pages = 0;
1774     bool again, found;
1775
1776     /* No dirty page as there is zero RAM */
1777     if (!ram_bytes_total()) {
1778         return pages;
1779     }
1780
1781     pss.block = rs->last_seen_block;
1782     pss.page = rs->last_page;
1783     pss.complete_round = false;
1784
1785     if (!pss.block) {
1786         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1787     }
1788
1789     do {
1790         again = true;
1791         found = get_queued_page(rs, &pss);
1792
1793         if (!found) {
1794             /* priority queue empty, so just search for something dirty */
1795             found = find_dirty_block(rs, &pss, &again);
1796         }
1797
1798         if (found) {
1799             pages = ram_save_host_page(rs, &pss, last_stage);
1800         }
1801     } while (!pages && again);
1802
1803     rs->last_seen_block = pss.block;
1804     rs->last_page = pss.page;
1805
1806     return pages;
1807 }
1808
1809 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1810 {
1811     uint64_t pages = size / TARGET_PAGE_SIZE;
1812
1813     if (zero) {
1814         ram_counters.duplicate += pages;
1815     } else {
1816         ram_counters.normal += pages;
1817         ram_counters.transferred += size;
1818         qemu_update_position(f, size);
1819     }
1820 }
1821
1822 static uint64_t ram_bytes_total_common(bool count_ignored)
1823 {
1824     RAMBlock *block;
1825     uint64_t total = 0;
1826
1827     RCU_READ_LOCK_GUARD();
1828
1829     if (count_ignored) {
1830         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1831             total += block->used_length;
1832         }
1833     } else {
1834         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1835             total += block->used_length;
1836         }
1837     }
1838     return total;
1839 }
1840
/* Total used RAM in bytes, computed with count_ignored == false. */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
1845
1846 static void xbzrle_load_setup(void)
1847 {
1848     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1849 }
1850
1851 static void xbzrle_load_cleanup(void)
1852 {
1853     g_free(XBZRLE.decoded_buf);
1854     XBZRLE.decoded_buf = NULL;
1855 }
1856
1857 static void ram_state_cleanup(RAMState **rsp)
1858 {
1859     if (*rsp) {
1860         migration_page_queue_free(*rsp);
1861         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1862         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1863         g_free(*rsp);
1864         *rsp = NULL;
1865     }
1866 }
1867
1868 static void xbzrle_cleanup(void)
1869 {
1870     XBZRLE_cache_lock();
1871     if (XBZRLE.cache) {
1872         cache_fini(XBZRLE.cache);
1873         g_free(XBZRLE.encoded_buf);
1874         g_free(XBZRLE.current_buf);
1875         g_free(XBZRLE.zero_target_page);
1876         XBZRLE.cache = NULL;
1877         XBZRLE.encoded_buf = NULL;
1878         XBZRLE.current_buf = NULL;
1879         XBZRLE.zero_target_page = NULL;
1880     }
1881     XBZRLE_cache_unlock();
1882 }
1883
1884 static void ram_save_cleanup(void *opaque)
1885 {
1886     RAMState **rsp = opaque;
1887     RAMBlock *block;
1888
1889     /* caller have hold iothread lock or is in a bh, so there is
1890      * no writing race against the migration bitmap
1891      */
1892     memory_global_dirty_log_stop();
1893
1894     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1895         g_free(block->clear_bmap);
1896         block->clear_bmap = NULL;
1897         g_free(block->bmap);
1898         block->bmap = NULL;
1899     }
1900
1901     xbzrle_cleanup();
1902     compress_threads_save_cleanup();
1903     ram_state_cleanup(rsp);
1904 }
1905
1906 static void ram_state_reset(RAMState *rs)
1907 {
1908     rs->last_seen_block = NULL;
1909     rs->last_sent_block = NULL;
1910     rs->last_page = 0;
1911     rs->last_version = ram_list.version;
1912     rs->ram_bulk_stage = true;
1913     rs->fpo_enabled = false;
1914 }
1915
1916 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1917
/*
 * Dump a bitmap to stderr, at most 128 pages per output line.
 *
 * 'todump' is the bitmap to print and 'pages' the number of bits in it.
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * A NULL 'todump' prints nothing: this function has no bitmap to fall
 * back to, so the pointer must not be handed to test_bit().
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    char linebuf[129];
    /* One byte of linebuf is reserved for the terminating NUL */
    const unsigned long max_linelen = sizeof(linebuf) - 1;
    unsigned long cur;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += max_linelen) {
        unsigned long curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        unsigned long linelen = pages - cur < max_linelen ? pages - cur
                                                          : max_linelen;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1951
1952 /* **** functions for postcopy ***** */
1953
1954 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1955 {
1956     struct RAMBlock *block;
1957
1958     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1959         unsigned long *bitmap = block->bmap;
1960         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1961         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1962
1963         while (run_start < range) {
1964             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1965             ram_discard_range(block->idstr,
1966                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1967                               ((ram_addr_t)(run_end - run_start))
1968                                 << TARGET_PAGE_BITS);
1969             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1970         }
1971     }
1972 }
1973
1974 /**
1975  * postcopy_send_discard_bm_ram: discard a RAMBlock
1976  *
1977  * Returns zero on success
1978  *
1979  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1980  *
1981  * @ms: current migration state
1982  * @block: RAMBlock to discard
1983  */
1984 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1985 {
1986     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1987     unsigned long current;
1988     unsigned long *bitmap = block->bmap;
1989
1990     for (current = 0; current < end; ) {
1991         unsigned long one = find_next_bit(bitmap, end, current);
1992         unsigned long zero, discard_length;
1993
1994         if (one >= end) {
1995             break;
1996         }
1997
1998         zero = find_next_zero_bit(bitmap, end, one + 1);
1999
2000         if (zero >= end) {
2001             discard_length = end - one;
2002         } else {
2003             discard_length = zero - one;
2004         }
2005         postcopy_discard_send_range(ms, one, discard_length);
2006         current = one + discard_length;
2007     }
2008
2009     return 0;
2010 }
2011
2012 /**
2013  * postcopy_each_ram_send_discard: discard all RAMBlocks
2014  *
2015  * Returns 0 for success or negative for error
2016  *
2017  * Utility for the outgoing postcopy code.
2018  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2019  *   passing it bitmap indexes and name.
2020  * (qemu_ram_foreach_block ends up passing unscaled lengths
2021  *  which would mean postcopy code would have to deal with target page)
2022  *
2023  * @ms: current migration state
2024  */
2025 static int postcopy_each_ram_send_discard(MigrationState *ms)
2026 {
2027     struct RAMBlock *block;
2028     int ret;
2029
2030     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2031         postcopy_discard_send_init(ms, block->idstr);
2032
2033         /*
2034          * Postcopy sends chunks of bitmap over the wire, but it
2035          * just needs indexes at this point, avoids it having
2036          * target page specific code.
2037          */
2038         ret = postcopy_send_discard_bm_ram(ms, block);
2039         postcopy_discard_send_finish(ms);
2040         if (ret) {
2041             return ret;
2042         }
2043     }
2044
2045     return 0;
2046 }
2047
2048 /**
2049  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2050  *
2051  * Helper for postcopy_chunk_hostpages; it's called twice to
2052  * canonicalize the two bitmaps, that are similar, but one is
2053  * inverted.
2054  *
2055  * Postcopy requires that all target pages in a hostpage are dirty or
2056  * clean, not a mix.  This function canonicalizes the bitmaps.
2057  *
2058  * @ms: current migration state
2059  * @block: block that contains the page we want to canonicalize
2060  */
2061 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2062 {
2063     RAMState *rs = ram_state;
2064     unsigned long *bitmap = block->bmap;
2065     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2066     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2067     unsigned long run_start;
2068
2069     if (block->page_size == TARGET_PAGE_SIZE) {
2070         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2071         return;
2072     }
2073
2074     /* Find a dirty page */
2075     run_start = find_next_bit(bitmap, pages, 0);
2076
2077     while (run_start < pages) {
2078
2079         /*
2080          * If the start of this run of pages is in the middle of a host
2081          * page, then we need to fixup this host page.
2082          */
2083         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2084             /* Find the end of this run */
2085             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2086             /*
2087              * If the end isn't at the start of a host page, then the
2088              * run doesn't finish at the end of a host page
2089              * and we need to discard.
2090              */
2091         }
2092
2093         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2094             unsigned long page;
2095             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2096                                                              host_ratio);
2097             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2098
2099             /* Clean up the bitmap */
2100             for (page = fixup_start_addr;
2101                  page < fixup_start_addr + host_ratio; page++) {
2102                 /*
2103                  * Remark them as dirty, updating the count for any pages
2104                  * that weren't previously dirty.
2105                  */
2106                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2107             }
2108         }
2109
2110         /* Find the next dirty page for the next iteration */
2111         run_start = find_next_bit(bitmap, pages, run_start);
2112     }
2113 }
2114
2115 /**
2116  * postcopy_chunk_hostpages: discard any partially sent host page
2117  *
2118  * Utility for the outgoing postcopy code.
2119  *
2120  * Discard any partially sent host-page size chunks, mark any partially
2121  * dirty host-page size chunks as all dirty.  In this case the host-page
2122  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2123  *
2124  * Returns zero on success
2125  *
2126  * @ms: current migration state
2127  * @block: block we want to work with
2128  */
2129 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2130 {
2131     postcopy_discard_send_init(ms, block->idstr);
2132
2133     /*
2134      * Ensure that all partially dirty host pages are made fully dirty.
2135      */
2136     postcopy_chunk_hostpages_pass(ms, block);
2137
2138     postcopy_discard_send_finish(ms);
2139     return 0;
2140 }
2141
2142 /**
2143  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2144  *
2145  * Returns zero on success
2146  *
2147  * Transmit the set of pages to be discarded after precopy to the target
2148  * these are pages that:
2149  *     a) Have been previously transmitted but are now dirty again
2150  *     b) Pages that have never been transmitted, this ensures that
2151  *        any pages on the destination that have been mapped by background
2152  *        tasks get discarded (transparent huge pages is the specific concern)
2153  * Hopefully this is pretty sparse
2154  *
2155  * @ms: current migration state
2156  */
2157 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2158 {
2159     RAMState *rs = ram_state;
2160     RAMBlock *block;
2161     int ret;
2162
2163     RCU_READ_LOCK_GUARD();
2164
2165     /* This should be our last sync, the src is now paused */
2166     migration_bitmap_sync(rs);
2167
2168     /* Easiest way to make sure we don't resume in the middle of a host-page */
2169     rs->last_seen_block = NULL;
2170     rs->last_sent_block = NULL;
2171     rs->last_page = 0;
2172
2173     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2174         /* Deal with TPS != HPS and huge pages */
2175         ret = postcopy_chunk_hostpages(ms, block);
2176         if (ret) {
2177             return ret;
2178         }
2179
2180 #ifdef DEBUG_POSTCOPY
2181         ram_debug_dump_bitmap(block->bmap, true,
2182                               block->used_length >> TARGET_PAGE_BITS);
2183 #endif
2184     }
2185     trace_ram_postcopy_send_discard_bitmap();
2186
2187     return postcopy_each_ram_send_discard(ms);
2188 }
2189
2190 /**
2191  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2192  *
2193  * Returns zero on success
2194  *
2195  * @rbname: name of the RAMBlock of the request. NULL means the
2196  *          same that last one.
2197  * @start: RAMBlock starting page
2198  * @length: RAMBlock size
2199  */
2200 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2201 {
2202     trace_ram_discard_range(rbname, start, length);
2203
2204     RCU_READ_LOCK_GUARD();
2205     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2206
2207     if (!rb) {
2208         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2209         return -1;
2210     }
2211
2212     /*
2213      * On source VM, we don't need to update the received bitmap since
2214      * we don't even have one.
2215      */
2216     if (rb->receivedmap) {
2217         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2218                      length >> qemu_target_page_bits());
2219     }
2220
2221     return ram_block_discard_range(rb, start, length);
2222 }
2223
2224 /*
2225  * For every allocation, we will try not to crash the VM if the
2226  * allocation failed.
2227  */
2228 static int xbzrle_init(void)
2229 {
2230     Error *local_err = NULL;
2231
2232     if (!migrate_use_xbzrle()) {
2233         return 0;
2234     }
2235
2236     XBZRLE_cache_lock();
2237
2238     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2239     if (!XBZRLE.zero_target_page) {
2240         error_report("%s: Error allocating zero page", __func__);
2241         goto err_out;
2242     }
2243
2244     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2245                               TARGET_PAGE_SIZE, &local_err);
2246     if (!XBZRLE.cache) {
2247         error_report_err(local_err);
2248         goto free_zero_page;
2249     }
2250
2251     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2252     if (!XBZRLE.encoded_buf) {
2253         error_report("%s: Error allocating encoded_buf", __func__);
2254         goto free_cache;
2255     }
2256
2257     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2258     if (!XBZRLE.current_buf) {
2259         error_report("%s: Error allocating current_buf", __func__);
2260         goto free_encoded_buf;
2261     }
2262
2263     /* We are all good */
2264     XBZRLE_cache_unlock();
2265     return 0;
2266
2267 free_encoded_buf:
2268     g_free(XBZRLE.encoded_buf);
2269     XBZRLE.encoded_buf = NULL;
2270 free_cache:
2271     cache_fini(XBZRLE.cache);
2272     XBZRLE.cache = NULL;
2273 free_zero_page:
2274     g_free(XBZRLE.zero_target_page);
2275     XBZRLE.zero_target_page = NULL;
2276 err_out:
2277     XBZRLE_cache_unlock();
2278     return -ENOMEM;
2279 }
2280
2281 static int ram_state_init(RAMState **rsp)
2282 {
2283     *rsp = g_try_new0(RAMState, 1);
2284
2285     if (!*rsp) {
2286         error_report("%s: Init ramstate fail", __func__);
2287         return -1;
2288     }
2289
2290     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2291     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2292     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2293
2294     /*
2295      * Count the total number of pages used by ram blocks not including any
2296      * gaps due to alignment or unplugs.
2297      * This must match with the initial values of dirty bitmap.
2298      */
2299     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2300     ram_state_reset(*rsp);
2301
2302     return 0;
2303 }
2304
2305 static void ram_list_init_bitmaps(void)
2306 {
2307     MigrationState *ms = migrate_get_current();
2308     RAMBlock *block;
2309     unsigned long pages;
2310     uint8_t shift;
2311
2312     /* Skip setting bitmap if there is no RAM */
2313     if (ram_bytes_total()) {
2314         shift = ms->clear_bitmap_shift;
2315         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2316             error_report("clear_bitmap_shift (%u) too big, using "
2317                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2318             shift = CLEAR_BITMAP_SHIFT_MAX;
2319         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2320             error_report("clear_bitmap_shift (%u) too small, using "
2321                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2322             shift = CLEAR_BITMAP_SHIFT_MIN;
2323         }
2324
2325         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2326             pages = block->max_length >> TARGET_PAGE_BITS;
2327             /*
2328              * The initial dirty bitmap for migration must be set with all
2329              * ones to make sure we'll migrate every guest RAM page to
2330              * destination.
2331              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2332              * new migration after a failed migration, ram_list.
2333              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2334              * guest memory.
2335              */
2336             block->bmap = bitmap_new(pages);
2337             bitmap_set(block->bmap, 0, pages);
2338             block->clear_bmap_shift = shift;
2339             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2340         }
2341     }
2342 }
2343
/*
 * Create the per-block migration bitmaps, start global dirty logging and
 * perform the first precopy bitmap sync for @rs.
 *
 * The three steps run inside an RCU read-side section, with the iothread
 * lock and the ramlist lock both held; the locks are released in the
 * reverse order of acquisition.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        /* Bitmaps must exist before dirty logging is started and synced */
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2358
2359 static int ram_init_all(RAMState **rsp)
2360 {
2361     if (ram_state_init(rsp)) {
2362         return -1;
2363     }
2364
2365     if (xbzrle_init()) {
2366         ram_state_cleanup(rsp);
2367         return -1;
2368     }
2369
2370     ram_init_bitmaps(*rsp);
2371
2372     return 0;
2373 }
2374
2375 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2376 {
2377     RAMBlock *block;
2378     uint64_t pages = 0;
2379
2380     /*
2381      * Postcopy is not using xbzrle/compression, so no need for that.
2382      * Also, since source are already halted, we don't need to care
2383      * about dirty page logging as well.
2384      */
2385
2386     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2387         pages += bitmap_count_one(block->bmap,
2388                                   block->used_length >> TARGET_PAGE_BITS);
2389     }
2390
2391     /* This may not be aligned with current bitmaps. Recalculate. */
2392     rs->migration_dirty_pages = pages;
2393
2394     rs->last_seen_block = NULL;
2395     rs->last_sent_block = NULL;
2396     rs->last_page = 0;
2397     rs->last_version = ram_list.version;
2398     /*
2399      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2400      * matter what we have sent.
2401      */
2402     rs->ram_bulk_stage = false;
2403
2404     /* Update RAMState cache of output QEMUFile */
2405     rs->f = out;
2406
2407     trace_ram_state_resume_prepare(pages);
2408 }
2409
2410 /*
2411  * This function clears bits of the free pages reported by the caller from the
2412  * migration dirty bitmap. @addr is the host address corresponding to the
2413  * start of the continuous guest free pages, and @len is the total bytes of
2414  * those pages.
2415  */
2416 void qemu_guest_free_page_hint(void *addr, size_t len)
2417 {
2418     RAMBlock *block;
2419     ram_addr_t offset;
2420     size_t used_len, start, npages;
2421     MigrationState *s = migrate_get_current();
2422
2423     /* This function is currently expected to be used during live migration */
2424     if (!migration_is_setup_or_active(s->state)) {
2425         return;
2426     }
2427
2428     for (; len > 0; len -= used_len, addr += used_len) {
2429         block = qemu_ram_block_from_host(addr, false, &offset);
2430         if (unlikely(!block || offset >= block->used_length)) {
2431             /*
2432              * The implementation might not support RAMBlock resize during
2433              * live migration, but it could happen in theory with future
2434              * updates. So we add a check here to capture that case.
2435              */
2436             error_report_once("%s unexpected error", __func__);
2437             return;
2438         }
2439
2440         if (len <= block->used_length - offset) {
2441             used_len = len;
2442         } else {
2443             used_len = block->used_length - offset;
2444         }
2445
2446         start = offset >> TARGET_PAGE_BITS;
2447         npages = used_len >> TARGET_PAGE_BITS;
2448
2449         qemu_mutex_lock(&ram_state->bitmap_mutex);
2450         ram_state->migration_dirty_pages -=
2451                       bitmap_count_one_with_offset(block->bmap, start, npages);
2452         bitmap_clear(block->bmap, start, npages);
2453         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2454     }
2455 }
2456
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
2462  */
2463
2464 /**
2465  * ram_save_setup: Setup RAM for migration
2466  *
2467  * Returns zero to indicate success and negative for error
2468  *
2469  * @f: QEMUFile where to send the data
2470  * @opaque: RAMState pointer
2471  */
2472 static int ram_save_setup(QEMUFile *f, void *opaque)
2473 {
2474     RAMState **rsp = opaque;
2475     RAMBlock *block;
2476
2477     if (compress_threads_save_setup()) {
2478         return -1;
2479     }
2480
2481     /* migration has already setup the bitmap, reuse it. */
2482     if (!migration_in_colo_state()) {
2483         if (ram_init_all(rsp) != 0) {
2484             compress_threads_save_cleanup();
2485             return -1;
2486         }
2487     }
2488     (*rsp)->f = f;
2489
2490     WITH_RCU_READ_LOCK_GUARD() {
2491         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2492
2493         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2494             qemu_put_byte(f, strlen(block->idstr));
2495             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2496             qemu_put_be64(f, block->used_length);
2497             if (migrate_postcopy_ram() && block->page_size !=
2498                                           qemu_host_page_size) {
2499                 qemu_put_be64(f, block->page_size);
2500             }
2501             if (migrate_ignore_shared()) {
2502                 qemu_put_be64(f, block->mr->addr);
2503             }
2504         }
2505     }
2506
2507     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2508     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2509
2510     multifd_send_sync_main(f);
2511     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2512     qemu_fflush(f);
2513
2514     return 0;
2515 }
2516
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the file's rate limit is hit (and no page
 * requests are queued), an error occurs, no dirty page is left, or more
 * than MAX_WAIT ms have elapsed.  Nothing is sent while the bulk phase of
 * block migration is active.
 *
 * Returns negative for error; otherwise 1 when ram_find_and_save_block()
 * reported that no pages are left to send, and 0 when the round ended for
 * any other reason.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM list changed since the last round: restart the search */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * ret keeps the last rate-limit result.  Queued page requests keep
         * the loop going even when the rate limit has been reached.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to sent */
            if (pages == 0) {
                done = 1;
                break;
            }

            /*
             * The error is stored in the file, not in ret; it is picked
             * up again below through qemu_file_get_error().
             */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time converted from ns to ms */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    /* The EOS marker is only written while the migration is still running */
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8 bytes of the EOS marker just written */
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
2623
2624 /**
2625  * ram_save_complete: function called to send the remaining amount of ram
2626  *
2627  * Returns zero to indicate success or negative on error
2628  *
2629  * Called with iothread lock
2630  *
2631  * @f: QEMUFile where to send the data
2632  * @opaque: RAMState pointer
2633  */
2634 static int ram_save_complete(QEMUFile *f, void *opaque)
2635 {
2636     RAMState **temp = opaque;
2637     RAMState *rs = *temp;
2638     int ret = 0;
2639
2640     WITH_RCU_READ_LOCK_GUARD() {
2641         if (!migration_in_postcopy()) {
2642             migration_bitmap_sync_precopy(rs);
2643         }
2644
2645         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2646
2647         /* try transferring iterative blocks of memory */
2648
2649         /* flush all remaining blocks regardless of rate limiting */
2650         while (true) {
2651             int pages;
2652
2653             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2654             /* no more blocks to sent */
2655             if (pages == 0) {
2656                 break;
2657             }
2658             if (pages < 0) {
2659                 ret = pages;
2660                 break;
2661             }
2662         }
2663
2664         flush_compressed_data(rs);
2665         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2666     }
2667
2668     if (ret >= 0) {
2669         multifd_send_sync_main(rs->f);
2670         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2671         qemu_fflush(f);
2672     }
2673
2674     return ret;
2675 }
2676
2677 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2678                              uint64_t *res_precopy_only,
2679                              uint64_t *res_compatible,
2680                              uint64_t *res_postcopy_only)
2681 {
2682     RAMState **temp = opaque;
2683     RAMState *rs = *temp;
2684     uint64_t remaining_size;
2685
2686     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2687
2688     if (!migration_in_postcopy() &&
2689         remaining_size < max_size) {
2690         qemu_mutex_lock_iothread();
2691         WITH_RCU_READ_LOCK_GUARD() {
2692             migration_bitmap_sync_precopy(rs);
2693         }
2694         qemu_mutex_unlock_iothread();
2695         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2696     }
2697
2698     if (migrate_postcopy_ram()) {
2699         /* We can do postcopy, and all the data is postcopiable */
2700         *res_compatible += remaining_size;
2701     } else {
2702         *res_precopy_only += remaining_size;
2703     }
2704 }
2705
2706 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2707 {
2708     unsigned int xh_len;
2709     int xh_flags;
2710     uint8_t *loaded_data;
2711
2712     /* extract RLE header */
2713     xh_flags = qemu_get_byte(f);
2714     xh_len = qemu_get_be16(f);
2715
2716     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2717         error_report("Failed to load XBZRLE page - wrong compression!");
2718         return -1;
2719     }
2720
2721     if (xh_len > TARGET_PAGE_SIZE) {
2722         error_report("Failed to load XBZRLE page - len overflow!");
2723         return -1;
2724     }
2725     loaded_data = XBZRLE.decoded_buf;
2726     /* load data and decode */
2727     /* it can change loaded_data to point to an internal buffer */
2728     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2729
2730     /* decode RLE */
2731     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2732                              TARGET_PAGE_SIZE) == -1) {
2733         error_report("Failed to load XBZRLE page - decode error!");
2734         return -1;
2735     }
2736
2737     return 0;
2738 }
2739
2740 /**
2741  * ram_block_from_stream: read a RAMBlock id from the migration stream
2742  *
2743  * Must be called from within a rcu critical section.
2744  *
2745  * Returns a pointer from within the RCU-protected ram_list.
2746  *
2747  * @f: QEMUFile where to read the data from
2748  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2749  */
2750 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2751 {
2752     static RAMBlock *block = NULL;
2753     char id[256];
2754     uint8_t len;
2755
2756     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2757         if (!block) {
2758             error_report("Ack, bad migration stream!");
2759             return NULL;
2760         }
2761         return block;
2762     }
2763
2764     len = qemu_get_byte(f);
2765     qemu_get_buffer(f, (uint8_t *)id, len);
2766     id[len] = 0;
2767
2768     block = qemu_ram_block_by_name(id);
2769     if (!block) {
2770         error_report("Can't find block %s", id);
2771         return NULL;
2772     }
2773
2774     if (ramblock_is_ignored(block)) {
2775         error_report("block %s should not be migrated !", id);
2776         return NULL;
2777     }
2778
2779     return block;
2780 }
2781
2782 static inline void *host_from_ram_block_offset(RAMBlock *block,
2783                                                ram_addr_t offset)
2784 {
2785     if (!offset_in_ramblock(block, offset)) {
2786         return NULL;
2787     }
2788
2789     return block->host + offset;
2790 }
2791
2792 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2793                              ram_addr_t offset, bool record_bitmap)
2794 {
2795     if (!offset_in_ramblock(block, offset)) {
2796         return NULL;
2797     }
2798     if (!block->colo_cache) {
2799         error_report("%s: colo_cache is NULL in block :%s",
2800                      __func__, block->idstr);
2801         return NULL;
2802     }
2803
2804     /*
2805     * During colo checkpoint, we need bitmap of these migrated pages.
2806     * It help us to decide which pages in ram cache should be flushed
2807     * into VM's RAM later.
2808     */
2809     if (record_bitmap &&
2810         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2811         ram_state->migration_dirty_pages++;
2812     }
2813     return block->colo_cache + offset;
2814 }
2815
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch. The write is skipped when @ch is
 * zero and the range already reads as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to do if the range is already all zeroes */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2832
2833 /* return the size after decompression, or negative value on error */
2834 static int
2835 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2836                      const uint8_t *source, size_t source_len)
2837 {
2838     int err;
2839
2840     err = inflateReset(stream);
2841     if (err != Z_OK) {
2842         return -1;
2843     }
2844
2845     stream->avail_in = source_len;
2846     stream->next_in = (uint8_t *)source;
2847     stream->avail_out = dest_len;
2848     stream->next_out = dest;
2849
2850     err = inflate(stream, Z_NO_FLUSH);
2851     if (err != Z_STREAM_END) {
2852         return -1;
2853     }
2854
2855     return stream->total_out;
2856 }
2857
/*
 * do_data_decompress: body of a decompression worker thread
 *
 * Waits on param->cond until param->des is set, inflates param->len bytes
 * from param->compbuf into that destination, then sets param->done and
 * signals decomp_done_cond. Repeats until param->quit is set.
 *
 * Returns NULL
 *
 * @opaque: the DecompressParam owned by this thread
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Copy the request locally and clear the slot */
            des = param->des;
            len = param->len;
            param->des = 0;
            /* param->mutex is not held while decompressing */
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* Failures are only reported when decompress_error_check is set */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Mark this worker idle and wake up whoever waits for one */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take param->mutex before re-checking quit/des */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* No work queued: sleep until signalled */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
2896
2897 static int wait_for_decompress_done(void)
2898 {
2899     int idx, thread_count;
2900
2901     if (!migrate_use_compression()) {
2902         return 0;
2903     }
2904
2905     thread_count = migrate_decompress_threads();
2906     qemu_mutex_lock(&decomp_done_lock);
2907     for (idx = 0; idx < thread_count; idx++) {
2908         while (!decomp_param[idx].done) {
2909             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2910         }
2911     }
2912     qemu_mutex_unlock(&decomp_done_lock);
2913     return qemu_file_get_error(decomp_file);
2914 }
2915
2916 static void compress_threads_load_cleanup(void)
2917 {
2918     int i, thread_count;
2919
2920     if (!migrate_use_compression()) {
2921         return;
2922     }
2923     thread_count = migrate_decompress_threads();
2924     for (i = 0; i < thread_count; i++) {
2925         /*
2926          * we use it as a indicator which shows if the thread is
2927          * properly init'd or not
2928          */
2929         if (!decomp_param[i].compbuf) {
2930             break;
2931         }
2932
2933         qemu_mutex_lock(&decomp_param[i].mutex);
2934         decomp_param[i].quit = true;
2935         qemu_cond_signal(&decomp_param[i].cond);
2936         qemu_mutex_unlock(&decomp_param[i].mutex);
2937     }
2938     for (i = 0; i < thread_count; i++) {
2939         if (!decomp_param[i].compbuf) {
2940             break;
2941         }
2942
2943         qemu_thread_join(decompress_threads + i);
2944         qemu_mutex_destroy(&decomp_param[i].mutex);
2945         qemu_cond_destroy(&decomp_param[i].cond);
2946         inflateEnd(&decomp_param[i].stream);
2947         g_free(decomp_param[i].compbuf);
2948         decomp_param[i].compbuf = NULL;
2949     }
2950     g_free(decompress_threads);
2951     g_free(decomp_param);
2952     decompress_threads = NULL;
2953     decomp_param = NULL;
2954     decomp_file = NULL;
2955 }
2956
2957 static int compress_threads_load_setup(QEMUFile *f)
2958 {
2959     int i, thread_count;
2960
2961     if (!migrate_use_compression()) {
2962         return 0;
2963     }
2964
2965     thread_count = migrate_decompress_threads();
2966     decompress_threads = g_new0(QemuThread, thread_count);
2967     decomp_param = g_new0(DecompressParam, thread_count);
2968     qemu_mutex_init(&decomp_done_lock);
2969     qemu_cond_init(&decomp_done_cond);
2970     decomp_file = f;
2971     for (i = 0; i < thread_count; i++) {
2972         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2973             goto exit;
2974         }
2975
2976         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2977         qemu_mutex_init(&decomp_param[i].mutex);
2978         qemu_cond_init(&decomp_param[i].cond);
2979         decomp_param[i].done = true;
2980         decomp_param[i].quit = false;
2981         qemu_thread_create(decompress_threads + i, "decompress",
2982                            do_data_decompress, decomp_param + i,
2983                            QEMU_THREAD_JOINABLE);
2984     }
2985     return 0;
2986 exit:
2987     compress_threads_load_cleanup();
2988     return -1;
2989 }
2990
/*
 * decompress_data_with_multi_threads: hand one compressed page to an idle
 * decompression thread
 *
 * Scans the workers for one whose 'done' flag is set, reads @len bytes of
 * compressed data from @f into that worker's compbuf and signals it to
 * decompress into @host. If no worker is idle, waits on decomp_done_cond
 * and scans again.
 *
 * @f: QEMUFile to read the compressed data from
 * @host: destination the worker decompresses into
 * @len: number of compressed bytes to read
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                /* Claim the idle worker while decomp_done_lock is held */
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                /* Setting 'des' is what the worker treats as a request */
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            /* A worker was found and the page has been dispatched */
            break;
        } else {
            /* All workers busy: wait for one to finish, then rescan */
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
3019
3020 /*
3021  * colo cache: this is for secondary VM, we cache the whole
3022  * memory of the secondary VM, it is need to hold the global lock
3023  * to call this helper.
3024  */
3025 int colo_init_ram_cache(void)
3026 {
3027     RAMBlock *block;
3028
3029     WITH_RCU_READ_LOCK_GUARD() {
3030         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3032                                                     NULL,
3033                                                     false);
3034             if (!block->colo_cache) {
3035                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3036                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3037                              block->used_length);
3038                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3039                     if (block->colo_cache) {
3040                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3041                         block->colo_cache = NULL;
3042                     }
3043                 }
3044                 return -errno;
3045             }
3046         }
3047     }
3048
3049     /*
3050     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3051     * with to decide which page in cache should be flushed into SVM's RAM. Here
3052     * we use the same name 'ram_bitmap' as for migration.
3053     */
3054     if (ram_bytes_total()) {
3055         RAMBlock *block;
3056
3057         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3058             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3059             block->bmap = bitmap_new(pages);
3060         }
3061     }
3062
3063     ram_state_init(&ram_state);
3064     return 0;
3065 }
3066
/* TODO: duplicated with ram_init_bitmaps */
/*
 * colo_incoming_start_dirty_log: start dirty logging from a clean state
 *
 * Syncs the dirty log into each non-ignored block's bitmap, throws that
 * record away by zeroing the bitmap, starts global dirty logging and
 * resets the dirty page counter. Takes the iothread lock and the ramlist
 * lock itself.
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start();
    }
    /* The bitmaps were just cleared, so no page counts as dirty */
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3088
3089 /* It is need to hold the global lock to call this helper */
3090 void colo_release_ram_cache(void)
3091 {
3092     RAMBlock *block;
3093
3094     memory_global_dirty_log_stop();
3095     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3096         g_free(block->bmap);
3097         block->bmap = NULL;
3098     }
3099
3100     WITH_RCU_READ_LOCK_GUARD() {
3101         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3102             if (block->colo_cache) {
3103                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3104                 block->colo_cache = NULL;
3105             }
3106         }
3107     }
3108     ram_state_cleanup(&ram_state);
3109 }
3110
3111 /**
3112  * ram_load_setup: Setup RAM for migration incoming side
3113  *
3114  * Returns zero to indicate success and negative for error
3115  *
3116  * @f: QEMUFile where to receive the data
3117  * @opaque: RAMState pointer
3118  */
3119 static int ram_load_setup(QEMUFile *f, void *opaque)
3120 {
3121     if (compress_threads_load_setup(f)) {
3122         return -1;
3123     }
3124
3125     xbzrle_load_setup();
3126     ramblock_recv_map_init();
3127
3128     return 0;
3129 }
3130
3131 static int ram_load_cleanup(void *opaque)
3132 {
3133     RAMBlock *rb;
3134
3135     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3136         qemu_ram_block_writeback(rb);
3137     }
3138
3139     xbzrle_load_cleanup();
3140     compress_threads_load_cleanup();
3141
3142     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3143         g_free(rb->receivedmap);
3144         rb->receivedmap = NULL;
3145     }
3146
3147     return 0;
3148 }
3149
3150 /**
3151  * ram_postcopy_incoming_init: allocate postcopy data structures
3152  *
3153  * Returns 0 for success and negative if there was one error
3154  *
3155  * @mis: current migration incoming state
3156  *
3157  * Allocate data structures etc needed by incoming migration with
3158  * postcopy-ram. postcopy-ram's similarly names
3159  * postcopy_ram_incoming_init does the work.
3160  */
3161 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3162 {
3163     return postcopy_ram_incoming_init(mis);
3164 }
3165
3166 /**
3167  * ram_load_postcopy: load a page in postcopy case
3168  *
3169  * Returns 0 for success or -errno in case of error
3170  *
3171  * Called in postcopy mode by ram_load().
3172  * rcu_read_lock is taken prior to this being called.
3173  *
3174  * @f: QEMUFile where to send the data
3175  */
3176 static int ram_load_postcopy(QEMUFile *f)
3177 {
3178     int flags = 0, ret = 0;
3179     bool place_needed = false;
3180     bool matches_target_page_size = false;
3181     MigrationIncomingState *mis = migration_incoming_get_current();
3182     /* Temporary page that is later 'placed' */
3183     void *postcopy_host_page = mis->postcopy_tmp_page;
3184     void *this_host = NULL;
3185     bool all_zero = true;
3186     int target_pages = 0;
3187
3188     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3189         ram_addr_t addr;
3190         void *host = NULL;
3191         void *page_buffer = NULL;
3192         void *place_source = NULL;
3193         RAMBlock *block = NULL;
3194         uint8_t ch;
3195         int len;
3196
3197         addr = qemu_get_be64(f);
3198
3199         /*
3200          * If qemu file error, we should stop here, and then "addr"
3201          * may be invalid
3202          */
3203         ret = qemu_file_get_error(f);
3204         if (ret) {
3205             break;
3206         }
3207
3208         flags = addr & ~TARGET_PAGE_MASK;
3209         addr &= TARGET_PAGE_MASK;
3210
3211         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3212         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3213                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3214             block = ram_block_from_stream(f, flags);
3215
3216             host = host_from_ram_block_offset(block, addr);
3217             if (!host) {
3218                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3219                 ret = -EINVAL;
3220                 break;
3221             }
3222             target_pages++;
3223             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3224             /*
3225              * Postcopy requires that we place whole host pages atomically;
3226              * these may be huge pages for RAMBlocks that are backed by
3227              * hugetlbfs.
3228              * To make it atomic, the data is read into a temporary page
3229              * that's moved into place later.
3230              * The migration protocol uses,  possibly smaller, target-pages
3231              * however the source ensures it always sends all the components
3232              * of a host page in one chunk.
3233              */
3234             page_buffer = postcopy_host_page +
3235                           ((uintptr_t)host & (block->page_size - 1));
3236             if (target_pages == 1) {
3237                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3238                                                     block->page_size);
3239             } else {
3240                 /* not the 1st TP within the HP */
3241                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3242                     (uintptr_t)this_host) {
3243                     error_report("Non-same host page %p/%p",
3244                                   host, this_host);
3245                     ret = -EINVAL;
3246                     break;
3247                 }
3248             }
3249
3250             /*
3251              * If it's the last part of a host page then we place the host
3252              * page
3253              */
3254             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3255                 place_needed = true;
3256             }
3257             place_source = postcopy_host_page;
3258         }
3259
3260         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3261         case RAM_SAVE_FLAG_ZERO:
3262             ch = qemu_get_byte(f);
3263             /*
3264              * Can skip to set page_buffer when
3265              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3266              */
3267             if (ch || !matches_target_page_size) {
3268                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3269             }
3270             if (ch) {
3271                 all_zero = false;
3272             }
3273             break;
3274
3275         case RAM_SAVE_FLAG_PAGE:
3276             all_zero = false;
3277             if (!matches_target_page_size) {
3278                 /* For huge pages, we always use temporary buffer */
3279                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3280             } else {
3281                 /*
3282                  * For small pages that matches target page size, we
3283                  * avoid the qemu_file copy.  Instead we directly use
3284                  * the buffer of QEMUFile to place the page.  Note: we
3285                  * cannot do any QEMUFile operation before using that
3286                  * buffer to make sure the buffer is valid when
3287                  * placing the page.
3288                  */
3289                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3290                                          TARGET_PAGE_SIZE);
3291             }
3292             break;
3293         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3294             all_zero = false;
3295             len = qemu_get_be32(f);
3296             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3297                 error_report("Invalid compressed data length: %d", len);
3298                 ret = -EINVAL;
3299                 break;
3300             }
3301             decompress_data_with_multi_threads(f, page_buffer, len);
3302             break;
3303
3304         case RAM_SAVE_FLAG_EOS:
3305             /* normal exit */
3306             multifd_recv_sync_main();
3307             break;
3308         default:
3309             error_report("Unknown combination of migration flags: %#x"
3310                          " (postcopy mode)", flags);
3311             ret = -EINVAL;
3312             break;
3313         }
3314
3315         /* Got the whole host page, wait for decompress before placing. */
3316         if (place_needed) {
3317             ret |= wait_for_decompress_done();
3318         }
3319
3320         /* Detect for any possible file errors */
3321         if (!ret && qemu_file_get_error(f)) {
3322             ret = qemu_file_get_error(f);
3323         }
3324
3325         if (!ret && place_needed) {
3326             /* This gets called at the last target page in the host page */
3327             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3328                                                        block->page_size);
3329
3330             if (all_zero) {
3331                 ret = postcopy_place_page_zero(mis, place_dest,
3332                                                block);
3333             } else {
3334                 ret = postcopy_place_page(mis, place_dest,
3335                                           place_source, block);
3336             }
3337             place_needed = false;
3338             target_pages = 0;
3339             /* Assume we have a zero page until we detect something different */
3340             all_zero = true;
3341         }
3342     }
3343
3344     return ret;
3345 }
3346
3347 static bool postcopy_is_advised(void)
3348 {
3349     PostcopyState ps = postcopy_state_get();
3350     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3351 }
3352
3353 static bool postcopy_is_running(void)
3354 {
3355     PostcopyState ps = postcopy_state_get();
3356     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3357 }
3358
3359 /*
3360  * Flush content of RAM cache into SVM's memory.
3361  * Only flush the pages that be dirtied by PVM or SVM or both.
3362  */
3363 void colo_flush_ram_cache(void)
3364 {
3365     RAMBlock *block = NULL;
3366     void *dst_host;
3367     void *src_host;
3368     unsigned long offset = 0;
3369
3370     memory_global_dirty_log_sync();
3371     WITH_RCU_READ_LOCK_GUARD() {
3372         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3373             ramblock_sync_dirty_bitmap(ram_state, block);
3374         }
3375     }
3376
3377     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3378     WITH_RCU_READ_LOCK_GUARD() {
3379         block = QLIST_FIRST_RCU(&ram_list.blocks);
3380
3381         while (block) {
3382             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3383
3384             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3385                 >= block->used_length) {
3386                 offset = 0;
3387                 block = QLIST_NEXT_RCU(block, next);
3388             } else {
3389                 migration_bitmap_clear_dirty(ram_state, block, offset);
3390                 dst_host = block->host
3391                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3392                 src_host = block->colo_cache
3393                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3394                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3395             }
3396         }
3397     }
3398     trace_colo_flush_ram_cache_end();
3399 }
3400
3401 /**
3402  * ram_load_precopy: load pages in precopy case
3403  *
3404  * Returns 0 for success or -errno in case of error
3405  *
3406  * Called in precopy mode by ram_load().
3407  * rcu_read_lock is taken prior to this being called.
3408  *
3409  * @f: QEMUFile where to send the data
3410  */
3411 static int ram_load_precopy(QEMUFile *f)
3412 {
3413     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3414     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3415     bool postcopy_advised = postcopy_is_advised();
3416     if (!migrate_use_compression()) {
3417         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3418     }
3419
3420     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3421         ram_addr_t addr, total_ram_bytes;
3422         void *host = NULL, *host_bak = NULL;
3423         uint8_t ch;
3424
3425         /*
3426          * Yield periodically to let main loop run, but an iteration of
3427          * the main loop is expensive, so do it each some iterations
3428          */
3429         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3430             aio_co_schedule(qemu_get_current_aio_context(),
3431                             qemu_coroutine_self());
3432             qemu_coroutine_yield();
3433         }
3434         i++;
3435
3436         addr = qemu_get_be64(f);
3437         flags = addr & ~TARGET_PAGE_MASK;
3438         addr &= TARGET_PAGE_MASK;
3439
3440         if (flags & invalid_flags) {
3441             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3442                 error_report("Received an unexpected compressed page");
3443             }
3444
3445             ret = -EINVAL;
3446             break;
3447         }
3448
3449         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3450                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3451             RAMBlock *block = ram_block_from_stream(f, flags);
3452
3453             host = host_from_ram_block_offset(block, addr);
3454             /*
3455              * After going into COLO stage, we should not load the page
3456              * into SVM's memory directly, we put them into colo_cache firstly.
3457              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3458              * Previously, we copied all these memory in preparing stage of COLO
3459              * while we need to stop VM, which is a time-consuming process.
3460              * Here we optimize it by a trick, back-up every page while in
3461              * migration process while COLO is enabled, though it affects the
3462              * speed of the migration, but it obviously reduce the downtime of
3463              * back-up all SVM'S memory in COLO preparing stage.
3464              */
3465             if (migration_incoming_colo_enabled()) {
3466                 if (migration_incoming_in_colo_state()) {
3467                     /* In COLO stage, put all pages into cache temporarily */
3468                     host = colo_cache_from_block_offset(block, addr, true);
3469                 } else {
3470                    /*
3471                     * In migration stage but before COLO stage,
3472                     * Put all pages into both cache and SVM's memory.
3473                     */
3474                     host_bak = colo_cache_from_block_offset(block, addr, false);
3475                 }
3476             }
3477             if (!host) {
3478                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3479                 ret = -EINVAL;
3480                 break;
3481             }
3482             if (!migration_incoming_in_colo_state()) {
3483                 ramblock_recv_bitmap_set(block, host);
3484             }
3485
3486             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3487         }
3488
3489         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3490         case RAM_SAVE_FLAG_MEM_SIZE:
3491             /* Synchronize RAM block list */
3492             total_ram_bytes = addr;
3493             while (!ret && total_ram_bytes) {
3494                 RAMBlock *block;
3495                 char id[256];
3496                 ram_addr_t length;
3497
3498                 len = qemu_get_byte(f);
3499                 qemu_get_buffer(f, (uint8_t *)id, len);
3500                 id[len] = 0;
3501                 length = qemu_get_be64(f);
3502
3503                 block = qemu_ram_block_by_name(id);
3504                 if (block && !qemu_ram_is_migratable(block)) {
3505                     error_report("block %s should not be migrated !", id);
3506                     ret = -EINVAL;
3507                 } else if (block) {
3508                     if (length != block->used_length) {
3509                         Error *local_err = NULL;
3510
3511                         ret = qemu_ram_resize(block, length,
3512                                               &local_err);
3513                         if (local_err) {
3514                             error_report_err(local_err);
3515                         }
3516                     }
3517                     /* For postcopy we need to check hugepage sizes match */
3518                     if (postcopy_advised &&
3519                         block->page_size != qemu_host_page_size) {
3520                         uint64_t remote_page_size = qemu_get_be64(f);
3521                         if (remote_page_size != block->page_size) {
3522                             error_report("Mismatched RAM page size %s "
3523                                          "(local) %zd != %" PRId64,
3524                                          id, block->page_size,
3525                                          remote_page_size);
3526                             ret = -EINVAL;
3527                         }
3528                     }
3529                     if (migrate_ignore_shared()) {
3530                         hwaddr addr = qemu_get_be64(f);
3531                         if (ramblock_is_ignored(block) &&
3532                             block->mr->addr != addr) {
3533                             error_report("Mismatched GPAs for block %s "
3534                                          "%" PRId64 "!= %" PRId64,
3535                                          id, (uint64_t)addr,
3536                                          (uint64_t)block->mr->addr);
3537                             ret = -EINVAL;
3538                         }
3539                     }
3540                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3541                                           block->idstr);
3542                 } else {
3543                     error_report("Unknown ramblock \"%s\", cannot "
3544                                  "accept migration", id);
3545                     ret = -EINVAL;
3546                 }
3547
3548                 total_ram_bytes -= length;
3549             }
3550             break;
3551
3552         case RAM_SAVE_FLAG_ZERO:
3553             ch = qemu_get_byte(f);
3554             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3555             break;
3556
3557         case RAM_SAVE_FLAG_PAGE:
3558             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3559             break;
3560
3561         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3562             len = qemu_get_be32(f);
3563             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3564                 error_report("Invalid compressed data length: %d", len);
3565                 ret = -EINVAL;
3566                 break;
3567             }
3568             decompress_data_with_multi_threads(f, host, len);
3569             break;
3570
3571         case RAM_SAVE_FLAG_XBZRLE:
3572             if (load_xbzrle(f, addr, host) < 0) {
3573                 error_report("Failed to decompress XBZRLE page at "
3574                              RAM_ADDR_FMT, addr);
3575                 ret = -EINVAL;
3576                 break;
3577             }
3578             break;
3579         case RAM_SAVE_FLAG_EOS:
3580             /* normal exit */
3581             multifd_recv_sync_main();
3582             break;
3583         default:
3584             if (flags & RAM_SAVE_FLAG_HOOK) {
3585                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3586             } else {
3587                 error_report("Unknown combination of migration flags: %#x",
3588                              flags);
3589                 ret = -EINVAL;
3590             }
3591         }
3592         if (!ret) {
3593             ret = qemu_file_get_error(f);
3594         }
3595         if (!ret && host_bak) {
3596             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3597         }
3598     }
3599
3600     ret |= wait_for_decompress_done();
3601     return ret;
3602 }
3603
3604 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3605 {
3606     int ret = 0;
3607     static uint64_t seq_iter;
3608     /*
3609      * If system is running in postcopy mode, page inserts to host memory must
3610      * be atomic
3611      */
3612     bool postcopy_running = postcopy_is_running();
3613
3614     seq_iter++;
3615
3616     if (version_id != 4) {
3617         return -EINVAL;
3618     }
3619
3620     /*
3621      * This RCU critical section can be very long running.
3622      * When RCU reclaims in the code start to become numerous,
3623      * it will be necessary to reduce the granularity of this
3624      * critical section.
3625      */
3626     WITH_RCU_READ_LOCK_GUARD() {
3627         if (postcopy_running) {
3628             ret = ram_load_postcopy(f);
3629         } else {
3630             ret = ram_load_precopy(f);
3631         }
3632     }
3633     trace_ram_load_complete(ret, seq_iter);
3634
3635     return ret;
3636 }
3637
3638 static bool ram_has_postcopy(void *opaque)
3639 {
3640     RAMBlock *rb;
3641     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3642         if (ramblock_is_pmem(rb)) {
3643             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3644                          "is not supported now!", rb->idstr, rb->host);
3645             return false;
3646         }
3647     }
3648
3649     return migrate_postcopy_ram();
3650 }
3651
3652 /* Sync all the dirty bitmap with destination VM.  */
3653 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3654 {
3655     RAMBlock *block;
3656     QEMUFile *file = s->to_dst_file;
3657     int ramblock_count = 0;
3658
3659     trace_ram_dirty_bitmap_sync_start();
3660
3661     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3662         qemu_savevm_send_recv_bitmap(file, block->idstr);
3663         trace_ram_dirty_bitmap_request(block->idstr);
3664         ramblock_count++;
3665     }
3666
3667     trace_ram_dirty_bitmap_sync_wait();
3668
3669     /* Wait until all the ramblocks' dirty bitmap synced */
3670     while (ramblock_count--) {
3671         qemu_sem_wait(&s->rp_state.rp_sem);
3672     }
3673
3674     trace_ram_dirty_bitmap_sync_complete();
3675
3676     return 0;
3677 }
3678
3679 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3680 {
3681     qemu_sem_post(&s->rp_state.rp_sem);
3682 }
3683
3684 /*
3685  * Read the received bitmap, revert it as the initial dirty bitmap.
3686  * This is only used when the postcopy migration is paused but wants
3687  * to resume from a middle point.
3688  */
3689 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3690 {
3691     int ret = -EINVAL;
3692     QEMUFile *file = s->rp_state.from_dst_file;
3693     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3694     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3695     uint64_t size, end_mark;
3696
3697     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3698
3699     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3700         error_report("%s: incorrect state %s", __func__,
3701                      MigrationStatus_str(s->state));
3702         return -EINVAL;
3703     }
3704
3705     /*
3706      * Note: see comments in ramblock_recv_bitmap_send() on why we
3707      * need the endianess convertion, and the paddings.
3708      */
3709     local_size = ROUND_UP(local_size, 8);
3710
3711     /* Add paddings */
3712     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3713
3714     size = qemu_get_be64(file);
3715
3716     /* The size of the bitmap should match with our ramblock */
3717     if (size != local_size) {
3718         error_report("%s: ramblock '%s' bitmap size mismatch "
3719                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3720                      block->idstr, size, local_size);
3721         ret = -EINVAL;
3722         goto out;
3723     }
3724
3725     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3726     end_mark = qemu_get_be64(file);
3727
3728     ret = qemu_file_get_error(file);
3729     if (ret || size != local_size) {
3730         error_report("%s: read bitmap failed for ramblock '%s': %d"
3731                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3732                      __func__, block->idstr, ret, local_size, size);
3733         ret = -EIO;
3734         goto out;
3735     }
3736
3737     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3738         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3739                      __func__, block->idstr, end_mark);
3740         ret = -EINVAL;
3741         goto out;
3742     }
3743
3744     /*
3745      * Endianess convertion. We are during postcopy (though paused).
3746      * The dirty bitmap won't change. We can directly modify it.
3747      */
3748     bitmap_from_le(block->bmap, le_bitmap, nbits);
3749
3750     /*
3751      * What we received is "received bitmap". Revert it as the initial
3752      * dirty bitmap for this ramblock.
3753      */
3754     bitmap_complement(block->bmap, block->bmap, nbits);
3755
3756     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3757
3758     /*
3759      * We succeeded to sync bitmap for current ramblock. If this is
3760      * the last one to sync, we need to notify the main send thread.
3761      */
3762     ram_dirty_bitmap_reload_notify(s);
3763
3764     ret = 0;
3765 out:
3766     g_free(le_bitmap);
3767     return ret;
3768 }
3769
3770 static int ram_resume_prepare(MigrationState *s, void *opaque)
3771 {
3772     RAMState *rs = *(RAMState **)opaque;
3773     int ret;
3774
3775     ret = ram_dirty_bitmap_sync_all(s, rs);
3776     if (ret) {
3777         return ret;
3778     }
3779
3780     ram_state_resume_prepare(rs, s->to_dst_file);
3781
3782     return 0;
3783 }
3784
3785 static SaveVMHandlers savevm_ram_handlers = {
3786     .save_setup = ram_save_setup,
3787     .save_live_iterate = ram_save_iterate,
3788     .save_live_complete_postcopy = ram_save_complete,
3789     .save_live_complete_precopy = ram_save_complete,
3790     .has_postcopy = ram_has_postcopy,
3791     .save_live_pending = ram_save_pending,
3792     .load_state = ram_load,
3793     .save_cleanup = ram_save_cleanup,
3794     .load_setup = ram_load_setup,
3795     .load_cleanup = ram_load_cleanup,
3796     .resume_prepare = ram_resume_prepare,
3797 };
3798
3799 void ram_mig_init(void)
3800 {
3801     qemu_mutex_init(&XBZRLE.lock);
3802     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3803 }