/* qemu.git: migration/ram.c */
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "migration/block.h"

/***********************************************************/
/* ram save/restore */
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
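
/*
 * Illustrative note (editor's sketch, not from the original source):
 * these flags travel in the low bits of the 64-bit page offset written
 * by save_page_header() below.  Page offsets are TARGET_PAGE_SIZE
 * aligned, so the low bits are always free.  For example, sending page
 * offset 0x2000 of the current block as a normal page puts a single
 * be64 on the wire:
 *
 *     qemu_put_be64(f, 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 *     // i.e. the value 0x2028
 */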
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out:
    XBZRLE_cache_unlock();
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static void compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
/* Multiple fd's */

struct MultiFDSendParams {
    uint8_t id;
    char *name;
    QemuThread thread;
    QemuSemaphore sem;
    QemuMutex mutex;
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;

struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;

static void terminate_multifd_send_threads(Error *errp)
{
    int i;

    for (i = 0; i < multifd_send_state->count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_save_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    terminate_multifd_send_threads(NULL);
    for (i = 0; i < multifd_send_state->count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_thread_join(&p->thread);
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
    return ret;
}

static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

    return NULL;
}

int multifd_save_setup(void)
{
    int thread_count;
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
    multifd_send_state->count = 0;
    for (i = 0; i < thread_count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        p->quit = false;
        p->id = i;
        p->name = g_strdup_printf("multifdsend_%d", i);
        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                           QEMU_THREAD_JOINABLE);

        multifd_send_state->count++;
    }
    return 0;
}
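
/*
 * Observation (editor's note, not from the original source): at this
 * stage of the multifd work the send threads above, and the receive
 * threads below, are skeletons.  Each thread merely blocks on its
 * semaphore and exits once ->quit is set by the terminate helpers; no
 * page data flows over the extra channels yet.
 */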
struct MultiFDRecvParams {
    uint8_t id;
    char *name;
    QemuThread thread;
    QemuSemaphore sem;
    QemuMutex mutex;
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;

struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;

static void terminate_multifd_recv_threads(Error *errp)
{
    int i;

    for (i = 0; i < multifd_recv_state->count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_load_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    terminate_multifd_recv_threads(NULL);
    for (i = 0; i < multifd_recv_state->count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_thread_join(&p->thread);
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state);
    multifd_recv_state = NULL;

    return ret;
}

static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

    return NULL;
}

int multifd_load_setup(void)
{
    int thread_count;
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
    multifd_recv_state->count = 0;
    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        p->quit = false;
        p->id = i;
        p->name = g_strdup_printf("multifdrecv_%d", i);
        qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                           QEMU_THREAD_JOINABLE);
        multifd_recv_state->count++;
    }
    return 0;
}
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
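
/*
 * Worked example (editor's sketch, not from the original source): the
 * first page sent from a block named "pc.ram" costs 8 (be64
 * offset+flags) + 1 (idstr length) + 6 (idstr) = 15 header bytes;
 * every following page from the same block carries
 * RAM_SAVE_FLAG_CONTINUE and costs just the 8-byte be64.
 */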
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}
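
/*
 * Example (editor's note; the parameter defaults of 20% initial and 10%
 * increment are an assumption based on the migration defaults of this
 * era, not something stated in this file): the first call throttles the
 * guest to 20%, and each later call while migration still fails to
 * converge raises it by 10 points - 20, 30, 40, ... up to the maximum
 * the throttle code allows.
 */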
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
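
/*
 * Wire layout sketch (editor's note, not from the original source): an
 * XBZRLE page is the usual page header with RAM_SAVE_FLAG_XBZRLE set,
 * then one byte of ENCODING_FLAG_XBZRLE, a be16 encoded length, and the
 * encoded bytes - which is why the accounting above adds
 * encoded_len + 1 + 2 on top of the header size.
 */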
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the byte offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}
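
/*
 * Note (editor's observation, not from the original source): during the
 * bulk stage every page is still marked dirty, so "start + 1" skips the
 * bitmap scan entirely; the search only falls back to find_next_bit()
 * once the first full pass over RAM has completed.
 */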
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH(block) {
        summary |= block->page_size;
    }

    return summary;
}
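
/*
 * Example (editor's sketch, not from the original source): a guest with
 * normal 4 KiB pages plus a 2 MiB hugepage-backed block yields
 * 0x1000 | 0x200000 = 0x201000, so a non-power-of-two summary signals
 * mixed page sizes.
 */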
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                trace_migration_throttle();
                rs->dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        ram_counters.duplicate++;
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(rs->f, 0);
        ram_counters.transferred += 1;
        pages = 1;
    }

    return pages;
}
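
/*
 * Cost sketch (editor's note, not from the original source): a zero page
 * from an already announced block is just the 8-byte header plus one
 * zero byte, i.e. about 9 bytes instead of TARGET_PAGE_SIZE (typically
 * 4 KiB) - which is why zero detection runs before any other send path.
 */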
static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    /* When in doubt, send the page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        pages = save_zero_page(rs, block, offset);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(block->idstr, offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy() && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(rs, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
        }
        ram_counters.transferred += TARGET_PAGE_SIZE;
        pages = 1;
        ram_counters.normal++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
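
/*
 * Decision summary (editor's note, not from the original source): the
 * send paths above are tried in order - a transport-specific hook
 * (ram_control_save_page, e.g. for RDMA), then zero-page detection,
 * then XBZRLE delta encoding, and only if all of those decline is the
 * full TARGET_PAGE_SIZE buffer written.
 */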
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    RAMState *rs = ram_state;
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(rs, f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            ram_counters.transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                ram_counters.normal++;
                ram_counters.transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
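
/*
 * Flow note (editor's observation, not from the original source): each
 * worker compresses into its own buffered QEMUFile.  The migration
 * thread claims an idle worker, first draining whatever that worker
 * produced last time into rs->f via qemu_put_qemu_file(), then hands it
 * the next (block, offset) pair, and blocks on comp_done_cond only when
 * every worker is busy.
 */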
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;

    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(rs, rs->f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    ram_counters.transferred += bytes_xmit + blen;
                    ram_counters.normal++;
                    pages = 1;
                } else {
                    qemu_file_set_error(rs->f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(block->idstr, offset, pages);
            }
        } else {
            pages = save_zero_page(rs, block, offset);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, block, offset);
            } else {
                ram_release_pages(block->idstr, offset, pages);
            }
        }
    }

    return pages;
}
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found, false otherwise
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(rs);
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
            QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found, false otherwise
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                       page, test_bit(page, block->unsentmap));
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;
    }

    return !!block;
}
/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any pages are left, we drop them.
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}
/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&rs->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    int res = 0;

    /* Check whether the page is dirty and, if it is, send it */
    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
        /*
         * If xbzrle is on, stop using the data compression after first
         * round of migration even if compression is enabled. In theory,
         * xbzrle can do better than compression.
         */
        if (migrate_use_compression() &&
            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
            res = ram_save_compressed_page(rs, pss, last_stage);
        } else {
            res = ram_save_page(rs, pss, last_stage);
        }

        if (res < 0) {
            return res;
        }
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
        }
    }

    return res;
}
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    do {
        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
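
/*
 * Worked example (editor's sketch, not from the original source): for a
 * block backed by 2 MiB huge pages with 4 KiB target pages,
 * pagesize_bits is 512, so the loop keeps sending target pages until
 * pss->page reaches a 512-page boundary - the whole host page goes out
 * as a unit, which postcopy relies on.
 */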
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        total += block->used_length;
    }
    rcu_read_unlock();
    return total;
}

static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

static void ram_state_cleanup(RAMState **rsp)
{
    migration_page_queue_free(*rsp);
    qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
    qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
    g_free(*rsp);
    *rsp = NULL;
}

static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* The caller holds the iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap
     */
    memory_global_dirty_log_stop();

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        g_free(block->bmap);
        block->bmap = NULL;
        g_free(block->unsentmap);
        block->unsentmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *unsentmap = block->unsentmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH(block) {
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, block);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
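
/*
 * Worked example (editor's sketch, not from the original source): with
 * 2 MiB host pages and 4 KiB target pages, host_ratio is 512.  A dirty
 * run starting at target page 1000 sits mid host page
 * (1000 % 512 == 488), so the fixup rewinds run_start to page 512 and
 * the whole host page 512..1023 is discarded, marked unsent, and
 * re-dirtied as one unit.
 */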
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
        unsigned long *bitmap = block->bmap;
        unsigned long *unsentmap = block->unsentmap;

        if (!unsentmap) {
            /* We don't have a safe way to resize the sentmap, so
             * if the bitmap was resized it will be NULL at this
             * point.
             */
            error_report("migration ram resized during precopy phase");
            rcu_read_unlock();
            return -EINVAL;
        }
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            rcu_read_unlock();
            return ret;
        }

        /*
         * Update the unsentmap to be unsentmap = unsentmap | dirty
         */
        bitmap_or(unsentmap, unsentmap, bitmap, pages);
#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(unsentmap, true, pages);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}
/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address within the RAMBlock
 * @length: length (in bytes) to discard
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(rbname, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        goto err;
    }

    bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                 length >> qemu_target_page_bits());
    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}
/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Failed to initialize RAMState", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    ram_state_reset(*rsp);

    return 0;
}
static void ram_list_init_bitmaps(void)
{
    RAMBlock *block;
    unsigned long pages;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            if (migrate_postcopy_ram()) {
                block->unsentmap = bitmap_new(pages);
                bitmap_set(block->unsentmap, 0, pages);
            }
        }
    }
}
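/*
 * Note that both bitmaps start fully set: every page is initially
 * considered dirty (and, for postcopy, unsent), which guarantees that
 * each page is transmitted at least once during the first pass over RAM.
 */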
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* In COLO state, migration has already set up the bitmap; reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            return -1;
        }
    }
    (*rsp)->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();
    compress_threads_save_setup();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
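/*
 * Sketch of the setup section emitted above (derived from the calls, not a
 * separate spec): a be64 carrying the total RAM size with
 * RAM_SAVE_FLAG_MEM_SIZE set in its low bits; then, for each RAMBlock, a
 * one-byte idstr length, the idstr bytes, a be64 used_length and, for
 * postcopy with a non-host page size, a be64 page_size.  The section ends
 * with a be64 RAM_SAVE_FLAG_EOS.
 */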
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        rs->iterations++;

        /* We want to check in the 1st loop, just in case it was the 1st
         * time and we had to sync the dirty bitmap.
         * qemu_clock_get_ns() is a bit expensive, so we only check it
         * every few iterations.
         */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(rs);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success
 *
 * Called with the iothread lock held
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *postcopiable_pending += remaining_size;
    } else {
        *non_postcopiable_pending += remaining_size;
    }
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
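/*
 * Wire format consumed by load_xbzrle() above (as implied by the reads,
 * not a separate spec): one byte of flags that must equal
 * ENCODING_FLAG_XBZRLE, a be16 encoded length capped at TARGET_PAGE_SIZE,
 * then the encoded bytes, which xbzrle_decode_buffer() applies as a delta
 * against the destination's previous copy of the page.
 */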
/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of the
 *         previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}
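/*
 * The RAM_SAVE_FLAG_CONTINUE handling above is a bandwidth optimisation:
 * the sender only emits the length-prefixed block idstr when the block
 * changes, and sets the CONTINUE bit on subsequent pages so the receiver
 * can reuse the cached RAMBlock pointer instead of re-reading the name.
 */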
static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from. We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
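/*
 * The read-before-write check above is deliberate: if the destination
 * page is already zero, skipping the memset() avoids dirtying it, so
 * anonymous memory that was never touched stays unallocated on the
 * receiving host.
 */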
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() may fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is
             * not a problem: the dirty page will be retransferred, and
             * uncompress() won't corrupt the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
static void compress_threads_load_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
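/*
 * Hand-off protocol between the loader and the decompression threads, as
 * implemented above: 'done' (under decomp_done_lock) marks a thread idle,
 * while each thread's own mutex/cond pair guards its work item ('des',
 * 'len', 'compbuf').  The loader claims an idle slot, fills in the work
 * item, signals the thread, and blocks on decomp_done_cond only when
 * every thread is busy.
 */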
/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    xbzrle_load_setup();
    compress_threads_load_setup();
    ramblock_recv_map_init();
    return 0;
}
static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }
    return 0;
}
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    unsigned long ram_pages = last_ram_page();

    return postcopy_ram_incoming_init(mis, ram_pages);
}
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                 host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                            (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
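/*
 * Worked example for the assembly loop above (illustrative numbers): with
 * 4KiB target pages and a 2MiB hugetlbfs-backed block, 512 consecutive
 * target pages are accumulated in postcopy_host_page; only when the last
 * one arrives is the whole 2MiB page placed, which on Linux happens
 * atomically through the userfaultfd mechanism so the guest never sees a
 * partially filled host page.
 */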
static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();

    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();

    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}
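/*
 * Sketch of the stream format consumed by ram_load() below (as implied by
 * the reads, not a separate spec): each chunk starts with a be64 whose
 * bits below TARGET_PAGE_MASK carry the RAM_SAVE_FLAG_* flags and whose
 * upper bits carry the page address.  Depending on the flags, the payload
 * is the RAMBlock list (MEM_SIZE), a fill byte (ZERO), a raw page (PAGE),
 * a be32 length plus compressed data (COMPRESS_PAGE), or an XBZRLE-encoded
 * delta; EOS terminates the section.
 */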
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0, invalid_flags = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }
    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            ramblock_recv_bitmap_set(block, host);
            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}
static bool ram_has_postcopy(void *opaque)
{
    return migrate_postcopy_ram();
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
};
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}