migration: new message MIG_RP_MSG_RECV_BITMAP
[qemu/ar7.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "migration/block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
58 /***********************************************************/
59 /* ram save/restore */
61 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
62 * worked for pages that were filled with the same char. We switched
63 * it to only search for the zero value. And to avoid confusion with
64 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
67 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
68 #define RAM_SAVE_FLAG_ZERO 0x02
69 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
70 #define RAM_SAVE_FLAG_PAGE 0x08
71 #define RAM_SAVE_FLAG_EOS 0x10
72 #define RAM_SAVE_FLAG_CONTINUE 0x20
73 #define RAM_SAVE_FLAG_XBZRLE 0x40
74 /* 0x80 is reserved in migration.h; start with 0x100 next */
75 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 return buffer_is_zero(p, size);
82 XBZRLECacheStats xbzrle_counters;
84 /* struct contains XBZRLE cache and a static page
85 used by the compression */
86 static struct {
87 /* buffer used for XBZRLE encoding */
88 uint8_t *encoded_buf;
89 /* buffer for storing page content */
90 uint8_t *current_buf;
91 /* Cache for XBZRLE, Protected by lock. */
92 PageCache *cache;
93 QemuMutex lock;
94 /* it will store a page full of zeros */
95 uint8_t *zero_target_page;
96 /* buffer used for XBZRLE decoding */
97 uint8_t *decoded_buf;
98 } XBZRLE;
100 static void XBZRLE_cache_lock(void)
102 if (migrate_use_xbzrle())
103 qemu_mutex_lock(&XBZRLE.lock);
106 static void XBZRLE_cache_unlock(void)
108 if (migrate_use_xbzrle())
109 qemu_mutex_unlock(&XBZRLE.lock);
113 * xbzrle_cache_resize: resize the xbzrle cache
115 * This function is called from qmp_migrate_set_cache_size in the main
116 * thread, possibly while a migration is in progress. A running
117 * migration may be using the cache and might finish during this call,
118 * hence changes to the cache are protected by XBZRLE.lock.
120 * Returns 0 for success or -1 for error
122 * @new_size: new cache size
123 * @errp: set *errp if the check failed, with reason
125 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 PageCache *new_cache;
128 int64_t ret = 0;
130 /* Check for truncation */
131 if (new_size != (size_t)new_size) {
132 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
133 "exceeding address space");
134 return -1;
137 if (new_size == migrate_xbzrle_cache_size()) {
138 /* nothing to do */
139 return 0;
142 XBZRLE_cache_lock();
144 if (XBZRLE.cache != NULL) {
145 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
146 if (!new_cache) {
147 ret = -1;
148 goto out;
151 cache_fini(XBZRLE.cache);
152 XBZRLE.cache = new_cache;
154 out:
155 XBZRLE_cache_unlock();
156 return ret;
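/*
 * Illustrative sketch (not part of the original file): roughly how a QMP
 * handler such as qmp_migrate_set_cache_size might drive the resize:
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 *
 * cache_init() performs its own validation of the requested size and sets
 * the error accordingly, so the caller only needs to check the -1 return.
 */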
159 static void ramblock_recv_map_init(void)
161 RAMBlock *rb;
163 RAMBLOCK_FOREACH(rb) {
164 assert(!rb->receivedmap);
165 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
169 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
171 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
172 rb->receivedmap);
175 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
177 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
180 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
182 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
185 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
186 size_t nr)
188 bitmap_set_atomic(rb->receivedmap,
189 ramblock_recv_bitmap_offset(host_addr, rb),
190 nr);
193 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
196 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
198 * Returns the number of bytes sent (>0) on success, or <0 on error.
200 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
201 const char *block_name)
203 RAMBlock *block = qemu_ram_block_by_name(block_name);
204 unsigned long *le_bitmap, nbits;
205 uint64_t size;
207 if (!block) {
208 error_report("%s: invalid block name: %s", __func__, block_name);
209 return -1;
212 nbits = block->used_length >> TARGET_PAGE_BITS;
215 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
216 * machines we may need 4 more bytes for padding (see below
217 * comment). So extend it a bit beforehand.
219 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
222 * Always use little endian when sending the bitmap. This is
223 * required when source and destination VMs are not using the
224 * same endianness. (Note: big endian won't work.)
226 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
228 /* Size of the bitmap, in bytes */
229 size = nbits / 8;
232 * size is always aligned to 8 bytes for 64bit machines, but it
233 * may not be true for 32bit machines. We need this padding to
234 * make sure the migration can survive even between 32bit and
235 * 64bit machines.
237 size = ROUND_UP(size, 8);
239 qemu_put_be64(file, size);
240 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
242 * Mark as an end, in case the middle part is screwed up due to
243 * some "mysterious" reason.
245 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
246 qemu_fflush(file);
248 g_free(le_bitmap);
250 if (qemu_file_get_error(file)) {
251 return qemu_file_get_error(file);
254 return size + sizeof(size);
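/*
 * Wire layout produced above, one message per RAMBlock:
 *
 *     be64 size | size bytes of little-endian bitmap | be64 ending marker
 *
 * A rough sketch (not part of this file) of how the receiving end of this
 * message might parse it, using the qemu-file read counterparts of the
 * calls above:
 *
 *     uint64_t size = qemu_get_be64(file);
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         // stream corrupted, fail the bitmap reload
 *     }
 *     // then convert le_bitmap back to host layout and apply it
 *
 * Note that the returned byte count covers the size field and the bitmap,
 * but not the trailing ending marker.
 */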
258 * An outstanding page request, on the source, having been received
259 * and queued
261 struct RAMSrcPageRequest {
262 RAMBlock *rb;
263 hwaddr offset;
264 hwaddr len;
266 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
269 /* State of RAM for migration */
270 struct RAMState {
271 /* QEMUFile used for this migration */
272 QEMUFile *f;
273 /* Last block that we have visited searching for dirty pages */
274 RAMBlock *last_seen_block;
275 /* Last block from where we have sent data */
276 RAMBlock *last_sent_block;
277 /* Last dirty target page we have sent */
278 ram_addr_t last_page;
279 /* last ram version we have seen */
280 uint32_t last_version;
281 /* We are in the first round */
282 bool ram_bulk_stage;
283 /* How many times we have dirty too many pages */
284 int dirty_rate_high_cnt;
285 /* these variables are used for bitmap sync */
286 /* last time we did a full bitmap_sync */
287 int64_t time_last_bitmap_sync;
288 /* bytes transferred at start_time */
289 uint64_t bytes_xfer_prev;
290 /* number of dirty pages since start_time */
291 uint64_t num_dirty_pages_period;
292 /* xbzrle misses since the beginning of the period */
293 uint64_t xbzrle_cache_miss_prev;
294 /* number of iterations at the beginning of period */
295 uint64_t iterations_prev;
296 /* Iterations since start */
297 uint64_t iterations;
298 /* number of dirty bits in the bitmap */
299 uint64_t migration_dirty_pages;
300 /* protects modification of the bitmap */
301 QemuMutex bitmap_mutex;
302 /* The RAMBlock used in the last src_page_requests */
303 RAMBlock *last_req_rb;
304 /* Queue of outstanding page requests from the destination */
305 QemuMutex src_page_req_mutex;
306 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
308 typedef struct RAMState RAMState;
310 static RAMState *ram_state;
312 uint64_t ram_bytes_remaining(void)
314 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
318 MigrationStats ram_counters;
320 /* used by the search for pages to send */
321 struct PageSearchStatus {
322 /* Current block being searched */
323 RAMBlock *block;
324 /* Current page to search from */
325 unsigned long page;
326 /* Set once we wrap around */
327 bool complete_round;
329 typedef struct PageSearchStatus PageSearchStatus;
331 struct CompressParam {
332 bool done;
333 bool quit;
334 QEMUFile *file;
335 QemuMutex mutex;
336 QemuCond cond;
337 RAMBlock *block;
338 ram_addr_t offset;
340 /* internally used fields */
341 z_stream stream;
342 uint8_t *originbuf;
344 typedef struct CompressParam CompressParam;
346 struct DecompressParam {
347 bool done;
348 bool quit;
349 QemuMutex mutex;
350 QemuCond cond;
351 void *des;
352 uint8_t *compbuf;
353 int len;
354 z_stream stream;
356 typedef struct DecompressParam DecompressParam;
358 static CompressParam *comp_param;
359 static QemuThread *compress_threads;
360 /* comp_done_cond is used to wake up the migration thread when
361 * one of the compression threads has finished the compression.
362 * comp_done_lock is used together with comp_done_cond.
364 static QemuMutex comp_done_lock;
365 static QemuCond comp_done_cond;
366 /* The empty QEMUFileOps will be used by file in CompressParam */
367 static const QEMUFileOps empty_ops = { };
369 static QEMUFile *decomp_file;
370 static DecompressParam *decomp_param;
371 static QemuThread *decompress_threads;
372 static QemuMutex decomp_done_lock;
373 static QemuCond decomp_done_cond;
375 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
376 ram_addr_t offset, uint8_t *source_buf);
378 static void *do_data_compress(void *opaque)
380 CompressParam *param = opaque;
381 RAMBlock *block;
382 ram_addr_t offset;
384 qemu_mutex_lock(&param->mutex);
385 while (!param->quit) {
386 if (param->block) {
387 block = param->block;
388 offset = param->offset;
389 param->block = NULL;
390 qemu_mutex_unlock(&param->mutex);
392 do_compress_ram_page(param->file, &param->stream, block, offset,
393 param->originbuf);
395 qemu_mutex_lock(&comp_done_lock);
396 param->done = true;
397 qemu_cond_signal(&comp_done_cond);
398 qemu_mutex_unlock(&comp_done_lock);
400 qemu_mutex_lock(&param->mutex);
401 } else {
402 qemu_cond_wait(&param->cond, &param->mutex);
405 qemu_mutex_unlock(&param->mutex);
407 return NULL;
410 static inline void terminate_compression_threads(void)
412 int idx, thread_count;
414 thread_count = migrate_compress_threads();
416 for (idx = 0; idx < thread_count; idx++) {
417 qemu_mutex_lock(&comp_param[idx].mutex);
418 comp_param[idx].quit = true;
419 qemu_cond_signal(&comp_param[idx].cond);
420 qemu_mutex_unlock(&comp_param[idx].mutex);
424 static void compress_threads_save_cleanup(void)
426 int i, thread_count;
428 if (!migrate_use_compression()) {
429 return;
431 terminate_compression_threads();
432 thread_count = migrate_compress_threads();
433 for (i = 0; i < thread_count; i++) {
435 * we use it as an indicator of whether the thread has been
436 * properly initialized or not
438 if (!comp_param[i].file) {
439 break;
441 qemu_thread_join(compress_threads + i);
442 qemu_mutex_destroy(&comp_param[i].mutex);
443 qemu_cond_destroy(&comp_param[i].cond);
444 deflateEnd(&comp_param[i].stream);
445 g_free(comp_param[i].originbuf);
446 qemu_fclose(comp_param[i].file);
447 comp_param[i].file = NULL;
449 qemu_mutex_destroy(&comp_done_lock);
450 qemu_cond_destroy(&comp_done_cond);
451 g_free(compress_threads);
452 g_free(comp_param);
453 compress_threads = NULL;
454 comp_param = NULL;
457 static int compress_threads_save_setup(void)
459 int i, thread_count;
461 if (!migrate_use_compression()) {
462 return 0;
464 thread_count = migrate_compress_threads();
465 compress_threads = g_new0(QemuThread, thread_count);
466 comp_param = g_new0(CompressParam, thread_count);
467 qemu_cond_init(&comp_done_cond);
468 qemu_mutex_init(&comp_done_lock);
469 for (i = 0; i < thread_count; i++) {
470 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
471 if (!comp_param[i].originbuf) {
472 goto exit;
475 if (deflateInit(&comp_param[i].stream,
476 migrate_compress_level()) != Z_OK) {
477 g_free(comp_param[i].originbuf);
478 goto exit;
481 /* comp_param[i].file is just used as a dummy buffer to save data,
482 * set its ops to empty.
484 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
485 comp_param[i].done = true;
486 comp_param[i].quit = false;
487 qemu_mutex_init(&comp_param[i].mutex);
488 qemu_cond_init(&comp_param[i].cond);
489 qemu_thread_create(compress_threads + i, "compress",
490 do_data_compress, comp_param + i,
491 QEMU_THREAD_JOINABLE);
493 return 0;
495 exit:
496 compress_threads_save_cleanup();
497 return -1;
500 /* Multiple fd's */
502 #define MULTIFD_MAGIC 0x11223344U
503 #define MULTIFD_VERSION 1
505 typedef struct {
506 uint32_t magic;
507 uint32_t version;
508 unsigned char uuid[16]; /* QemuUUID */
509 uint8_t id;
510 } __attribute__((packed)) MultiFDInit_t;
512 typedef struct {
513 /* these fields are not changed once the thread is created */
514 /* channel number */
515 uint8_t id;
516 /* channel thread name */
517 char *name;
518 /* channel thread id */
519 QemuThread thread;
520 /* communication channel */
521 QIOChannel *c;
522 /* sem where to wait for more work */
523 QemuSemaphore sem;
524 /* this mutex protects the following parameters */
525 QemuMutex mutex;
526 /* is this channel thread running */
527 bool running;
528 /* should this thread finish */
529 bool quit;
530 } MultiFDSendParams;
532 typedef struct {
533 /* these fields are not changed once the thread is created */
534 /* channel number */
535 uint8_t id;
536 /* channel thread name */
537 char *name;
538 /* channel thread id */
539 QemuThread thread;
540 /* communication channel */
541 QIOChannel *c;
542 /* sem where to wait for more work */
543 QemuSemaphore sem;
544 /* this mutex protects the following parameters */
545 QemuMutex mutex;
546 /* is this channel thread running */
547 bool running;
548 /* should this thread finish */
549 bool quit;
550 } MultiFDRecvParams;
552 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
554 MultiFDInit_t msg;
555 int ret;
557 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
558 msg.version = cpu_to_be32(MULTIFD_VERSION);
559 msg.id = p->id;
560 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
562 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
563 if (ret != 0) {
564 return -1;
566 return 0;
569 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
571 MultiFDInit_t msg;
572 int ret;
574 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
575 if (ret != 0) {
576 return -1;
579 be32_to_cpus(&msg.magic);
580 be32_to_cpus(&msg.version);
582 if (msg.magic != MULTIFD_MAGIC) {
583 error_setg(errp, "multifd: received packet magic %x "
584 "expected %x", msg.magic, MULTIFD_MAGIC);
585 return -1;
588 if (msg.version != MULTIFD_VERSION) {
589 error_setg(errp, "multifd: received packet version %d "
590 "expected %d", msg.version, MULTIFD_VERSION);
591 return -1;
594 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
595 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
596 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
598 error_setg(errp, "multifd: received uuid '%s' and expected "
599 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
600 g_free(uuid);
601 g_free(msg_uuid);
602 return -1;
605 if (msg.id > migrate_multifd_channels()) {
606 error_setg(errp, "multifd: received channel version %d "
607 "expected %d", msg.version, MULTIFD_VERSION);
608 return -1;
611 return msg.id;
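/*
 * For reference, the initial handshake packet exchanged above is the
 * packed MultiFDInit_t, i.e. 25 bytes on the wire:
 *
 *     offset 0:  magic    (be32, MULTIFD_MAGIC 0x11223344)
 *     offset 4:  version  (be32, MULTIFD_VERSION 1)
 *     offset 8:  uuid     (16 raw bytes of QemuUUID)
 *     offset 24: id       (uint8_t channel number)
 *
 * The UUID check rejects channels that belong to a different QEMU
 * instance, and the id is used to index multifd_recv_state->params[].
 */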
614 struct {
615 MultiFDSendParams *params;
616 /* number of created threads */
617 int count;
618 } *multifd_send_state;
620 static void multifd_send_terminate_threads(Error *err)
622 int i;
624 if (err) {
625 MigrationState *s = migrate_get_current();
626 migrate_set_error(s, err);
627 if (s->state == MIGRATION_STATUS_SETUP ||
628 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
629 s->state == MIGRATION_STATUS_DEVICE ||
630 s->state == MIGRATION_STATUS_ACTIVE) {
631 migrate_set_state(&s->state, s->state,
632 MIGRATION_STATUS_FAILED);
636 for (i = 0; i < migrate_multifd_channels(); i++) {
637 MultiFDSendParams *p = &multifd_send_state->params[i];
639 qemu_mutex_lock(&p->mutex);
640 p->quit = true;
641 qemu_sem_post(&p->sem);
642 qemu_mutex_unlock(&p->mutex);
646 int multifd_save_cleanup(Error **errp)
648 int i;
649 int ret = 0;
651 if (!migrate_use_multifd()) {
652 return 0;
654 multifd_send_terminate_threads(NULL);
655 for (i = 0; i < migrate_multifd_channels(); i++) {
656 MultiFDSendParams *p = &multifd_send_state->params[i];
658 if (p->running) {
659 qemu_thread_join(&p->thread);
661 socket_send_channel_destroy(p->c);
662 p->c = NULL;
663 qemu_mutex_destroy(&p->mutex);
664 qemu_sem_destroy(&p->sem);
665 g_free(p->name);
666 p->name = NULL;
668 g_free(multifd_send_state->params);
669 multifd_send_state->params = NULL;
670 g_free(multifd_send_state);
671 multifd_send_state = NULL;
672 return ret;
675 static void *multifd_send_thread(void *opaque)
677 MultiFDSendParams *p = opaque;
678 Error *local_err = NULL;
680 if (multifd_send_initial_packet(p, &local_err) < 0) {
681 goto out;
684 while (true) {
685 qemu_mutex_lock(&p->mutex);
686 if (p->quit) {
687 qemu_mutex_unlock(&p->mutex);
688 break;
690 qemu_mutex_unlock(&p->mutex);
691 qemu_sem_wait(&p->sem);
694 out:
695 if (local_err) {
696 multifd_send_terminate_threads(local_err);
699 qemu_mutex_lock(&p->mutex);
700 p->running = false;
701 qemu_mutex_unlock(&p->mutex);
703 return NULL;
706 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
708 MultiFDSendParams *p = opaque;
709 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
710 Error *local_err = NULL;
712 if (qio_task_propagate_error(task, &local_err)) {
713 if (multifd_save_cleanup(&local_err) != 0) {
714 migrate_set_error(migrate_get_current(), local_err);
716 } else {
717 p->c = QIO_CHANNEL(sioc);
718 qio_channel_set_delay(p->c, false);
719 p->running = true;
720 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
721 QEMU_THREAD_JOINABLE);
723 atomic_inc(&multifd_send_state->count);
727 int multifd_save_setup(void)
729 int thread_count;
730 uint8_t i;
732 if (!migrate_use_multifd()) {
733 return 0;
735 thread_count = migrate_multifd_channels();
736 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
737 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
738 atomic_set(&multifd_send_state->count, 0);
739 for (i = 0; i < thread_count; i++) {
740 MultiFDSendParams *p = &multifd_send_state->params[i];
742 qemu_mutex_init(&p->mutex);
743 qemu_sem_init(&p->sem, 0);
744 p->quit = false;
745 p->id = i;
746 p->name = g_strdup_printf("multifdsend_%d", i);
747 socket_send_channel_create(multifd_new_send_channel_async, p);
749 return 0;
752 struct {
753 MultiFDRecvParams *params;
754 /* number of created threads */
755 int count;
756 } *multifd_recv_state;
758 static void multifd_recv_terminate_threads(Error *err)
760 int i;
762 if (err) {
763 MigrationState *s = migrate_get_current();
764 migrate_set_error(s, err);
765 if (s->state == MIGRATION_STATUS_SETUP ||
766 s->state == MIGRATION_STATUS_ACTIVE) {
767 migrate_set_state(&s->state, s->state,
768 MIGRATION_STATUS_FAILED);
772 for (i = 0; i < migrate_multifd_channels(); i++) {
773 MultiFDRecvParams *p = &multifd_recv_state->params[i];
775 qemu_mutex_lock(&p->mutex);
776 p->quit = true;
777 qemu_sem_post(&p->sem);
778 qemu_mutex_unlock(&p->mutex);
782 int multifd_load_cleanup(Error **errp)
784 int i;
785 int ret = 0;
787 if (!migrate_use_multifd()) {
788 return 0;
790 multifd_recv_terminate_threads(NULL);
791 for (i = 0; i < migrate_multifd_channels(); i++) {
792 MultiFDRecvParams *p = &multifd_recv_state->params[i];
794 if (p->running) {
795 qemu_thread_join(&p->thread);
797 object_unref(OBJECT(p->c));
798 p->c = NULL;
799 qemu_mutex_destroy(&p->mutex);
800 qemu_sem_destroy(&p->sem);
801 g_free(p->name);
802 p->name = NULL;
804 g_free(multifd_recv_state->params);
805 multifd_recv_state->params = NULL;
806 g_free(multifd_recv_state);
807 multifd_recv_state = NULL;
809 return ret;
812 static void *multifd_recv_thread(void *opaque)
814 MultiFDRecvParams *p = opaque;
816 while (true) {
817 qemu_mutex_lock(&p->mutex);
818 if (p->quit) {
819 qemu_mutex_unlock(&p->mutex);
820 break;
822 qemu_mutex_unlock(&p->mutex);
823 qemu_sem_wait(&p->sem);
826 qemu_mutex_lock(&p->mutex);
827 p->running = false;
828 qemu_mutex_unlock(&p->mutex);
830 return NULL;
833 int multifd_load_setup(void)
835 int thread_count;
836 uint8_t i;
838 if (!migrate_use_multifd()) {
839 return 0;
841 thread_count = migrate_multifd_channels();
842 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
843 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
844 atomic_set(&multifd_recv_state->count, 0);
845 for (i = 0; i < thread_count; i++) {
846 MultiFDRecvParams *p = &multifd_recv_state->params[i];
848 qemu_mutex_init(&p->mutex);
849 qemu_sem_init(&p->sem, 0);
850 p->quit = false;
851 p->id = i;
852 p->name = g_strdup_printf("multifdrecv_%d", i);
854 return 0;
857 bool multifd_recv_all_channels_created(void)
859 int thread_count = migrate_multifd_channels();
861 if (!migrate_use_multifd()) {
862 return true;
865 return thread_count == atomic_read(&multifd_recv_state->count);
868 void multifd_recv_new_channel(QIOChannel *ioc)
870 MultiFDRecvParams *p;
871 Error *local_err = NULL;
872 int id;
874 id = multifd_recv_initial_packet(ioc, &local_err);
875 if (id < 0) {
876 multifd_recv_terminate_threads(local_err);
877 return;
880 p = &multifd_recv_state->params[id];
881 if (p->c != NULL) {
882 error_setg(&local_err, "multifd: received id '%d' already setup'",
883 id);
884 multifd_recv_terminate_threads(local_err);
885 return;
887 p->c = ioc;
888 object_ref(OBJECT(ioc));
890 p->running = true;
891 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
892 QEMU_THREAD_JOINABLE);
893 atomic_inc(&multifd_recv_state->count);
894 if (multifd_recv_state->count == migrate_multifd_channels()) {
895 migration_incoming_process();
900 * save_page_header: write page header to wire
902 * If this is a new block (different from the last sent one), it also writes the block identification
904 * Returns the number of bytes written
906 * @f: QEMUFile where to send the data
907 * @block: block that contains the page we want to send
908 * @offset: offset inside the block for the page;
909 * the lower bits contain flags
911 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
912 ram_addr_t offset)
914 size_t size, len;
916 if (block == rs->last_sent_block) {
917 offset |= RAM_SAVE_FLAG_CONTINUE;
919 qemu_put_be64(f, offset);
920 size = 8;
922 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
923 len = strlen(block->idstr);
924 qemu_put_byte(f, len);
925 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
926 size += 1 + len;
927 rs->last_sent_block = block;
929 return size;
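/*
 * The resulting stream format for a page header is therefore:
 *
 *     be64  offset | flags      (always, 8 bytes)
 *     u8    strlen(idstr)       (only without RAM_SAVE_FLAG_CONTINUE)
 *     bytes idstr               (only without RAM_SAVE_FLAG_CONTINUE)
 *
 * For example (assuming the usual "pc.ram" block name), the first page of
 * a block costs 8 + 1 + 6 = 15 header bytes, while subsequent pages of the
 * same block carry RAM_SAVE_FLAG_CONTINUE and cost only 8 bytes.
 */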
933 * mig_throttle_guest_down: throttle down the guest
935 * Reduce amount of guest cpu execution to hopefully slow down memory
936 * writes. If guest dirty memory rate is reduced below the rate at
937 * which we can transfer pages to the destination then we should be
938 * able to complete migration. Some workloads dirty memory way too
939 * fast and will not effectively converge, even with auto-converge.
941 static void mig_throttle_guest_down(void)
943 MigrationState *s = migrate_get_current();
944 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
945 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
947 /* We have not started throttling yet. Let's start it. */
948 if (!cpu_throttle_active()) {
949 cpu_throttle_set(pct_initial);
950 } else {
951 /* Throttling already on, just increase the rate */
952 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
957 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
959 * @rs: current RAM state
960 * @current_addr: address for the zero page
962 * Update the xbzrle cache to reflect a page that's been sent as all 0.
963 * The important thing is that a stale (not-yet-0'd) page be replaced
964 * by the new data.
965 * As a bonus, if the page wasn't in the cache it gets added so that
966 * when a small write is made into the 0'd page it gets XBZRLE sent.
968 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
970 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
971 return;
974 /* We don't care if this fails to allocate a new cache page
975 * as long as it updated an old one */
976 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
977 ram_counters.dirty_sync_count);
980 #define ENCODING_FLAG_XBZRLE 0x1
983 * save_xbzrle_page: compress and send current page
985 * Returns: 1 means that we wrote the page
986 * 0 means that page is identical to the one already sent
987 * -1 means that xbzrle would be longer than normal
989 * @rs: current RAM state
990 * @current_data: pointer to the address of the page contents
991 * @current_addr: addr of the page
992 * @block: block that contains the page we want to send
993 * @offset: offset inside the block for the page
994 * @last_stage: if we are at the completion stage
996 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
997 ram_addr_t current_addr, RAMBlock *block,
998 ram_addr_t offset, bool last_stage)
1000 int encoded_len = 0, bytes_xbzrle;
1001 uint8_t *prev_cached_page;
1003 if (!cache_is_cached(XBZRLE.cache, current_addr,
1004 ram_counters.dirty_sync_count)) {
1005 xbzrle_counters.cache_miss++;
1006 if (!last_stage) {
1007 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1008 ram_counters.dirty_sync_count) == -1) {
1009 return -1;
1010 } else {
1011 /* update *current_data when the page has been
1012 inserted into cache */
1013 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1016 return -1;
1019 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1021 /* save current buffer into memory */
1022 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1024 /* XBZRLE encoding (if there is no overflow) */
1025 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1026 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1027 TARGET_PAGE_SIZE);
1028 if (encoded_len == 0) {
1029 trace_save_xbzrle_page_skipping();
1030 return 0;
1031 } else if (encoded_len == -1) {
1032 trace_save_xbzrle_page_overflow();
1033 xbzrle_counters.overflow++;
1034 /* update data in the cache */
1035 if (!last_stage) {
1036 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1037 *current_data = prev_cached_page;
1039 return -1;
1042 /* we need to update the data in the cache, in order to get the same data */
1043 if (!last_stage) {
1044 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1047 /* Send XBZRLE based compressed page */
1048 bytes_xbzrle = save_page_header(rs, rs->f, block,
1049 offset | RAM_SAVE_FLAG_XBZRLE);
1050 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1051 qemu_put_be16(rs->f, encoded_len);
1052 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1053 bytes_xbzrle += encoded_len + 1 + 2;
1054 xbzrle_counters.pages++;
1055 xbzrle_counters.bytes += bytes_xbzrle;
1056 ram_counters.transferred += bytes_xbzrle;
1058 return 1;
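/*
 * Byte accounting example for the XBZRLE path above (a sketch, assuming a
 * 4 KiB target page and a block already announced on the wire): a page
 * whose delta encodes to 80 bytes is sent as
 *
 *     8 (page header, CONTINUE) + 1 (ENCODING_FLAG_XBZRLE)
 *       + 2 (be16 encoded_len) + 80 (encoded data) = 91 bytes
 *
 * instead of the 4096 bytes a RAM_SAVE_FLAG_PAGE transfer would need,
 * which is what xbzrle_counters.bytes accumulates.
 */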
1062 * migration_bitmap_find_dirty: find the next dirty page from start
1064 * Called with rcu_read_lock() to protect migration_bitmap
1066 * Returns the byte offset within memory region of the start of a dirty page
1068 * @rs: current RAM state
1069 * @rb: RAMBlock where to search for dirty pages
1070 * @start: page where we start the search
1072 static inline
1073 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1074 unsigned long start)
1076 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1077 unsigned long *bitmap = rb->bmap;
1078 unsigned long next;
1080 if (rs->ram_bulk_stage && start > 0) {
1081 next = start + 1;
1082 } else {
1083 next = find_next_bit(bitmap, size, start);
1086 return next;
1089 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1090 RAMBlock *rb,
1091 unsigned long page)
1093 bool ret;
1095 ret = test_and_clear_bit(page, rb->bmap);
1097 if (ret) {
1098 rs->migration_dirty_pages--;
1100 return ret;
1103 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1104 ram_addr_t start, ram_addr_t length)
1106 rs->migration_dirty_pages +=
1107 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1108 &rs->num_dirty_pages_period);
1112 * ram_pagesize_summary: calculate all the pagesizes of a VM
1114 * Returns a summary bitmap of the page sizes of all RAMBlocks
1116 * For VMs with just normal pages this is equivalent to the host page
1117 * size. If it's got some huge pages then it's the OR of all the
1118 * different page sizes.
1120 uint64_t ram_pagesize_summary(void)
1122 RAMBlock *block;
1123 uint64_t summary = 0;
1125 RAMBLOCK_FOREACH(block) {
1126 summary |= block->page_size;
1129 return summary;
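/*
 * Worked example: a guest whose RAM blocks are backed by normal 4 KiB
 * pages plus one 2 MiB hugetlbfs-backed block yields
 *
 *     summary = 0x1000 | 0x200000 = 0x201000
 *
 * so the destination can tell that both page sizes are in use.
 */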
1132 static void migration_bitmap_sync(RAMState *rs)
1134 RAMBlock *block;
1135 int64_t end_time;
1136 uint64_t bytes_xfer_now;
1138 ram_counters.dirty_sync_count++;
1140 if (!rs->time_last_bitmap_sync) {
1141 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1144 trace_migration_bitmap_sync_start();
1145 memory_global_dirty_log_sync();
1147 qemu_mutex_lock(&rs->bitmap_mutex);
1148 rcu_read_lock();
1149 RAMBLOCK_FOREACH(block) {
1150 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1152 rcu_read_unlock();
1153 qemu_mutex_unlock(&rs->bitmap_mutex);
1155 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1157 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1159 /* more than 1 second = 1000 milliseconds */
1160 if (end_time > rs->time_last_bitmap_sync + 1000) {
1161 /* calculate period counters */
1162 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1163 / (end_time - rs->time_last_bitmap_sync);
1164 bytes_xfer_now = ram_counters.transferred;
1166 /* During block migration the auto-converge logic incorrectly detects
1167 * that ram migration makes no progress. Avoid this by disabling the
1168 * throttling logic during the bulk phase of block migration. */
1169 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1170 /* The following detection logic can be refined later. For now:
1171 Check to see if the bytes dirtied in this period exceed half of the
1172 approx. amount of bytes that just got transferred since the last
1173 time we were in this routine. If that happens twice, start or
1174 increase throttling */
1176 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1177 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1178 (++rs->dirty_rate_high_cnt >= 2)) {
1179 trace_migration_throttle();
1180 rs->dirty_rate_high_cnt = 0;
1181 mig_throttle_guest_down();
1185 if (migrate_use_xbzrle()) {
1186 if (rs->iterations_prev != rs->iterations) {
1187 xbzrle_counters.cache_miss_rate =
1188 (double)(xbzrle_counters.cache_miss -
1189 rs->xbzrle_cache_miss_prev) /
1190 (rs->iterations - rs->iterations_prev);
1192 rs->iterations_prev = rs->iterations;
1193 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1196 /* reset period counters */
1197 rs->time_last_bitmap_sync = end_time;
1198 rs->num_dirty_pages_period = 0;
1199 rs->bytes_xfer_prev = bytes_xfer_now;
1201 if (migrate_use_events()) {
1202 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
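/*
 * Throttling example for the detection logic above (a sketch with made-up
 * numbers): if a ~1 s sync period transferred 60 MB but the guest dirtied
 * 40 MB in the same period, then 40 MB > 60 MB / 2, so dirty_rate_high_cnt
 * is bumped; once that has happened twice, mig_throttle_guest_down() is
 * called, which either starts CPU throttling at cpu_throttle_initial or
 * raises it by cpu_throttle_increment.
 */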
1207 * save_zero_page: send the zero page to the stream
1209 * Returns the number of pages written.
1211 * @rs: current RAM state
1212 * @block: block that contains the page we want to send
1213 * @offset: offset inside the block for the page
1215 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1217 uint8_t *p = block->host + offset;
1218 int pages = -1;
1220 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1221 ram_counters.duplicate++;
1222 ram_counters.transferred +=
1223 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1224 qemu_put_byte(rs->f, 0);
1225 ram_counters.transferred += 1;
1226 pages = 1;
1229 return pages;
1232 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1234 if (!migrate_release_ram() || !migration_in_postcopy()) {
1235 return;
1238 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1242 * @pages: the number of pages written by the control path,
1243 * < 0 - error
1244 * > 0 - number of pages written
1246 * Returns true if the page has been saved, otherwise false.
1248 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1249 int *pages)
1251 uint64_t bytes_xmit = 0;
1252 int ret;
1254 *pages = -1;
1255 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1256 &bytes_xmit);
1257 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1258 return false;
1261 if (bytes_xmit) {
1262 ram_counters.transferred += bytes_xmit;
1263 *pages = 1;
1266 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1267 return true;
1270 if (bytes_xmit > 0) {
1271 ram_counters.normal++;
1272 } else if (bytes_xmit == 0) {
1273 ram_counters.duplicate++;
1276 return true;
1280 * directly send the page to the stream
1282 * Returns the number of pages written.
1284 * @rs: current RAM state
1285 * @block: block that contains the page we want to send
1286 * @offset: offset inside the block for the page
1287 * @buf: the page to be sent
1288 * @async: send the page asynchronously
1290 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1291 uint8_t *buf, bool async)
1293 ram_counters.transferred += save_page_header(rs, rs->f, block,
1294 offset | RAM_SAVE_FLAG_PAGE);
1295 if (async) {
1296 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1297 migrate_release_ram() &&
1298 migration_in_postcopy());
1299 } else {
1300 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1302 ram_counters.transferred += TARGET_PAGE_SIZE;
1303 ram_counters.normal++;
1304 return 1;
1308 * ram_save_page: send the given page to the stream
1310 * Returns the number of pages written.
1311 * < 0 - error
1312 * >=0 - Number of pages written - this might legally be 0
1313 * if xbzrle noticed the page was the same.
1315 * @rs: current RAM state
1316 * @block: block that contains the page we want to send
1317 * @offset: offset inside the block for the page
1318 * @last_stage: if we are at the completion stage
1320 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1322 int pages = -1;
1323 uint8_t *p;
1324 bool send_async = true;
1325 RAMBlock *block = pss->block;
1326 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1327 ram_addr_t current_addr = block->offset + offset;
1329 p = block->host + offset;
1330 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1332 XBZRLE_cache_lock();
1333 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1334 migrate_use_xbzrle()) {
1335 pages = save_xbzrle_page(rs, &p, current_addr, block,
1336 offset, last_stage);
1337 if (!last_stage) {
1338 /* Can't send this cached data async, since the cache page
1339 * might get updated before it gets to the wire
1341 send_async = false;
1345 /* XBZRLE overflow or normal page */
1346 if (pages == -1) {
1347 pages = save_normal_page(rs, block, offset, p, send_async);
1350 XBZRLE_cache_unlock();
1352 return pages;
1355 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1356 ram_addr_t offset, uint8_t *source_buf)
1358 RAMState *rs = ram_state;
1359 int bytes_sent, blen;
1360 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1362 bytes_sent = save_page_header(rs, f, block, offset |
1363 RAM_SAVE_FLAG_COMPRESS_PAGE);
1366 * copy it to an internal buffer to avoid it being modified by the VM
1367 * so that we can catch errors during compression and
1368 * decompression
1370 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1371 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1372 if (blen < 0) {
1373 bytes_sent = 0;
1374 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1375 error_report("compressed data failed!");
1376 } else {
1377 bytes_sent += blen;
1378 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1381 return bytes_sent;
1384 static void flush_compressed_data(RAMState *rs)
1386 int idx, len, thread_count;
1388 if (!migrate_use_compression()) {
1389 return;
1391 thread_count = migrate_compress_threads();
1393 qemu_mutex_lock(&comp_done_lock);
1394 for (idx = 0; idx < thread_count; idx++) {
1395 while (!comp_param[idx].done) {
1396 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1399 qemu_mutex_unlock(&comp_done_lock);
1401 for (idx = 0; idx < thread_count; idx++) {
1402 qemu_mutex_lock(&comp_param[idx].mutex);
1403 if (!comp_param[idx].quit) {
1404 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1405 ram_counters.transferred += len;
1407 qemu_mutex_unlock(&comp_param[idx].mutex);
1411 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1412 ram_addr_t offset)
1414 param->block = block;
1415 param->offset = offset;
1418 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1419 ram_addr_t offset)
1421 int idx, thread_count, bytes_xmit = -1, pages = -1;
1423 thread_count = migrate_compress_threads();
1424 qemu_mutex_lock(&comp_done_lock);
1425 while (true) {
1426 for (idx = 0; idx < thread_count; idx++) {
1427 if (comp_param[idx].done) {
1428 comp_param[idx].done = false;
1429 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1430 qemu_mutex_lock(&comp_param[idx].mutex);
1431 set_compress_params(&comp_param[idx], block, offset);
1432 qemu_cond_signal(&comp_param[idx].cond);
1433 qemu_mutex_unlock(&comp_param[idx].mutex);
1434 pages = 1;
1435 ram_counters.normal++;
1436 ram_counters.transferred += bytes_xmit;
1437 break;
1440 if (pages > 0) {
1441 break;
1442 } else {
1443 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1446 qemu_mutex_unlock(&comp_done_lock);
1448 return pages;
1452 * find_dirty_block: find the next dirty page and update any state
1453 * associated with the search process.
1455 * Returns true if a page is found
1457 * @rs: current RAM state
1458 * @pss: data about the state of the current dirty page scan
1459 * @again: set to false if the search has scanned the whole of RAM
1461 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1463 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1464 if (pss->complete_round && pss->block == rs->last_seen_block &&
1465 pss->page >= rs->last_page) {
1467 * We've been once around the RAM and haven't found anything.
1468 * Give up.
1470 *again = false;
1471 return false;
1473 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1474 /* Didn't find anything in this RAM Block */
1475 pss->page = 0;
1476 pss->block = QLIST_NEXT_RCU(pss->block, next);
1477 if (!pss->block) {
1478 /* Hit the end of the list */
1479 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1480 /* Flag that we've looped */
1481 pss->complete_round = true;
1482 rs->ram_bulk_stage = false;
1483 if (migrate_use_xbzrle()) {
1484 /* If xbzrle is on, stop using the data compression at this
1485 * point. In theory, xbzrle can do better than compression.
1487 flush_compressed_data(rs);
1490 /* Didn't find anything this time, but try again on the new block */
1491 *again = true;
1492 return false;
1493 } else {
1494 /* Can go around again, but... */
1495 *again = true;
1496 /* We've found something so probably don't need to */
1497 return true;
1502 * unqueue_page: gets a page off the queue
1504 * Helper for 'get_queued_page' - gets a page off the queue
1506 * Returns the block of the page (or NULL if none available)
1508 * @rs: current RAM state
1509 * @offset: used to return the offset within the RAMBlock
1511 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1513 RAMBlock *block = NULL;
1515 qemu_mutex_lock(&rs->src_page_req_mutex);
1516 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1517 struct RAMSrcPageRequest *entry =
1518 QSIMPLEQ_FIRST(&rs->src_page_requests);
1519 block = entry->rb;
1520 *offset = entry->offset;
1522 if (entry->len > TARGET_PAGE_SIZE) {
1523 entry->len -= TARGET_PAGE_SIZE;
1524 entry->offset += TARGET_PAGE_SIZE;
1525 } else {
1526 memory_region_unref(block->mr);
1527 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1528 g_free(entry);
1531 qemu_mutex_unlock(&rs->src_page_req_mutex);
1533 return block;
1537 * get_queued_page: unqueue a page from the postcopy requests
1539 * Skips pages that are already sent (!dirty)
1541 * Returns true if a queued page is found
1543 * @rs: current RAM state
1544 * @pss: data about the state of the current dirty page scan
1546 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1548 RAMBlock *block;
1549 ram_addr_t offset;
1550 bool dirty;
1552 do {
1553 block = unqueue_page(rs, &offset);
1555 * We're sending this page, and since it's postcopy nothing else
1556 * will dirty it, and we must make sure it doesn't get sent again
1557 * even if this queue request was received after the background
1558 * search already sent it.
1560 if (block) {
1561 unsigned long page;
1563 page = offset >> TARGET_PAGE_BITS;
1564 dirty = test_bit(page, block->bmap);
1565 if (!dirty) {
1566 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1567 page, test_bit(page, block->unsentmap));
1568 } else {
1569 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1573 } while (block && !dirty);
1575 if (block) {
1577 * As soon as we start servicing pages out of order, then we have
1578 * to kill the bulk stage, since the bulk stage assumes
1579 * in (migration_bitmap_find_and_reset_dirty) that every page is
1580 * dirty, that's no longer true.
1582 rs->ram_bulk_stage = false;
1585 * We want the background search to continue from the queued page
1586 * since the guest is likely to want other pages near to the page
1587 * it just requested.
1589 pss->block = block;
1590 pss->page = offset >> TARGET_PAGE_BITS;
1593 return !!block;
1597 * migration_page_queue_free: drop any remaining pages in the ram
1598 * request queue
1600 * It should be empty at the end anyway, but in error cases there may
1601 * be some left. In case any page is left, we drop it.
1604 static void migration_page_queue_free(RAMState *rs)
1606 struct RAMSrcPageRequest *mspr, *next_mspr;
1607 /* This queue generally should be empty - but in the case of a failed
1608 * migration it might have some leftover entries.
1610 rcu_read_lock();
1611 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1612 memory_region_unref(mspr->rb->mr);
1613 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1614 g_free(mspr);
1616 rcu_read_unlock();
1620 * ram_save_queue_pages: queue the page for transmission
1622 * A request from postcopy destination for example.
1624 * Returns zero on success or negative on error
1626 * @rbname: Name of the RAMBlock of the request. NULL means the
1627 * same as the last one.
1628 * @start: starting address from the start of the RAMBlock
1629 * @len: length (in bytes) to send
1631 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1633 RAMBlock *ramblock;
1634 RAMState *rs = ram_state;
1636 ram_counters.postcopy_requests++;
1637 rcu_read_lock();
1638 if (!rbname) {
1639 /* Reuse last RAMBlock */
1640 ramblock = rs->last_req_rb;
1642 if (!ramblock) {
1644 * Shouldn't happen, we can't reuse the last RAMBlock if
1645 * it's the 1st request.
1647 error_report("ram_save_queue_pages no previous block");
1648 goto err;
1650 } else {
1651 ramblock = qemu_ram_block_by_name(rbname);
1653 if (!ramblock) {
1654 /* We shouldn't be asked for a non-existent RAMBlock */
1655 error_report("ram_save_queue_pages no block '%s'", rbname);
1656 goto err;
1658 rs->last_req_rb = ramblock;
1660 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1661 if (start + len > ramblock->used_length) {
1662 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1663 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1664 __func__, start, len, ramblock->used_length);
1665 goto err;
1668 struct RAMSrcPageRequest *new_entry =
1669 g_malloc0(sizeof(struct RAMSrcPageRequest));
1670 new_entry->rb = ramblock;
1671 new_entry->offset = start;
1672 new_entry->len = len;
1674 memory_region_ref(ramblock->mr);
1675 qemu_mutex_lock(&rs->src_page_req_mutex);
1676 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1677 qemu_mutex_unlock(&rs->src_page_req_mutex);
1678 rcu_read_unlock();
1680 return 0;
1682 err:
1683 rcu_read_unlock();
1684 return -1;
1687 static bool save_page_use_compression(RAMState *rs)
1689 if (!migrate_use_compression()) {
1690 return false;
1694 * If xbzrle is on, stop using the data compression after first
1695 * round of migration even if compression is enabled. In theory,
1696 * xbzrle can do better than compression.
1698 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1699 return true;
1702 return false;
1706 * ram_save_target_page: save one target page
1708 * Returns the number of pages written
1710 * @rs: current RAM state
1711 * @pss: data about the page we want to send
1712 * @last_stage: if we are at the completion stage
1714 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1715 bool last_stage)
1717 RAMBlock *block = pss->block;
1718 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1719 int res;
1721 if (control_save_page(rs, block, offset, &res)) {
1722 return res;
1726 * When starting the process of a new block, the first page of
1727 * the block should be sent out before other pages in the same
1728 * block, and all the pages in the last block should have been sent
1729 * out. Keeping this order is important, because the 'cont' flag
1730 * is used to avoid resending the block name.
1732 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1733 flush_compressed_data(rs);
1736 res = save_zero_page(rs, block, offset);
1737 if (res > 0) {
1738 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1739 * page would be stale
1741 if (!save_page_use_compression(rs)) {
1742 XBZRLE_cache_lock();
1743 xbzrle_cache_zero_page(rs, block->offset + offset);
1744 XBZRLE_cache_unlock();
1746 ram_release_pages(block->idstr, offset, res);
1747 return res;
1751 * Make sure the first page is sent out before other pages.
1753 * We post it as a normal page, as compression would take much
1754 * CPU resource.
1756 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1757 return compress_page_with_multi_thread(rs, block, offset);
1760 return ram_save_page(rs, pss, last_stage);
1764 * ram_save_host_page: save a whole host page
1766 * Starting at *offset send pages up to the end of the current host
1767 * page. It's valid for the initial offset to point into the middle of
1768 * a host page in which case the remainder of the hostpage is sent.
1769 * Only dirty target pages are sent. Note that the host page size may
1770 * be a huge page for this block.
1771 * The saving stops at the boundary of the used_length of the block
1772 * if the RAMBlock isn't a multiple of the host page size.
1774 * Returns the number of pages written or negative on error
1776 * @rs: current RAM state
1777 * @ms: current migration state
1778 * @pss: data about the page we want to send
1779 * @last_stage: if we are at the completion stage
1781 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1782 bool last_stage)
1784 int tmppages, pages = 0;
1785 size_t pagesize_bits =
1786 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1788 do {
1789 /* Check if the page is dirty and if it is, send it */
1790 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1791 pss->page++;
1792 continue;
1795 tmppages = ram_save_target_page(rs, pss, last_stage);
1796 if (tmppages < 0) {
1797 return tmppages;
1800 pages += tmppages;
1801 if (pss->block->unsentmap) {
1802 clear_bit(pss->page, pss->block->unsentmap);
1805 pss->page++;
1806 } while ((pss->page & (pagesize_bits - 1)) &&
1807 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1809 /* The offset we leave with is the last one we looked at */
1810 pss->page--;
1811 return pages;
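/*
 * Example for the host-page loop above: with 4 KiB target pages on a
 * block backed by 2 MiB huge pages, pagesize_bits is 512, so one call
 * walks up to 512 target pages, skipping the ones whose dirty bit is
 * already clear; for a normal 4 KiB-backed block the loop body runs
 * exactly once.
 */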
1815 * ram_find_and_save_block: finds a dirty page and sends it to f
1817 * Called within an RCU critical section.
1819 * Returns the number of pages written where zero means no dirty pages
1821 * @rs: current RAM state
1822 * @last_stage: if we are at the completion stage
1824 * On systems where host-page-size > target-page-size it will send all the
1825 * pages in a host page that are dirty.
1828 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1830 PageSearchStatus pss;
1831 int pages = 0;
1832 bool again, found;
1834 /* No dirty page as there is zero RAM */
1835 if (!ram_bytes_total()) {
1836 return pages;
1839 pss.block = rs->last_seen_block;
1840 pss.page = rs->last_page;
1841 pss.complete_round = false;
1843 if (!pss.block) {
1844 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1847 do {
1848 again = true;
1849 found = get_queued_page(rs, &pss);
1851 if (!found) {
1852 /* priority queue empty, so just search for something dirty */
1853 found = find_dirty_block(rs, &pss, &again);
1856 if (found) {
1857 pages = ram_save_host_page(rs, &pss, last_stage);
1859 } while (!pages && again);
1861 rs->last_seen_block = pss.block;
1862 rs->last_page = pss.page;
1864 return pages;
1867 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1869 uint64_t pages = size / TARGET_PAGE_SIZE;
1871 if (zero) {
1872 ram_counters.duplicate += pages;
1873 } else {
1874 ram_counters.normal += pages;
1875 ram_counters.transferred += size;
1876 qemu_update_position(f, size);
1880 uint64_t ram_bytes_total(void)
1882 RAMBlock *block;
1883 uint64_t total = 0;
1885 rcu_read_lock();
1886 RAMBLOCK_FOREACH(block) {
1887 total += block->used_length;
1889 rcu_read_unlock();
1890 return total;
1893 static void xbzrle_load_setup(void)
1895 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1898 static void xbzrle_load_cleanup(void)
1900 g_free(XBZRLE.decoded_buf);
1901 XBZRLE.decoded_buf = NULL;
1904 static void ram_state_cleanup(RAMState **rsp)
1906 if (*rsp) {
1907 migration_page_queue_free(*rsp);
1908 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1909 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1910 g_free(*rsp);
1911 *rsp = NULL;
1915 static void xbzrle_cleanup(void)
1917 XBZRLE_cache_lock();
1918 if (XBZRLE.cache) {
1919 cache_fini(XBZRLE.cache);
1920 g_free(XBZRLE.encoded_buf);
1921 g_free(XBZRLE.current_buf);
1922 g_free(XBZRLE.zero_target_page);
1923 XBZRLE.cache = NULL;
1924 XBZRLE.encoded_buf = NULL;
1925 XBZRLE.current_buf = NULL;
1926 XBZRLE.zero_target_page = NULL;
1928 XBZRLE_cache_unlock();
1931 static void ram_save_cleanup(void *opaque)
1933 RAMState **rsp = opaque;
1934 RAMBlock *block;
1936 /* the caller must hold the iothread lock or be in a bh, so there is
1937 * no writing race against this migration_bitmap
1939 memory_global_dirty_log_stop();
1941 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1942 g_free(block->bmap);
1943 block->bmap = NULL;
1944 g_free(block->unsentmap);
1945 block->unsentmap = NULL;
1948 xbzrle_cleanup();
1949 compress_threads_save_cleanup();
1950 ram_state_cleanup(rsp);
1953 static void ram_state_reset(RAMState *rs)
1955 rs->last_seen_block = NULL;
1956 rs->last_sent_block = NULL;
1957 rs->last_page = 0;
1958 rs->last_version = ram_list.version;
1959 rs->ram_bulk_stage = true;
1962 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1965 * 'expected' is the value you expect the bitmap mostly to be full
1966 * of; it won't bother printing lines that are all this value.
1967 * If 'todump' is null the migration bitmap is dumped.
1969 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1970 unsigned long pages)
1972 int64_t cur;
1973 int64_t linelen = 128;
1974 char linebuf[129];
1976 for (cur = 0; cur < pages; cur += linelen) {
1977 int64_t curb;
1978 bool found = false;
1980 * Last line; catch the case where the line length
1981 * is longer than the remaining RAM
1983 if (cur + linelen > pages) {
1984 linelen = pages - cur;
1986 for (curb = 0; curb < linelen; curb++) {
1987 bool thisbit = test_bit(cur + curb, todump);
1988 linebuf[curb] = thisbit ? '1' : '.';
1989 found = found || (thisbit != expected);
1991 if (found) {
1992 linebuf[curb] = '\0';
1993 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1998 /* **** functions for postcopy ***** */
2000 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2002 struct RAMBlock *block;
2004 RAMBLOCK_FOREACH(block) {
2005 unsigned long *bitmap = block->bmap;
2006 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2007 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2009 while (run_start < range) {
2010 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2011 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2012 (run_end - run_start) << TARGET_PAGE_BITS);
2013 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2019 * postcopy_send_discard_bm_ram: discard a RAMBlock
2021 * Returns zero on success
2023 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2024 * Note: At this point the 'unsentmap' is the processed bitmap combined
2025 * with the dirtymap; so a '1' means it's either dirty or unsent.
2027 * @ms: current migration state
2028 * @pds: state for postcopy
2029 * @block: RAMBlock to discard
2032 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2033 PostcopyDiscardState *pds,
2034 RAMBlock *block)
2036 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2037 unsigned long current;
2038 unsigned long *unsentmap = block->unsentmap;
2040 for (current = 0; current < end; ) {
2041 unsigned long one = find_next_bit(unsentmap, end, current);
2043 if (one <= end) {
2044 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2045 unsigned long discard_length;
2047 if (zero >= end) {
2048 discard_length = end - one;
2049 } else {
2050 discard_length = zero - one;
2052 if (discard_length) {
2053 postcopy_discard_send_range(ms, pds, one, discard_length);
2055 current = one + discard_length;
2056 } else {
2057 current = one;
2061 return 0;
2065 * postcopy_each_ram_send_discard: discard all RAMBlocks
2067 * Returns 0 for success or negative for error
2069 * Utility for the outgoing postcopy code.
2070 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2071 * passing it bitmap indexes and name.
2072 * (qemu_ram_foreach_block ends up passing unscaled lengths
2073 * which would mean postcopy code would have to deal with target page)
2075 * @ms: current migration state
2077 static int postcopy_each_ram_send_discard(MigrationState *ms)
2079 struct RAMBlock *block;
2080 int ret;
2082 RAMBLOCK_FOREACH(block) {
2083 PostcopyDiscardState *pds =
2084 postcopy_discard_send_init(ms, block->idstr);
2087 * Postcopy sends chunks of bitmap over the wire, but it
2088 * just needs indexes at this point, avoids it having
2089 * target page specific code.
2091 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2092 postcopy_discard_send_finish(ms, pds);
2093 if (ret) {
2094 return ret;
2098 return 0;
2102 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2104 * Helper for postcopy_chunk_hostpages; it's called twice to
2105 * canonicalize the two bitmaps, that are similar, but one is
2106 * inverted.
2108 * Postcopy requires that all target pages in a hostpage are dirty or
2109 * clean, not a mix. This function canonicalizes the bitmaps.
2111 * @ms: current migration state
2112 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2113 * otherwise we need to canonicalize partially dirty host pages
2114 * @block: block that contains the page we want to canonicalize
2115 * @pds: state for postcopy
2117 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2118 RAMBlock *block,
2119 PostcopyDiscardState *pds)
2121 RAMState *rs = ram_state;
2122 unsigned long *bitmap = block->bmap;
2123 unsigned long *unsentmap = block->unsentmap;
2124 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2125 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2126 unsigned long run_start;
2128 if (block->page_size == TARGET_PAGE_SIZE) {
2129 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2130 return;
2133 if (unsent_pass) {
2134 /* Find a sent page */
2135 run_start = find_next_zero_bit(unsentmap, pages, 0);
2136 } else {
2137 /* Find a dirty page */
2138 run_start = find_next_bit(bitmap, pages, 0);
2141 while (run_start < pages) {
2142 bool do_fixup = false;
2143 unsigned long fixup_start_addr;
2144 unsigned long host_offset;
2147 * If the start of this run of pages is in the middle of a host
2148 * page, then we need to fixup this host page.
2150 host_offset = run_start % host_ratio;
2151 if (host_offset) {
2152 do_fixup = true;
2153 run_start -= host_offset;
2154 fixup_start_addr = run_start;
2155 /* For the next pass */
2156 run_start = run_start + host_ratio;
2157 } else {
2158 /* Find the end of this run */
2159 unsigned long run_end;
2160 if (unsent_pass) {
2161 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2162 } else {
2163 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2166 * If the end isn't at the start of a host page, then the
2167 * run doesn't finish at the end of a host page
2168 * and we need to discard.
2170 host_offset = run_end % host_ratio;
2171 if (host_offset) {
2172 do_fixup = true;
2173 fixup_start_addr = run_end - host_offset;
2175 * This host page has gone, the next loop iteration starts
2176 * from after the fixup
2178 run_start = fixup_start_addr + host_ratio;
2179 } else {
2181 * No discards on this iteration, next loop starts from
2182 * next sent/dirty page
2184 run_start = run_end + 1;
2188 if (do_fixup) {
2189 unsigned long page;
2191 /* Tell the destination to discard this page */
2192 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2193 /* For the unsent_pass we:
2194 * discard partially sent pages
2195 * For the !unsent_pass (dirty) we:
2196 * discard partially dirty pages that were sent
2197 * (any partially sent pages were already discarded
2198 * by the previous unsent_pass)
2200 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2201 host_ratio);
2204 /* Clean up the bitmap */
2205 for (page = fixup_start_addr;
2206 page < fixup_start_addr + host_ratio; page++) {
2207 /* All pages in this host page are now not sent */
2208 set_bit(page, unsentmap);
2211 * Remark them as dirty, updating the count for any pages
2212 * that weren't previously dirty.
2214 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2218 if (unsent_pass) {
2219 /* Find the next sent page for the next iteration */
2220 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2221 } else {
2222 /* Find the next dirty page for the next iteration */
2223 run_start = find_next_bit(bitmap, pages, run_start);
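/*
 * Worked example (2MB hugetlbfs pages and 4KB target pages assumed, so
 * host_ratio == 512): if a dirty run starts at target page 1000, then
 * host_offset == 1000 % 512 == 488, so the fixup rounds down to page 512.
 * If that host page had already been (partially) sent, a discard for pages
 * 512..1023 is queued, and all 512 target pages are re-marked unsent and
 * dirty so that postcopy later re-sends the host page as a single atomic
 * unit.
 */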
2229 * postcopy_chunk_hostpages: discard any partially sent host page
2231 * Utility for the outgoing postcopy code.
2233 * Discard any partially sent host-page size chunks, mark any partially
2234 * dirty host-page size chunks as all dirty. In this case the host-page
2235 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2237 * Returns zero on success
2239 * @ms: current migration state
2240 * @block: block we want to work with
2242 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2244 PostcopyDiscardState *pds =
2245 postcopy_discard_send_init(ms, block->idstr);
2247 /* First pass: Discard all partially sent host pages */
2248 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2250 * Second pass: Ensure that all partially dirty host pages are made
2251 * fully dirty.
2253 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2255 postcopy_discard_send_finish(ms, pds);
2256 return 0;
2260 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2262 * Returns zero on success
2264 * Transmit the set of pages to be discarded after precopy to the target;
2265 * these are pages that:
2266 * a) Have been previously transmitted but are now dirty again
2267 * b) Pages that have never been transmitted; this ensures that
2268 * any pages on the destination that have been mapped by background
2269 * tasks get discarded (transparent huge pages is the specific concern)
2270 * Hopefully this is pretty sparse
2272 * @ms: current migration state
2274 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2276 RAMState *rs = ram_state;
2277 RAMBlock *block;
2278 int ret;
2280 rcu_read_lock();
2282 /* This should be our last sync, the src is now paused */
2283 migration_bitmap_sync(rs);
2285 /* Easiest way to make sure we don't resume in the middle of a host-page */
2286 rs->last_seen_block = NULL;
2287 rs->last_sent_block = NULL;
2288 rs->last_page = 0;
2290 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2291 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2292 unsigned long *bitmap = block->bmap;
2293 unsigned long *unsentmap = block->unsentmap;
2295 if (!unsentmap) {
2296 /* We don't have a safe way to resize the sentmap, so
2297 * if the bitmap was resized it will be NULL at this
2298 * point.
2300 error_report("migration ram resized during precopy phase");
2301 rcu_read_unlock();
2302 return -EINVAL;
2304 /* Deal with TPS != HPS and huge pages */
2305 ret = postcopy_chunk_hostpages(ms, block);
2306 if (ret) {
2307 rcu_read_unlock();
2308 return ret;
2312 * Update the unsentmap to be unsentmap = unsentmap | dirty
2314 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2315 #ifdef DEBUG_POSTCOPY
2316 ram_debug_dump_bitmap(unsentmap, true, pages);
2317 #endif
2319 trace_ram_postcopy_send_discard_bitmap();
2321 ret = postcopy_each_ram_send_discard(ms);
2322 rcu_read_unlock();
2324 return ret;
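/*
 * Example of the unsentmap |= dirty merge performed above (illustrative):
 * a page that was sent during precopy but dirtied afterwards has
 * unsent == 0 and dirty == 1, so after the OR its unsent bit is 1, the
 * destination is told to discard its stale copy and postcopy fetches the
 * fresh one.  A page that was never sent (unsent == 1) stays 1 regardless
 * of the dirty bit, which also flushes anything the destination mapped in
 * the background (the transparent huge page concern mentioned above).
 */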
2328 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2330 * Returns zero on success
2332 * @rbname: name of the RAMBlock of the request. NULL means the
2333 * same as the last one.
2334 * @start: byte offset into the RAMBlock at which to start discarding
2335 * @length: number of bytes to discard
2337 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2339 int ret = -1;
2341 trace_ram_discard_range(rbname, start, length);
2343 rcu_read_lock();
2344 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2346 if (!rb) {
2347 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2348 goto err;
2351 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2352 length >> qemu_target_page_bits());
2353 ret = ram_block_discard_range(rb, start, length);
2355 err:
2356 rcu_read_unlock();
2358 return ret;
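/*
 * Worked example for the receivedmap update above (4KB target pages
 * assumed, i.e. qemu_target_page_bits() == 12): a request with
 * start == 0x200000 and length == 0x200000 clears 512 bits starting at bit
 * 512, i.e. bits 512..1023, before ram_block_discard_range() drops the
 * backing pages themselves.
 */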
2362 * For every allocation, we will try not to crash the VM if the
2363 * allocation fails.
2365 static int xbzrle_init(void)
2367 Error *local_err = NULL;
2369 if (!migrate_use_xbzrle()) {
2370 return 0;
2373 XBZRLE_cache_lock();
2375 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2376 if (!XBZRLE.zero_target_page) {
2377 error_report("%s: Error allocating zero page", __func__);
2378 goto err_out;
2381 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2382 TARGET_PAGE_SIZE, &local_err);
2383 if (!XBZRLE.cache) {
2384 error_report_err(local_err);
2385 goto free_zero_page;
2388 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2389 if (!XBZRLE.encoded_buf) {
2390 error_report("%s: Error allocating encoded_buf", __func__);
2391 goto free_cache;
2394 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2395 if (!XBZRLE.current_buf) {
2396 error_report("%s: Error allocating current_buf", __func__);
2397 goto free_encoded_buf;
2400 /* We are all good */
2401 XBZRLE_cache_unlock();
2402 return 0;
2404 free_encoded_buf:
2405 g_free(XBZRLE.encoded_buf);
2406 XBZRLE.encoded_buf = NULL;
2407 free_cache:
2408 cache_fini(XBZRLE.cache);
2409 XBZRLE.cache = NULL;
2410 free_zero_page:
2411 g_free(XBZRLE.zero_target_page);
2412 XBZRLE.zero_target_page = NULL;
2413 err_out:
2414 XBZRLE_cache_unlock();
2415 return -ENOMEM;
2418 static int ram_state_init(RAMState **rsp)
2420 *rsp = g_try_new0(RAMState, 1);
2422 if (!*rsp) {
2423 error_report("%s: Init ramstate fail", __func__);
2424 return -1;
2427 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2428 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2429 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2432 * Count the total number of pages used by ram blocks not including any
2433 * gaps due to alignment or unplugs.
2435 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2437 ram_state_reset(*rsp);
2439 return 0;
2442 static void ram_list_init_bitmaps(void)
2444 RAMBlock *block;
2445 unsigned long pages;
2447 /* Skip setting bitmap if there is no RAM */
2448 if (ram_bytes_total()) {
2449 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2450 pages = block->max_length >> TARGET_PAGE_BITS;
2451 block->bmap = bitmap_new(pages);
2452 bitmap_set(block->bmap, 0, pages);
2453 if (migrate_postcopy_ram()) {
2454 block->unsentmap = bitmap_new(pages);
2455 bitmap_set(block->unsentmap, 0, pages);
2461 static void ram_init_bitmaps(RAMState *rs)
2463 /* For memory_global_dirty_log_start below. */
2464 qemu_mutex_lock_iothread();
2465 qemu_mutex_lock_ramlist();
2466 rcu_read_lock();
2468 ram_list_init_bitmaps();
2469 memory_global_dirty_log_start();
2470 migration_bitmap_sync(rs);
2472 rcu_read_unlock();
2473 qemu_mutex_unlock_ramlist();
2474 qemu_mutex_unlock_iothread();
2477 static int ram_init_all(RAMState **rsp)
2479 if (ram_state_init(rsp)) {
2480 return -1;
2483 if (xbzrle_init()) {
2484 ram_state_cleanup(rsp);
2485 return -1;
2488 ram_init_bitmaps(*rsp);
2490 return 0;
2494 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2495 * long-running RCU critical section. When rcu-reclaims in the code
2496 * start to become numerous it will be necessary to reduce the
2497 * granularity of these critical sections.
2501 * ram_save_setup: Setup RAM for migration
2503 * Returns zero to indicate success and negative for error
2505 * @f: QEMUFile where to send the data
2506 * @opaque: RAMState pointer
2508 static int ram_save_setup(QEMUFile *f, void *opaque)
2510 RAMState **rsp = opaque;
2511 RAMBlock *block;
2513 if (compress_threads_save_setup()) {
2514 return -1;
2517 /* migration has already setup the bitmap, reuse it. */
2518 if (!migration_in_colo_state()) {
2519 if (ram_init_all(rsp) != 0) {
2520 compress_threads_save_cleanup();
2521 return -1;
2524 (*rsp)->f = f;
2526 rcu_read_lock();
2528 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2530 RAMBLOCK_FOREACH(block) {
2531 qemu_put_byte(f, strlen(block->idstr));
2532 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2533 qemu_put_be64(f, block->used_length);
2534 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2535 qemu_put_be64(f, block->page_size);
2539 rcu_read_unlock();
2541 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2542 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2544 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2546 return 0;
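/*
 * Stream layout produced by the setup stage above, as emitted by the
 * qemu_put_* calls (summarized here for illustration):
 *
 *   be64   ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each RAMBlock:
 *     byte   strlen(idstr)
 *     bytes  idstr (not NUL terminated)
 *     be64   used_length
 *     be64   page_size   (only when postcopy is enabled and the block's
 *                         page size differs from qemu_host_page_size)
 *   be64   RAM_SAVE_FLAG_EOS
 */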
2550 * ram_save_iterate: iterative stage for migration
2552 * Returns zero to indicate success and negative for error
2554 * @f: QEMUFile where to send the data
2555 * @opaque: RAMState pointer
2557 static int ram_save_iterate(QEMUFile *f, void *opaque)
2559 RAMState **temp = opaque;
2560 RAMState *rs = *temp;
2561 int ret;
2562 int i;
2563 int64_t t0;
2564 int done = 0;
2566 if (blk_mig_bulk_active()) {
2567 /* Avoid transferring ram during bulk phase of block migration as
2568 * the bulk phase will usually take a long time and transferring
2569 * ram updates during that time is pointless. */
2570 goto out;
2573 rcu_read_lock();
2574 if (ram_list.version != rs->last_version) {
2575 ram_state_reset(rs);
2578 /* Read version before ram_list.blocks */
2579 smp_rmb();
2581 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2583 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2584 i = 0;
2585 while ((ret = qemu_file_rate_limit(f)) == 0) {
2586 int pages;
2588 pages = ram_find_and_save_block(rs, false);
2589 /* no more pages to send */
2590 if (pages == 0) {
2591 done = 1;
2592 break;
2594 rs->iterations++;
2596 /* we want to check in the 1st loop, just in case it was the 1st time
2597 and we had to sync the dirty bitmap.
2598 qemu_clock_get_ns() is a bit expensive, so we only check every few
2599 iterations
2601 if ((i & 63) == 0) {
2602 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2603 if (t1 > MAX_WAIT) {
2604 trace_ram_save_iterate_big_wait(t1, i);
2605 break;
2608 i++;
2610 flush_compressed_data(rs);
2611 rcu_read_unlock();
2614 * Must occur before EOS (or any QEMUFile operation)
2615 * because of RDMA protocol.
2617 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2619 out:
2620 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2621 ram_counters.transferred += 8;
2623 ret = qemu_file_get_error(f);
2624 if (ret < 0) {
2625 return ret;
2628 return done;
2632 * ram_save_complete: function called to send the remaining amount of ram
2634 * Returns zero to indicate success
2636 * Called with iothread lock
2638 * @f: QEMUFile where to send the data
2639 * @opaque: RAMState pointer
2641 static int ram_save_complete(QEMUFile *f, void *opaque)
2643 RAMState **temp = opaque;
2644 RAMState *rs = *temp;
2646 rcu_read_lock();
2648 if (!migration_in_postcopy()) {
2649 migration_bitmap_sync(rs);
2652 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2654 /* try transferring iterative blocks of memory */
2656 /* flush all remaining blocks regardless of rate limiting */
2657 while (true) {
2658 int pages;
2660 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2661 /* no more blocks to send */
2662 if (pages == 0) {
2663 break;
2667 flush_compressed_data(rs);
2668 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2670 rcu_read_unlock();
2672 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2674 return 0;
2677 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2678 uint64_t *res_precopy_only,
2679 uint64_t *res_compatible,
2680 uint64_t *res_postcopy_only)
2682 RAMState **temp = opaque;
2683 RAMState *rs = *temp;
2684 uint64_t remaining_size;
2686 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2688 if (!migration_in_postcopy() &&
2689 remaining_size < max_size) {
2690 qemu_mutex_lock_iothread();
2691 rcu_read_lock();
2692 migration_bitmap_sync(rs);
2693 rcu_read_unlock();
2694 qemu_mutex_unlock_iothread();
2695 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2698 if (migrate_postcopy_ram()) {
2699 /* We can do postcopy, and all the data is postcopiable */
2700 *res_compatible += remaining_size;
2701 } else {
2702 *res_precopy_only += remaining_size;
2706 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2708 unsigned int xh_len;
2709 int xh_flags;
2710 uint8_t *loaded_data;
2712 /* extract RLE header */
2713 xh_flags = qemu_get_byte(f);
2714 xh_len = qemu_get_be16(f);
2716 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2717 error_report("Failed to load XBZRLE page - wrong compression!");
2718 return -1;
2721 if (xh_len > TARGET_PAGE_SIZE) {
2722 error_report("Failed to load XBZRLE page - len overflow!");
2723 return -1;
2725 loaded_data = XBZRLE.decoded_buf;
2726 /* load data and decode */
2727 /* it can change loaded_data to point to an internal buffer */
2728 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2730 /* decode RLE */
2731 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2732 TARGET_PAGE_SIZE) == -1) {
2733 error_report("Failed to load XBZRLE page - decode error!");
2734 return -1;
2737 return 0;
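/*
 * Wire format consumed above (for illustration): a one-byte header that
 * must equal ENCODING_FLAG_XBZRLE, a big-endian 16-bit encoded length of at
 * most TARGET_PAGE_SIZE, then xh_len bytes of XBZRLE data that are applied
 * as a delta on top of the page content already present at @host.
 */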
2741 * ram_block_from_stream: read a RAMBlock id from the migration stream
2743 * Must be called from within a rcu critical section.
2745 * Returns a pointer from within the RCU-protected ram_list.
2747 * @f: QEMUFile where to read the data from
2748 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2750 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2752 static RAMBlock *block = NULL;
2753 char id[256];
2754 uint8_t len;
2756 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2757 if (!block) {
2758 error_report("Ack, bad migration stream!");
2759 return NULL;
2761 return block;
2764 len = qemu_get_byte(f);
2765 qemu_get_buffer(f, (uint8_t *)id, len);
2766 id[len] = 0;
2768 block = qemu_ram_block_by_name(id);
2769 if (!block) {
2770 error_report("Can't find block %s", id);
2771 return NULL;
2774 return block;
2777 static inline void *host_from_ram_block_offset(RAMBlock *block,
2778 ram_addr_t offset)
2780 if (!offset_in_ramblock(block, offset)) {
2781 return NULL;
2784 return block->host + offset;
2788 * ram_handle_compressed: handle the zero page case
2790 * If a page (or a whole RDMA chunk) has been
2791 * determined to be zero, then zap it.
2793 * @host: host address for the zero page
2794 * @ch: what the page is filled from. We only support zero
2795 * @size: size of the zero page
2797 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2799 if (ch != 0 || !is_zero_range(host, size)) {
2800 memset(host, ch, size);
2804 /* return the size after decompression, or negative value on error */
2805 static int
2806 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2807 const uint8_t *source, size_t source_len)
2809 int err;
2811 err = inflateReset(stream);
2812 if (err != Z_OK) {
2813 return -1;
2816 stream->avail_in = source_len;
2817 stream->next_in = (uint8_t *)source;
2818 stream->avail_out = dest_len;
2819 stream->next_out = dest;
2821 err = inflate(stream, Z_NO_FLUSH);
2822 if (err != Z_STREAM_END) {
2823 return -1;
2826 return stream->total_out;
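/*
 * Note on the zlib usage above (illustrative sketch of the sequence): the
 * z_stream is set up once per decompress thread with inflateInit() in
 * compress_threads_load_setup() below, so each page only pays for an
 * inflateReset() plus a single inflate() call:
 *
 *   inflateReset(&stream);
 *   stream.next_in  = compressed;  stream.avail_in  = compressed_len;
 *   stream.next_out = page;        stream.avail_out = TARGET_PAGE_SIZE;
 *   inflate(&stream, Z_NO_FLUSH) must return Z_STREAM_END, i.e. the whole
 *   page has to come out of that one call.
 */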
2829 static void *do_data_decompress(void *opaque)
2831 DecompressParam *param = opaque;
2832 unsigned long pagesize;
2833 uint8_t *des;
2834 int len, ret;
2836 qemu_mutex_lock(&param->mutex);
2837 while (!param->quit) {
2838 if (param->des) {
2839 des = param->des;
2840 len = param->len;
2841 param->des = 0;
2842 qemu_mutex_unlock(&param->mutex);
2844 pagesize = TARGET_PAGE_SIZE;
2846 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2847 param->compbuf, len);
2848 if (ret < 0) {
2849 error_report("decompress data failed");
2850 qemu_file_set_error(decomp_file, ret);
2853 qemu_mutex_lock(&decomp_done_lock);
2854 param->done = true;
2855 qemu_cond_signal(&decomp_done_cond);
2856 qemu_mutex_unlock(&decomp_done_lock);
2858 qemu_mutex_lock(&param->mutex);
2859 } else {
2860 qemu_cond_wait(&param->cond, &param->mutex);
2863 qemu_mutex_unlock(&param->mutex);
2865 return NULL;
2868 static int wait_for_decompress_done(void)
2870 int idx, thread_count;
2872 if (!migrate_use_compression()) {
2873 return 0;
2876 thread_count = migrate_decompress_threads();
2877 qemu_mutex_lock(&decomp_done_lock);
2878 for (idx = 0; idx < thread_count; idx++) {
2879 while (!decomp_param[idx].done) {
2880 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2883 qemu_mutex_unlock(&decomp_done_lock);
2884 return qemu_file_get_error(decomp_file);
2887 static void compress_threads_load_cleanup(void)
2889 int i, thread_count;
2891 if (!migrate_use_compression()) {
2892 return;
2894 thread_count = migrate_decompress_threads();
2895 for (i = 0; i < thread_count; i++) {
2897 * we use it as an indicator of whether the thread is
2898 * properly initialized or not
2900 if (!decomp_param[i].compbuf) {
2901 break;
2904 qemu_mutex_lock(&decomp_param[i].mutex);
2905 decomp_param[i].quit = true;
2906 qemu_cond_signal(&decomp_param[i].cond);
2907 qemu_mutex_unlock(&decomp_param[i].mutex);
2909 for (i = 0; i < thread_count; i++) {
2910 if (!decomp_param[i].compbuf) {
2911 break;
2914 qemu_thread_join(decompress_threads + i);
2915 qemu_mutex_destroy(&decomp_param[i].mutex);
2916 qemu_cond_destroy(&decomp_param[i].cond);
2917 inflateEnd(&decomp_param[i].stream);
2918 g_free(decomp_param[i].compbuf);
2919 decomp_param[i].compbuf = NULL;
2921 g_free(decompress_threads);
2922 g_free(decomp_param);
2923 decompress_threads = NULL;
2924 decomp_param = NULL;
2925 decomp_file = NULL;
2928 static int compress_threads_load_setup(QEMUFile *f)
2930 int i, thread_count;
2932 if (!migrate_use_compression()) {
2933 return 0;
2936 thread_count = migrate_decompress_threads();
2937 decompress_threads = g_new0(QemuThread, thread_count);
2938 decomp_param = g_new0(DecompressParam, thread_count);
2939 qemu_mutex_init(&decomp_done_lock);
2940 qemu_cond_init(&decomp_done_cond);
2941 decomp_file = f;
2942 for (i = 0; i < thread_count; i++) {
2943 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2944 goto exit;
2947 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2948 qemu_mutex_init(&decomp_param[i].mutex);
2949 qemu_cond_init(&decomp_param[i].cond);
2950 decomp_param[i].done = true;
2951 decomp_param[i].quit = false;
2952 qemu_thread_create(decompress_threads + i, "decompress",
2953 do_data_decompress, decomp_param + i,
2954 QEMU_THREAD_JOINABLE);
2956 return 0;
2957 exit:
2958 compress_threads_load_cleanup();
2959 return -1;
2962 static void decompress_data_with_multi_threads(QEMUFile *f,
2963 void *host, int len)
2965 int idx, thread_count;
2967 thread_count = migrate_decompress_threads();
2968 qemu_mutex_lock(&decomp_done_lock);
2969 while (true) {
2970 for (idx = 0; idx < thread_count; idx++) {
2971 if (decomp_param[idx].done) {
2972 decomp_param[idx].done = false;
2973 qemu_mutex_lock(&decomp_param[idx].mutex);
2974 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2975 decomp_param[idx].des = host;
2976 decomp_param[idx].len = len;
2977 qemu_cond_signal(&decomp_param[idx].cond);
2978 qemu_mutex_unlock(&decomp_param[idx].mutex);
2979 break;
2982 if (idx < thread_count) {
2983 break;
2984 } else {
2985 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2988 qemu_mutex_unlock(&decomp_done_lock);
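/*
 * Hand-off protocol between this function and do_data_decompress() above
 * (described for illustration): the feeder marks an idle thread busy
 * (done = false), copies the compressed bytes into that thread's compbuf
 * and sets des/len under param->mutex, then signals param->cond; the
 * worker inflates into 'des' and, once finished, sets done = true under
 * decomp_done_lock and signals decomp_done_cond, which is what this
 * function (when no thread is idle) and wait_for_decompress_done() block
 * on.
 */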
2992 * ram_load_setup: Setup RAM for migration incoming side
2994 * Returns zero to indicate success and negative for error
2996 * @f: QEMUFile where to receive the data
2997 * @opaque: RAMState pointer
2999 static int ram_load_setup(QEMUFile *f, void *opaque)
3001 if (compress_threads_load_setup(f)) {
3002 return -1;
3005 xbzrle_load_setup();
3006 ramblock_recv_map_init();
3007 return 0;
3010 static int ram_load_cleanup(void *opaque)
3012 RAMBlock *rb;
3013 xbzrle_load_cleanup();
3014 compress_threads_load_cleanup();
3016 RAMBLOCK_FOREACH(rb) {
3017 g_free(rb->receivedmap);
3018 rb->receivedmap = NULL;
3020 return 0;
3024 * ram_postcopy_incoming_init: allocate postcopy data structures
3026 * Returns 0 for success and negative if there was one error
3028 * @mis: current migration incoming state
3030 * Allocate data structures etc needed by incoming migration with
3031 * postcopy-ram. postcopy-ram's similarly named
3032 * postcopy_ram_incoming_init does the work.
3034 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3036 unsigned long ram_pages = last_ram_page();
3038 return postcopy_ram_incoming_init(mis, ram_pages);
3042 * ram_load_postcopy: load a page in postcopy case
3044 * Returns 0 for success or -errno in case of error
3046 * Called in postcopy mode by ram_load().
3047 * rcu_read_lock is taken prior to this being called.
3049 * @f: QEMUFile where to send the data
3051 static int ram_load_postcopy(QEMUFile *f)
3053 int flags = 0, ret = 0;
3054 bool place_needed = false;
3055 bool matching_page_sizes = false;
3056 MigrationIncomingState *mis = migration_incoming_get_current();
3057 /* Temporary page that is later 'placed' */
3058 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3059 void *last_host = NULL;
3060 bool all_zero = false;
3062 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3063 ram_addr_t addr;
3064 void *host = NULL;
3065 void *page_buffer = NULL;
3066 void *place_source = NULL;
3067 RAMBlock *block = NULL;
3068 uint8_t ch;
3070 addr = qemu_get_be64(f);
3073 * If there is a QEMUFile error, we should stop here; "addr"
3074 * may be invalid
3076 ret = qemu_file_get_error(f);
3077 if (ret) {
3078 break;
3081 flags = addr & ~TARGET_PAGE_MASK;
3082 addr &= TARGET_PAGE_MASK;
3084 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3085 place_needed = false;
3086 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3087 block = ram_block_from_stream(f, flags);
3089 host = host_from_ram_block_offset(block, addr);
3090 if (!host) {
3091 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3092 ret = -EINVAL;
3093 break;
3095 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3097 * Postcopy requires that we place whole host pages atomically;
3098 * these may be huge pages for RAMBlocks that are backed by
3099 * hugetlbfs.
3100 * To make it atomic, the data is read into a temporary page
3101 * that's moved into place later.
3102 * The migration protocol uses (possibly smaller) target pages;
3103 * however, the source ensures it always sends all the components
3104 * of a host page in order.
3106 page_buffer = postcopy_host_page +
3107 ((uintptr_t)host & (block->page_size - 1));
3108 /* If all TP are zero then we can optimise the place */
3109 if (!((uintptr_t)host & (block->page_size - 1))) {
3110 all_zero = true;
3111 } else {
3112 /* not the 1st TP within the HP */
3113 if (host != (last_host + TARGET_PAGE_SIZE)) {
3114 error_report("Non-sequential target page %p/%p",
3115 host, last_host);
3116 ret = -EINVAL;
3117 break;
3123 * If it's the last part of a host page then we place the host
3124 * page
3126 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3127 (block->page_size - 1)) == 0;
3128 place_source = postcopy_host_page;
3130 last_host = host;
3132 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3133 case RAM_SAVE_FLAG_ZERO:
3134 ch = qemu_get_byte(f);
3135 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3136 if (ch) {
3137 all_zero = false;
3139 break;
3141 case RAM_SAVE_FLAG_PAGE:
3142 all_zero = false;
3143 if (!place_needed || !matching_page_sizes) {
3144 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3145 } else {
3146 /* Avoids the qemu_file copy during postcopy, which is
3147 * going to do a copy later; can only do it when we
3148 * do this read in one go (matching page sizes)
3150 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3151 TARGET_PAGE_SIZE);
3153 break;
3154 case RAM_SAVE_FLAG_EOS:
3155 /* normal exit */
3156 break;
3157 default:
3158 error_report("Unknown combination of migration flags: %#x"
3159 " (postcopy mode)", flags);
3160 ret = -EINVAL;
3161 break;
3164 /* Detect for any possible file errors */
3165 if (!ret && qemu_file_get_error(f)) {
3166 ret = qemu_file_get_error(f);
3169 if (!ret && place_needed) {
3170 /* This gets called at the last target page in the host page */
3171 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3173 if (all_zero) {
3174 ret = postcopy_place_page_zero(mis, place_dest,
3175 block);
3176 } else {
3177 ret = postcopy_place_page(mis, place_dest,
3178 place_source, block);
3183 return ret;
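/*
 * Example of the host-page assembly above (2MB hugetlbfs pages and 4KB
 * target pages assumed): 512 consecutive target pages are accumulated in
 * postcopy_host_page at their natural offsets; only when the last one of
 * the host page arrives does place_needed become true, and the whole 2MB
 * page is handed to postcopy_place_page() (or postcopy_place_page_zero()
 * if every target page in it was zero) in one atomic operation.
 */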
3186 static bool postcopy_is_advised(void)
3188 PostcopyState ps = postcopy_state_get();
3189 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3192 static bool postcopy_is_running(void)
3194 PostcopyState ps = postcopy_state_get();
3195 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3198 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3200 int flags = 0, ret = 0, invalid_flags = 0;
3201 static uint64_t seq_iter;
3202 int len = 0;
3204 * If the system is running in postcopy mode, page inserts into host memory must
3205 * be atomic
3207 bool postcopy_running = postcopy_is_running();
3209 /* ADVISE is earlier; it shows the source has the postcopy capability on */
3209 bool postcopy_advised = postcopy_is_advised();
3211 seq_iter++;
3213 if (version_id != 4) {
3214 ret = -EINVAL;
3217 if (!migrate_use_compression()) {
3218 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3220 /* This RCU critical section can be very long running.
3221 * When RCU reclaims in the code start to become numerous,
3222 * it will be necessary to reduce the granularity of this
3223 * critical section.
3225 rcu_read_lock();
3227 if (postcopy_running) {
3228 ret = ram_load_postcopy(f);
3231 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3232 ram_addr_t addr, total_ram_bytes;
3233 void *host = NULL;
3234 uint8_t ch;
3236 addr = qemu_get_be64(f);
3237 flags = addr & ~TARGET_PAGE_MASK;
3238 addr &= TARGET_PAGE_MASK;
3240 if (flags & invalid_flags) {
3241 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3242 error_report("Received an unexpected compressed page");
3245 ret = -EINVAL;
3246 break;
3249 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3250 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3251 RAMBlock *block = ram_block_from_stream(f, flags);
3253 host = host_from_ram_block_offset(block, addr);
3254 if (!host) {
3255 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3256 ret = -EINVAL;
3257 break;
3259 ramblock_recv_bitmap_set(block, host);
3260 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3263 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3264 case RAM_SAVE_FLAG_MEM_SIZE:
3265 /* Synchronize RAM block list */
3266 total_ram_bytes = addr;
3267 while (!ret && total_ram_bytes) {
3268 RAMBlock *block;
3269 char id[256];
3270 ram_addr_t length;
3272 len = qemu_get_byte(f);
3273 qemu_get_buffer(f, (uint8_t *)id, len);
3274 id[len] = 0;
3275 length = qemu_get_be64(f);
3277 block = qemu_ram_block_by_name(id);
3278 if (block) {
3279 if (length != block->used_length) {
3280 Error *local_err = NULL;
3282 ret = qemu_ram_resize(block, length,
3283 &local_err);
3284 if (local_err) {
3285 error_report_err(local_err);
3288 /* For postcopy we need to check hugepage sizes match */
3289 if (postcopy_advised &&
3290 block->page_size != qemu_host_page_size) {
3291 uint64_t remote_page_size = qemu_get_be64(f);
3292 if (remote_page_size != block->page_size) {
3293 error_report("Mismatched RAM page size %s "
3294 "(local) %zd != %" PRId64,
3295 id, block->page_size,
3296 remote_page_size);
3297 ret = -EINVAL;
3300 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3301 block->idstr);
3302 } else {
3303 error_report("Unknown ramblock \"%s\", cannot "
3304 "accept migration", id);
3305 ret = -EINVAL;
3308 total_ram_bytes -= length;
3310 break;
3312 case RAM_SAVE_FLAG_ZERO:
3313 ch = qemu_get_byte(f);
3314 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3315 break;
3317 case RAM_SAVE_FLAG_PAGE:
3318 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3319 break;
3321 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3322 len = qemu_get_be32(f);
3323 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3324 error_report("Invalid compressed data length: %d", len);
3325 ret = -EINVAL;
3326 break;
3328 decompress_data_with_multi_threads(f, host, len);
3329 break;
3331 case RAM_SAVE_FLAG_XBZRLE:
3332 if (load_xbzrle(f, addr, host) < 0) {
3333 error_report("Failed to decompress XBZRLE page at "
3334 RAM_ADDR_FMT, addr);
3335 ret = -EINVAL;
3336 break;
3338 break;
3339 case RAM_SAVE_FLAG_EOS:
3340 /* normal exit */
3341 break;
3342 default:
3343 if (flags & RAM_SAVE_FLAG_HOOK) {
3344 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3345 } else {
3346 error_report("Unknown combination of migration flags: %#x",
3347 flags);
3348 ret = -EINVAL;
3351 if (!ret) {
3352 ret = qemu_file_get_error(f);
3356 ret |= wait_for_decompress_done();
3357 rcu_read_unlock();
3358 trace_ram_load_complete(ret, seq_iter);
3359 return ret;
3362 static bool ram_has_postcopy(void *opaque)
3364 return migrate_postcopy_ram();
3368 * Read the received bitmap, invert it as the initial dirty bitmap.
3369 * This is only used when the postcopy migration is paused but wants
3370 * to resume from a middle point.
3372 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3374 int ret = -EINVAL;
3375 QEMUFile *file = s->rp_state.from_dst_file;
3376 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3377 uint64_t local_size = nbits / 8;
3378 uint64_t size, end_mark;
3380 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3382 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3383 error_report("%s: incorrect state %s", __func__,
3384 MigrationStatus_str(s->state));
3385 return -EINVAL;
3389 * Note: see comments in ramblock_recv_bitmap_send() on why we
3390 * need the endianness conversion, and the padding.
3392 local_size = ROUND_UP(local_size, 8);
3394 /* Add padding */
3395 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3397 size = qemu_get_be64(file);
3399 /* The size of the bitmap should match with our ramblock */
3400 if (size != local_size) {
3401 error_report("%s: ramblock '%s' bitmap size mismatch "
3402 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3403 block->idstr, size, local_size);
3404 ret = -EINVAL;
3405 goto out;
3408 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3409 end_mark = qemu_get_be64(file);
3411 ret = qemu_file_get_error(file);
3412 if (ret || size != local_size) {
3413 error_report("%s: read bitmap failed for ramblock '%s': %d"
3414 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3415 __func__, block->idstr, ret, local_size, size);
3416 ret = -EIO;
3417 goto out;
3420 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3421 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3422 __func__, block->idstr, end_mark);
3423 ret = -EINVAL;
3424 goto out;
3428 * Endianness conversion. We are in postcopy (though paused).
3429 * The dirty bitmap won't change. We can directly modify it.
3431 bitmap_from_le(block->bmap, le_bitmap, nbits);
3434 * What we received is "received bitmap". Invert it as the initial
3435 * dirty bitmap for this ramblock.
3437 bitmap_complement(block->bmap, block->bmap, nbits);
3439 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3441 ret = 0;
3442 out:
3443 g_free(le_bitmap);
3444 return ret;
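/*
 * Size bookkeeping above, worked through for illustration (1GB block and
 * 4KB target pages assumed): nbits == 262144, so local_size == 32768 bytes
 * (already a multiple of 8 here; otherwise ROUND_UP pads it).  The stream
 * carries a be64 size that must equal local_size, then the little-endian
 * bitmap bytes, then a be64 RAMBLOCK_RECV_BITMAP_ENDING marker; le_bitmap
 * is allocated with BITS_PER_LONG spare bits so the padded read cannot
 * overrun it.
 */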
3447 static SaveVMHandlers savevm_ram_handlers = {
3448 .save_setup = ram_save_setup,
3449 .save_live_iterate = ram_save_iterate,
3450 .save_live_complete_postcopy = ram_save_complete,
3451 .save_live_complete_precopy = ram_save_complete,
3452 .has_postcopy = ram_has_postcopy,
3453 .save_live_pending = ram_save_pending,
3454 .load_state = ram_load,
3455 .save_cleanup = ram_save_cleanup,
3456 .load_setup = ram_load_setup,
3457 .load_cleanup = ram_load_cleanup,
3460 void ram_mig_init(void)
3462 qemu_mutex_init(&XBZRLE.lock);
3463 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);