[qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "migration/block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
57 #include "savevm.h"
59 /***********************************************************/
60 /* ram save/restore */
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
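/*
 * Example (sketch): the flags above live in the low bits of the 64-bit
 * page address written by save_page_header(), below TARGET_PAGE_SIZE.
 * A normal page at block offset 0x200000 is announced on the wire as
 *
 *     qemu_put_be64(f, 0x200000 | RAM_SAVE_FLAG_PAGE);    => 0x200008
 *
 * and the load side can recover the two halves again with
 *
 *     flags = addr & ~TARGET_PAGE_MASK;
 *     addr &= TARGET_PAGE_MASK;
 */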
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 return buffer_is_zero(p, size);
83 XBZRLECacheStats xbzrle_counters;
85 /* This struct contains the XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
101 static void XBZRLE_cache_lock(void)
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
107 static void XBZRLE_cache_unlock(void)
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
114 * xbzrle_cache_resize: resize the xbzrle cache
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by the XBZRLE.lock mutex.
121 * Returns 0 for success or -1 for error
123 * @new_size: new cache size
124 * @errp: set *errp if the check failed, with the reason
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 PageCache *new_cache;
129 int64_t ret = 0;
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
143 XBZRLE_cache_lock();
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
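/*
 * Worked example (sketch): on a 32-bit host, size_t is 32 bits wide, so a
 * request of 5 GiB (0x140000000) truncates to 0x40000000 in the
 * (size_t)new_size cast above and the function fails with "cache size
 * exceeding address space". Resizing to the value already reported by
 * migrate_xbzrle_cache_size() is a no-op and returns 0.
 */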
160 static void ramblock_recv_map_init(void)
162 RAMBlock *rb;
164 RAMBLOCK_FOREACH(rb) {
165 assert(!rb->receivedmap);
166 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
170 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
172 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
173 rb->receivedmap);
176 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
178 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
181 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
183 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
186 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
187 size_t nr)
189 bitmap_set_atomic(rb->receivedmap,
190 ramblock_recv_bitmap_offset(host_addr, rb),
191 nr);
194 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
197 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
199 * Returns the number of bytes sent (>0) on success, or <0 on error.
201 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
202 const char *block_name)
204 RAMBlock *block = qemu_ram_block_by_name(block_name);
205 unsigned long *le_bitmap, nbits;
206 uint64_t size;
208 if (!block) {
209 error_report("%s: invalid block name: %s", __func__, block_name);
210 return -1;
213 nbits = block->used_length >> TARGET_PAGE_BITS;
216 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
217 * machines we may need 4 more bytes for padding (see below
218 * comment). So extend it a bit beforehand.
220 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
223 * Always use little endian when sending the bitmap. This is
224 * required when the source and destination VMs are not using the
225 * same endianness. (Note: big endian won't work.)
227 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
229 /* Size of the bitmap, in bytes */
230 size = nbits / 8;
233 * size is always aligned to 8 bytes for 64bit machines, but it
234 * may not be true for 32bit machines. We need this padding to
235 * make sure the migration can survive even between 32bit and
236 * 64bit machines.
238 size = ROUND_UP(size, 8);
240 qemu_put_be64(file, size);
241 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
243 * Mark the end, in case the middle part is corrupted for some
244 * "mysterious" reason.
246 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
247 qemu_fflush(file);
249 free(le_bitmap);
251 if (qemu_file_get_error(file)) {
252 return qemu_file_get_error(file);
255 return size + sizeof(size);
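/*
 * Wire layout example (sketch): for a block with 1 MiB of used_length and
 * 4 KiB target pages, nbits = 256 and size = 32 bytes (already a multiple
 * of 8), so the stream carries
 *
 *     be64: 32                      bitmap size in bytes
 *     32 bytes of little-endian bitmap
 *     be64: 0x0123456789abcdef      RAMBLOCK_RECV_BITMAP_ENDING
 *
 * and the function returns 32 + 8 = 40.
 */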
259 * An outstanding page request, on the source, having been received
260 * and queued
262 struct RAMSrcPageRequest {
263 RAMBlock *rb;
264 hwaddr offset;
265 hwaddr len;
267 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
270 /* State of RAM for migration */
271 struct RAMState {
272 /* QEMUFile used for this migration */
273 QEMUFile *f;
274 /* Last block that we have visited searching for dirty pages */
275 RAMBlock *last_seen_block;
276 /* Last block from where we have sent data */
277 RAMBlock *last_sent_block;
278 /* Last dirty target page we have sent */
279 ram_addr_t last_page;
280 /* last ram version we have seen */
281 uint32_t last_version;
282 /* We are in the first round */
283 bool ram_bulk_stage;
284 /* How many times we have dirty too many pages */
285 int dirty_rate_high_cnt;
286 /* these variables are used for bitmap sync */
287 /* last time we did a full bitmap_sync */
288 int64_t time_last_bitmap_sync;
289 /* bytes transferred at start_time */
290 uint64_t bytes_xfer_prev;
291 /* number of dirty pages since start_time */
292 uint64_t num_dirty_pages_period;
293 /* xbzrle misses since the beginning of the period */
294 uint64_t xbzrle_cache_miss_prev;
295 /* number of iterations at the beginning of period */
296 uint64_t iterations_prev;
297 /* Iterations since start */
298 uint64_t iterations;
299 /* number of dirty bits in the bitmap */
300 uint64_t migration_dirty_pages;
301 /* protects modification of the bitmap */
302 QemuMutex bitmap_mutex;
303 /* The RAMBlock used in the last src_page_requests */
304 RAMBlock *last_req_rb;
305 /* Queue of outstanding page requests from the destination */
306 QemuMutex src_page_req_mutex;
307 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
309 typedef struct RAMState RAMState;
311 static RAMState *ram_state;
313 uint64_t ram_bytes_remaining(void)
315 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
316 0;
319 MigrationStats ram_counters;
321 /* used by the search for pages to send */
322 struct PageSearchStatus {
323 /* Current block being searched */
324 RAMBlock *block;
325 /* Current page to search from */
326 unsigned long page;
327 /* Set once we wrap around */
328 bool complete_round;
330 typedef struct PageSearchStatus PageSearchStatus;
332 struct CompressParam {
333 bool done;
334 bool quit;
335 QEMUFile *file;
336 QemuMutex mutex;
337 QemuCond cond;
338 RAMBlock *block;
339 ram_addr_t offset;
341 /* internally used fields */
342 z_stream stream;
343 uint8_t *originbuf;
345 typedef struct CompressParam CompressParam;
347 struct DecompressParam {
348 bool done;
349 bool quit;
350 QemuMutex mutex;
351 QemuCond cond;
352 void *des;
353 uint8_t *compbuf;
354 int len;
355 z_stream stream;
357 typedef struct DecompressParam DecompressParam;
359 static CompressParam *comp_param;
360 static QemuThread *compress_threads;
361 /* comp_done_cond is used to wake up the migration thread when
362 * one of the compression threads has finished the compression.
363 * comp_done_lock is used together with comp_done_cond.
365 static QemuMutex comp_done_lock;
366 static QemuCond comp_done_cond;
367 /* The empty QEMUFileOps will be used by file in CompressParam */
368 static const QEMUFileOps empty_ops = { };
370 static QEMUFile *decomp_file;
371 static DecompressParam *decomp_param;
372 static QemuThread *decompress_threads;
373 static QemuMutex decomp_done_lock;
374 static QemuCond decomp_done_cond;
376 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
377 ram_addr_t offset, uint8_t *source_buf);
379 static void *do_data_compress(void *opaque)
381 CompressParam *param = opaque;
382 RAMBlock *block;
383 ram_addr_t offset;
385 qemu_mutex_lock(&param->mutex);
386 while (!param->quit) {
387 if (param->block) {
388 block = param->block;
389 offset = param->offset;
390 param->block = NULL;
391 qemu_mutex_unlock(&param->mutex);
393 do_compress_ram_page(param->file, &param->stream, block, offset,
394 param->originbuf);
396 qemu_mutex_lock(&comp_done_lock);
397 param->done = true;
398 qemu_cond_signal(&comp_done_cond);
399 qemu_mutex_unlock(&comp_done_lock);
401 qemu_mutex_lock(&param->mutex);
402 } else {
403 qemu_cond_wait(&param->cond, &param->mutex);
406 qemu_mutex_unlock(&param->mutex);
408 return NULL;
411 static inline void terminate_compression_threads(void)
413 int idx, thread_count;
415 thread_count = migrate_compress_threads();
417 for (idx = 0; idx < thread_count; idx++) {
418 qemu_mutex_lock(&comp_param[idx].mutex);
419 comp_param[idx].quit = true;
420 qemu_cond_signal(&comp_param[idx].cond);
421 qemu_mutex_unlock(&comp_param[idx].mutex);
425 static void compress_threads_save_cleanup(void)
427 int i, thread_count;
429 if (!migrate_use_compression()) {
430 return;
432 terminate_compression_threads();
433 thread_count = migrate_compress_threads();
434 for (i = 0; i < thread_count; i++) {
436 * we use it as an indicator of whether the thread is
437 * properly initialized or not
439 if (!comp_param[i].file) {
440 break;
442 qemu_thread_join(compress_threads + i);
443 qemu_mutex_destroy(&comp_param[i].mutex);
444 qemu_cond_destroy(&comp_param[i].cond);
445 deflateEnd(&comp_param[i].stream);
446 g_free(comp_param[i].originbuf);
447 qemu_fclose(comp_param[i].file);
448 comp_param[i].file = NULL;
450 qemu_mutex_destroy(&comp_done_lock);
451 qemu_cond_destroy(&comp_done_cond);
452 g_free(compress_threads);
453 g_free(comp_param);
454 compress_threads = NULL;
455 comp_param = NULL;
458 static int compress_threads_save_setup(void)
460 int i, thread_count;
462 if (!migrate_use_compression()) {
463 return 0;
465 thread_count = migrate_compress_threads();
466 compress_threads = g_new0(QemuThread, thread_count);
467 comp_param = g_new0(CompressParam, thread_count);
468 qemu_cond_init(&comp_done_cond);
469 qemu_mutex_init(&comp_done_lock);
470 for (i = 0; i < thread_count; i++) {
471 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
472 if (!comp_param[i].originbuf) {
473 goto exit;
476 if (deflateInit(&comp_param[i].stream,
477 migrate_compress_level()) != Z_OK) {
478 g_free(comp_param[i].originbuf);
479 goto exit;
482 /* comp_param[i].file is just used as a dummy buffer to save data,
483 * set its ops to empty.
485 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
486 comp_param[i].done = true;
487 comp_param[i].quit = false;
488 qemu_mutex_init(&comp_param[i].mutex);
489 qemu_cond_init(&comp_param[i].cond);
490 qemu_thread_create(compress_threads + i, "compress",
491 do_data_compress, comp_param + i,
492 QEMU_THREAD_JOINABLE);
494 return 0;
496 exit:
497 compress_threads_save_cleanup();
498 return -1;
501 /* Multiple fd's */
503 #define MULTIFD_MAGIC 0x11223344U
504 #define MULTIFD_VERSION 1
506 typedef struct {
507 uint32_t magic;
508 uint32_t version;
509 unsigned char uuid[16]; /* QemuUUID */
510 uint8_t id;
511 } __attribute__((packed)) MultiFDInit_t;
513 typedef struct {
514 /* these fields are not changed once the thread is created */
515 /* channel number */
516 uint8_t id;
517 /* channel thread name */
518 char *name;
519 /* channel thread id */
520 QemuThread thread;
521 /* communication channel */
522 QIOChannel *c;
523 /* sem where to wait for more work */
524 QemuSemaphore sem;
525 /* this mutex protects the following parameters */
526 QemuMutex mutex;
527 /* is this channel thread running */
528 bool running;
529 /* should this thread finish */
530 bool quit;
531 } MultiFDSendParams;
533 typedef struct {
534 /* these fields are not changed once the thread is created */
535 /* channel number */
536 uint8_t id;
537 /* channel thread name */
538 char *name;
539 /* channel thread id */
540 QemuThread thread;
541 /* communication channel */
542 QIOChannel *c;
543 /* sem where to wait for more work */
544 QemuSemaphore sem;
545 /* this mutex protects the following parameters */
546 QemuMutex mutex;
547 /* is this channel thread running */
548 bool running;
549 /* should this thread finish */
550 bool quit;
551 } MultiFDRecvParams;
553 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
555 MultiFDInit_t msg;
556 int ret;
558 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
559 msg.version = cpu_to_be32(MULTIFD_VERSION);
560 msg.id = p->id;
561 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
563 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
564 if (ret != 0) {
565 return -1;
567 return 0;
570 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
572 MultiFDInit_t msg;
573 int ret;
575 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
576 if (ret != 0) {
577 return -1;
580 be32_to_cpus(&msg.magic);
581 be32_to_cpus(&msg.version);
583 if (msg.magic != MULTIFD_MAGIC) {
584 error_setg(errp, "multifd: received packet magic %x "
585 "expected %x", msg.magic, MULTIFD_MAGIC);
586 return -1;
589 if (msg.version != MULTIFD_VERSION) {
590 error_setg(errp, "multifd: received packet version %d "
591 "expected %d", msg.version, MULTIFD_VERSION);
592 return -1;
595 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
596 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
597 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
599 error_setg(errp, "multifd: received uuid '%s' and expected "
600 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
601 g_free(uuid);
602 g_free(msg_uuid);
603 return -1;
606 if (msg.id > migrate_multifd_channels()) {
607 error_setg(errp, "multifd: received channel id %d "
608 "expected at most %d", msg.id, migrate_multifd_channels());
609 return -1;
612 return msg.id;
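/*
 * Example (sketch): because MultiFDInit_t is packed, the initial packet is
 * 25 bytes on the wire:
 *
 *     4 bytes   magic   0x11223344, big endian
 *     4 bytes   version 0x00000001, big endian
 *    16 bytes   uuid of the source VM
 *     1 byte    channel id
 *
 * The receiving side checks magic, version and uuid, and returns the
 * channel id on success.
 */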
615 struct {
616 MultiFDSendParams *params;
617 /* number of created threads */
618 int count;
619 } *multifd_send_state;
621 static void multifd_send_terminate_threads(Error *err)
623 int i;
625 if (err) {
626 MigrationState *s = migrate_get_current();
627 migrate_set_error(s, err);
628 if (s->state == MIGRATION_STATUS_SETUP ||
629 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
630 s->state == MIGRATION_STATUS_DEVICE ||
631 s->state == MIGRATION_STATUS_ACTIVE) {
632 migrate_set_state(&s->state, s->state,
633 MIGRATION_STATUS_FAILED);
637 for (i = 0; i < migrate_multifd_channels(); i++) {
638 MultiFDSendParams *p = &multifd_send_state->params[i];
640 qemu_mutex_lock(&p->mutex);
641 p->quit = true;
642 qemu_sem_post(&p->sem);
643 qemu_mutex_unlock(&p->mutex);
647 int multifd_save_cleanup(Error **errp)
649 int i;
650 int ret = 0;
652 if (!migrate_use_multifd()) {
653 return 0;
655 multifd_send_terminate_threads(NULL);
656 for (i = 0; i < migrate_multifd_channels(); i++) {
657 MultiFDSendParams *p = &multifd_send_state->params[i];
659 if (p->running) {
660 qemu_thread_join(&p->thread);
662 socket_send_channel_destroy(p->c);
663 p->c = NULL;
664 qemu_mutex_destroy(&p->mutex);
665 qemu_sem_destroy(&p->sem);
666 g_free(p->name);
667 p->name = NULL;
669 g_free(multifd_send_state->params);
670 multifd_send_state->params = NULL;
671 g_free(multifd_send_state);
672 multifd_send_state = NULL;
673 return ret;
676 static void *multifd_send_thread(void *opaque)
678 MultiFDSendParams *p = opaque;
679 Error *local_err = NULL;
681 if (multifd_send_initial_packet(p, &local_err) < 0) {
682 goto out;
685 while (true) {
686 qemu_mutex_lock(&p->mutex);
687 if (p->quit) {
688 qemu_mutex_unlock(&p->mutex);
689 break;
691 qemu_mutex_unlock(&p->mutex);
692 qemu_sem_wait(&p->sem);
695 out:
696 if (local_err) {
697 multifd_send_terminate_threads(local_err);
700 qemu_mutex_lock(&p->mutex);
701 p->running = false;
702 qemu_mutex_unlock(&p->mutex);
704 return NULL;
707 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
709 MultiFDSendParams *p = opaque;
710 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
711 Error *local_err = NULL;
713 if (qio_task_propagate_error(task, &local_err)) {
714 if (multifd_save_cleanup(&local_err) != 0) {
715 migrate_set_error(migrate_get_current(), local_err);
717 } else {
718 p->c = QIO_CHANNEL(sioc);
719 qio_channel_set_delay(p->c, false);
720 p->running = true;
721 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
722 QEMU_THREAD_JOINABLE);
724 atomic_inc(&multifd_send_state->count);
728 int multifd_save_setup(void)
730 int thread_count;
731 uint8_t i;
733 if (!migrate_use_multifd()) {
734 return 0;
736 thread_count = migrate_multifd_channels();
737 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
738 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
739 atomic_set(&multifd_send_state->count, 0);
740 for (i = 0; i < thread_count; i++) {
741 MultiFDSendParams *p = &multifd_send_state->params[i];
743 qemu_mutex_init(&p->mutex);
744 qemu_sem_init(&p->sem, 0);
745 p->quit = false;
746 p->id = i;
747 p->name = g_strdup_printf("multifdsend_%d", i);
748 socket_send_channel_create(multifd_new_send_channel_async, p);
750 return 0;
753 struct {
754 MultiFDRecvParams *params;
755 /* number of created threads */
756 int count;
757 } *multifd_recv_state;
759 static void multifd_recv_terminate_threads(Error *err)
761 int i;
763 if (err) {
764 MigrationState *s = migrate_get_current();
765 migrate_set_error(s, err);
766 if (s->state == MIGRATION_STATUS_SETUP ||
767 s->state == MIGRATION_STATUS_ACTIVE) {
768 migrate_set_state(&s->state, s->state,
769 MIGRATION_STATUS_FAILED);
773 for (i = 0; i < migrate_multifd_channels(); i++) {
774 MultiFDRecvParams *p = &multifd_recv_state->params[i];
776 qemu_mutex_lock(&p->mutex);
777 p->quit = true;
778 qemu_sem_post(&p->sem);
779 qemu_mutex_unlock(&p->mutex);
783 int multifd_load_cleanup(Error **errp)
785 int i;
786 int ret = 0;
788 if (!migrate_use_multifd()) {
789 return 0;
791 multifd_recv_terminate_threads(NULL);
792 for (i = 0; i < migrate_multifd_channels(); i++) {
793 MultiFDRecvParams *p = &multifd_recv_state->params[i];
795 if (p->running) {
796 qemu_thread_join(&p->thread);
798 object_unref(OBJECT(p->c));
799 p->c = NULL;
800 qemu_mutex_destroy(&p->mutex);
801 qemu_sem_destroy(&p->sem);
802 g_free(p->name);
803 p->name = NULL;
805 g_free(multifd_recv_state->params);
806 multifd_recv_state->params = NULL;
807 g_free(multifd_recv_state);
808 multifd_recv_state = NULL;
810 return ret;
813 static void *multifd_recv_thread(void *opaque)
815 MultiFDRecvParams *p = opaque;
817 while (true) {
818 qemu_mutex_lock(&p->mutex);
819 if (p->quit) {
820 qemu_mutex_unlock(&p->mutex);
821 break;
823 qemu_mutex_unlock(&p->mutex);
824 qemu_sem_wait(&p->sem);
827 qemu_mutex_lock(&p->mutex);
828 p->running = false;
829 qemu_mutex_unlock(&p->mutex);
831 return NULL;
834 int multifd_load_setup(void)
836 int thread_count;
837 uint8_t i;
839 if (!migrate_use_multifd()) {
840 return 0;
842 thread_count = migrate_multifd_channels();
843 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
844 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
845 atomic_set(&multifd_recv_state->count, 0);
846 for (i = 0; i < thread_count; i++) {
847 MultiFDRecvParams *p = &multifd_recv_state->params[i];
849 qemu_mutex_init(&p->mutex);
850 qemu_sem_init(&p->sem, 0);
851 p->quit = false;
852 p->id = i;
853 p->name = g_strdup_printf("multifdrecv_%d", i);
855 return 0;
858 bool multifd_recv_all_channels_created(void)
860 int thread_count = migrate_multifd_channels();
862 if (!migrate_use_multifd()) {
863 return true;
866 return thread_count == atomic_read(&multifd_recv_state->count);
869 void multifd_recv_new_channel(QIOChannel *ioc)
871 MultiFDRecvParams *p;
872 Error *local_err = NULL;
873 int id;
875 id = multifd_recv_initial_packet(ioc, &local_err);
876 if (id < 0) {
877 multifd_recv_terminate_threads(local_err);
878 return;
881 p = &multifd_recv_state->params[id];
882 if (p->c != NULL) {
883 error_setg(&local_err, "multifd: received id '%d' already setup'",
884 id);
885 multifd_recv_terminate_threads(local_err);
886 return;
888 p->c = ioc;
889 object_ref(OBJECT(ioc));
891 p->running = true;
892 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
893 QEMU_THREAD_JOINABLE);
894 atomic_inc(&multifd_recv_state->count);
895 if (multifd_recv_state->count == migrate_multifd_channels()) {
896 migration_incoming_process();
901 * save_page_header: write page header to wire
903 * If the page is not a continuation of the last sent block, it also writes the block identification
905 * Returns the number of bytes written
907 * @f: QEMUFile where to send the data
908 * @block: block that contains the page we want to send
909 * @offset: offset inside the block for the page;
910 * the lower bits contain flags
912 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
913 ram_addr_t offset)
915 size_t size, len;
917 if (block == rs->last_sent_block) {
918 offset |= RAM_SAVE_FLAG_CONTINUE;
920 qemu_put_be64(f, offset);
921 size = 8;
923 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
924 len = strlen(block->idstr);
925 qemu_put_byte(f, len);
926 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
927 size += 1 + len;
928 rs->last_sent_block = block;
930 return size;
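/*
 * Cost example (sketch): the first page sent from a block named "pc.ram"
 * takes 8 bytes (be64 offset + flags) + 1 byte (idstr length) + 6 bytes
 * (idstr) = 15 bytes of header; later pages from the same block carry
 * RAM_SAVE_FLAG_CONTINUE and need only the 8-byte header.
 */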
934 * mig_throttle_guest_down: throttle down the guest
936 * Reduce amount of guest cpu execution to hopefully slow down memory
937 * writes. If guest dirty memory rate is reduced below the rate at
938 * which we can transfer pages to the destination then we should be
939 * able to complete migration. Some workloads dirty memory way too
940 * fast and will not effectively converge, even with auto-converge.
942 static void mig_throttle_guest_down(void)
944 MigrationState *s = migrate_get_current();
945 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
946 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
948 /* We have not started throttling yet. Let's start it. */
949 if (!cpu_throttle_active()) {
950 cpu_throttle_set(pct_initial);
951 } else {
952 /* Throttling already on, just increase the rate */
953 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
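/*
 * Worked example (sketch, assuming cpu_throttle_initial=20 and
 * cpu_throttle_increment=10): the first call throttles the guest to 20%,
 * and each further call while throttling is active raises it to 30%, 40%
 * and so on, until the dirty rate drops below the transfer rate.
 */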
958 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
960 * @rs: current RAM state
961 * @current_addr: address for the zero page
963 * Update the xbzrle cache to reflect a page that's been sent as all 0.
964 * The important thing is that a stale (not-yet-0'd) page be replaced
965 * by the new data.
966 * As a bonus, if the page wasn't in the cache it gets added so that
967 * when a small write is made into the 0'd page it gets XBZRLE sent.
969 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
971 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
972 return;
975 /* We don't care if this fails to allocate a new cache page
976 * as long as it updated an old one */
977 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
978 ram_counters.dirty_sync_count);
981 #define ENCODING_FLAG_XBZRLE 0x1
984 * save_xbzrle_page: compress and send current page
986 * Returns: 1 means that we wrote the page
987 * 0 means that page is identical to the one already sent
988 * -1 means that xbzrle would be longer than normal
990 * @rs: current RAM state
991 * @current_data: pointer to the address of the page contents
992 * @current_addr: addr of the page
993 * @block: block that contains the page we want to send
994 * @offset: offset inside the block for the page
995 * @last_stage: if we are at the completion stage
997 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
998 ram_addr_t current_addr, RAMBlock *block,
999 ram_addr_t offset, bool last_stage)
1001 int encoded_len = 0, bytes_xbzrle;
1002 uint8_t *prev_cached_page;
1004 if (!cache_is_cached(XBZRLE.cache, current_addr,
1005 ram_counters.dirty_sync_count)) {
1006 xbzrle_counters.cache_miss++;
1007 if (!last_stage) {
1008 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1009 ram_counters.dirty_sync_count) == -1) {
1010 return -1;
1011 } else {
1012 /* update *current_data when the page has been
1013 inserted into cache */
1014 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1017 return -1;
1020 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1022 /* save current buffer into memory */
1023 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1025 /* XBZRLE encoding (if there is no overflow) */
1026 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1027 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1028 TARGET_PAGE_SIZE);
1029 if (encoded_len == 0) {
1030 trace_save_xbzrle_page_skipping();
1031 return 0;
1032 } else if (encoded_len == -1) {
1033 trace_save_xbzrle_page_overflow();
1034 xbzrle_counters.overflow++;
1035 /* update data in the cache */
1036 if (!last_stage) {
1037 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1038 *current_data = prev_cached_page;
1040 return -1;
1043 /* we need to update the data in the cache, so it matches what was sent */
1044 if (!last_stage) {
1045 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1048 /* Send XBZRLE based compressed page */
1049 bytes_xbzrle = save_page_header(rs, rs->f, block,
1050 offset | RAM_SAVE_FLAG_XBZRLE);
1051 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1052 qemu_put_be16(rs->f, encoded_len);
1053 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1054 bytes_xbzrle += encoded_len + 1 + 2;
1055 xbzrle_counters.pages++;
1056 xbzrle_counters.bytes += bytes_xbzrle;
1057 ram_counters.transferred += bytes_xbzrle;
1059 return 1;
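/*
 * Accounting example (sketch): bytes_xbzrle is the page header plus
 * 1 byte (ENCODING_FLAG_XBZRLE) + 2 bytes (be16 encoded_len) + encoded_len.
 * A continuing page whose delta encodes to 100 bytes therefore costs
 * 8 + 1 + 2 + 100 = 111 bytes instead of a full TARGET_PAGE_SIZE page.
 */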
1063 * migration_bitmap_find_dirty: find the next dirty page from start
1065 * Called with rcu_read_lock() to protect migration_bitmap
1067 * Returns the byte offset within memory region of the start of a dirty page
1069 * @rs: current RAM state
1070 * @rb: RAMBlock where to search for dirty pages
1071 * @start: page where we start the search
1073 static inline
1074 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1075 unsigned long start)
1077 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1078 unsigned long *bitmap = rb->bmap;
1079 unsigned long next;
1081 if (rs->ram_bulk_stage && start > 0) {
1082 next = start + 1;
1083 } else {
1084 next = find_next_bit(bitmap, size, start);
1087 return next;
1090 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1091 RAMBlock *rb,
1092 unsigned long page)
1094 bool ret;
1096 ret = test_and_clear_bit(page, rb->bmap);
1098 if (ret) {
1099 rs->migration_dirty_pages--;
1101 return ret;
1104 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1105 ram_addr_t start, ram_addr_t length)
1107 rs->migration_dirty_pages +=
1108 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1109 &rs->num_dirty_pages_period);
1113 * ram_pagesize_summary: calculate all the pagesizes of a VM
1115 * Returns a summary bitmap of the page sizes of all RAMBlocks
1117 * For VMs with just normal pages this is equivalent to the host page
1118 * size. If it's got some huge pages then it's the OR of all the
1119 * different page sizes.
1121 uint64_t ram_pagesize_summary(void)
1123 RAMBlock *block;
1124 uint64_t summary = 0;
1126 RAMBLOCK_FOREACH(block) {
1127 summary |= block->page_size;
1130 return summary;
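/*
 * Example (sketch): a guest whose RAM is split between normal 4 KiB pages
 * and a 2 MiB hugepage-backed region gets a summary of
 * 0x1000 | 0x200000 = 0x201000.
 */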
1133 static void migration_bitmap_sync(RAMState *rs)
1135 RAMBlock *block;
1136 int64_t end_time;
1137 uint64_t bytes_xfer_now;
1139 ram_counters.dirty_sync_count++;
1141 if (!rs->time_last_bitmap_sync) {
1142 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1145 trace_migration_bitmap_sync_start();
1146 memory_global_dirty_log_sync();
1148 qemu_mutex_lock(&rs->bitmap_mutex);
1149 rcu_read_lock();
1150 RAMBLOCK_FOREACH(block) {
1151 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1153 rcu_read_unlock();
1154 qemu_mutex_unlock(&rs->bitmap_mutex);
1156 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1158 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1160 /* more than 1 second = 1000 milliseconds */
1161 if (end_time > rs->time_last_bitmap_sync + 1000) {
1162 /* calculate period counters */
1163 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1164 / (end_time - rs->time_last_bitmap_sync);
1165 bytes_xfer_now = ram_counters.transferred;
1167 /* During block migration the auto-converge logic incorrectly detects
1168 * that ram migration makes no progress. Avoid this by disabling the
1169 * throttling logic during the bulk phase of block migration. */
1170 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1171 /* The following detection logic can be refined later. For now:
1172 Check to see if the dirtied bytes are 50% more than the approx.
1173 amount of bytes that just got transferred since the last time we
1174 were in this routine. If that happens twice, start or increase
1175 throttling */
1177 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1178 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1179 (++rs->dirty_rate_high_cnt >= 2)) {
1180 trace_migration_throttle();
1181 rs->dirty_rate_high_cnt = 0;
1182 mig_throttle_guest_down();
1186 if (migrate_use_xbzrle()) {
1187 if (rs->iterations_prev != rs->iterations) {
1188 xbzrle_counters.cache_miss_rate =
1189 (double)(xbzrle_counters.cache_miss -
1190 rs->xbzrle_cache_miss_prev) /
1191 (rs->iterations - rs->iterations_prev);
1193 rs->iterations_prev = rs->iterations;
1194 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1197 /* reset period counters */
1198 rs->time_last_bitmap_sync = end_time;
1199 rs->num_dirty_pages_period = 0;
1200 rs->bytes_xfer_prev = bytes_xfer_now;
1202 if (migrate_use_events()) {
1203 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
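/*
 * Rate example (sketch): if 50,000 pages were dirtied during a 2,000 ms
 * period, dirty_pages_rate becomes 50000 * 1000 / 2000 = 25,000 pages/s,
 * i.e. roughly 100 MB/s of newly dirtied memory with 4 KiB target pages.
 */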
1208 * save_zero_page: send the zero page to the stream
1210 * Returns the number of pages written.
1212 * @rs: current RAM state
1213 * @block: block that contains the page we want to send
1214 * @offset: offset inside the block for the page
1216 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1218 uint8_t *p = block->host + offset;
1219 int pages = -1;
1221 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1222 ram_counters.duplicate++;
1223 ram_counters.transferred +=
1224 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1225 qemu_put_byte(rs->f, 0);
1226 ram_counters.transferred += 1;
1227 pages = 1;
1230 return pages;
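/*
 * Cost example (sketch): a zero page costs its save_page_header() plus a
 * single zero byte, i.e. 9 bytes for a page that continues the current
 * block, regardless of TARGET_PAGE_SIZE.
 */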
1233 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1235 if (!migrate_release_ram() || !migration_in_postcopy()) {
1236 return;
1239 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1243 * @pages: the number of pages written by the control path,
1244 * < 0 - error
1245 * > 0 - number of pages written
1247 * Returns true if the page has been saved, otherwise false.
1249 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1250 int *pages)
1252 uint64_t bytes_xmit = 0;
1253 int ret;
1255 *pages = -1;
1256 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1257 &bytes_xmit);
1258 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1259 return false;
1262 if (bytes_xmit) {
1263 ram_counters.transferred += bytes_xmit;
1264 *pages = 1;
1267 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1268 return true;
1271 if (bytes_xmit > 0) {
1272 ram_counters.normal++;
1273 } else if (bytes_xmit == 0) {
1274 ram_counters.duplicate++;
1277 return true;
1281 * directly send the page to the stream
1283 * Returns the number of pages written.
1285 * @rs: current RAM state
1286 * @block: block that contains the page we want to send
1287 * @offset: offset inside the block for the page
1288 * @buf: the page to be sent
1289 * @async: send the page asynchronously
1291 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1292 uint8_t *buf, bool async)
1294 ram_counters.transferred += save_page_header(rs, rs->f, block,
1295 offset | RAM_SAVE_FLAG_PAGE);
1296 if (async) {
1297 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1298 migrate_release_ram() &
1299 migration_in_postcopy());
1300 } else {
1301 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1303 ram_counters.transferred += TARGET_PAGE_SIZE;
1304 ram_counters.normal++;
1305 return 1;
1309 * ram_save_page: send the given page to the stream
1311 * Returns the number of pages written.
1312 * < 0 - error
1313 * >=0 - Number of pages written - this might legally be 0
1314 * if xbzrle noticed the page was the same.
1316 * @rs: current RAM state
1317 * @block: block that contains the page we want to send
1318 * @offset: offset inside the block for the page
1319 * @last_stage: if we are at the completion stage
1321 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1323 int pages = -1;
1324 uint8_t *p;
1325 bool send_async = true;
1326 RAMBlock *block = pss->block;
1327 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1328 ram_addr_t current_addr = block->offset + offset;
1330 p = block->host + offset;
1331 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1333 XBZRLE_cache_lock();
1334 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1335 migrate_use_xbzrle()) {
1336 pages = save_xbzrle_page(rs, &p, current_addr, block,
1337 offset, last_stage);
1338 if (!last_stage) {
1339 /* Can't send this cached data async, since the cache page
1340 * might get updated before it gets to the wire
1342 send_async = false;
1346 /* XBZRLE overflow or normal page */
1347 if (pages == -1) {
1348 pages = save_normal_page(rs, block, offset, p, send_async);
1351 XBZRLE_cache_unlock();
1353 return pages;
1356 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1357 ram_addr_t offset, uint8_t *source_buf)
1359 RAMState *rs = ram_state;
1360 int bytes_sent, blen;
1361 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1363 bytes_sent = save_page_header(rs, f, block, offset |
1364 RAM_SAVE_FLAG_COMPRESS_PAGE);
1367 * copy it to an internal buffer to avoid it being modified by the VM,
1368 * so that we can catch any error during compression and
1369 * decompression
1371 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1372 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1373 if (blen < 0) {
1374 bytes_sent = 0;
1375 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1376 error_report("compressed data failed!");
1377 } else {
1378 bytes_sent += blen;
1379 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1382 return bytes_sent;
1385 static void flush_compressed_data(RAMState *rs)
1387 int idx, len, thread_count;
1389 if (!migrate_use_compression()) {
1390 return;
1392 thread_count = migrate_compress_threads();
1394 qemu_mutex_lock(&comp_done_lock);
1395 for (idx = 0; idx < thread_count; idx++) {
1396 while (!comp_param[idx].done) {
1397 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1400 qemu_mutex_unlock(&comp_done_lock);
1402 for (idx = 0; idx < thread_count; idx++) {
1403 qemu_mutex_lock(&comp_param[idx].mutex);
1404 if (!comp_param[idx].quit) {
1405 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1406 ram_counters.transferred += len;
1408 qemu_mutex_unlock(&comp_param[idx].mutex);
1412 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1413 ram_addr_t offset)
1415 param->block = block;
1416 param->offset = offset;
1419 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1420 ram_addr_t offset)
1422 int idx, thread_count, bytes_xmit = -1, pages = -1;
1424 thread_count = migrate_compress_threads();
1425 qemu_mutex_lock(&comp_done_lock);
1426 while (true) {
1427 for (idx = 0; idx < thread_count; idx++) {
1428 if (comp_param[idx].done) {
1429 comp_param[idx].done = false;
1430 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1431 qemu_mutex_lock(&comp_param[idx].mutex);
1432 set_compress_params(&comp_param[idx], block, offset);
1433 qemu_cond_signal(&comp_param[idx].cond);
1434 qemu_mutex_unlock(&comp_param[idx].mutex);
1435 pages = 1;
1436 ram_counters.normal++;
1437 ram_counters.transferred += bytes_xmit;
1438 break;
1441 if (pages > 0) {
1442 break;
1443 } else {
1444 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1447 qemu_mutex_unlock(&comp_done_lock);
1449 return pages;
1453 * find_dirty_block: find the next dirty page and update any state
1454 * associated with the search process.
1456 * Returns if a page is found
1458 * @rs: current RAM state
1459 * @pss: data about the state of the current dirty page scan
1460 * @again: set to false if the search has scanned the whole of RAM
1462 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1464 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1465 if (pss->complete_round && pss->block == rs->last_seen_block &&
1466 pss->page >= rs->last_page) {
1468 * We've been once around the RAM and haven't found anything.
1469 * Give up.
1471 *again = false;
1472 return false;
1474 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1475 /* Didn't find anything in this RAM Block */
1476 pss->page = 0;
1477 pss->block = QLIST_NEXT_RCU(pss->block, next);
1478 if (!pss->block) {
1479 /* Hit the end of the list */
1480 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1481 /* Flag that we've looped */
1482 pss->complete_round = true;
1483 rs->ram_bulk_stage = false;
1484 if (migrate_use_xbzrle()) {
1485 /* If xbzrle is on, stop using the data compression at this
1486 * point. In theory, xbzrle can do better than compression.
1488 flush_compressed_data(rs);
1491 /* Didn't find anything this time, but try again on the new block */
1492 *again = true;
1493 return false;
1494 } else {
1495 /* Can go around again, but... */
1496 *again = true;
1497 /* We've found something so probably don't need to */
1498 return true;
1503 * unqueue_page: gets a page off the queue
1505 * Helper for 'get_queued_page' - gets a page off the queue
1507 * Returns the block of the page (or NULL if none available)
1509 * @rs: current RAM state
1510 * @offset: used to return the offset within the RAMBlock
1512 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1514 RAMBlock *block = NULL;
1516 qemu_mutex_lock(&rs->src_page_req_mutex);
1517 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1518 struct RAMSrcPageRequest *entry =
1519 QSIMPLEQ_FIRST(&rs->src_page_requests);
1520 block = entry->rb;
1521 *offset = entry->offset;
1523 if (entry->len > TARGET_PAGE_SIZE) {
1524 entry->len -= TARGET_PAGE_SIZE;
1525 entry->offset += TARGET_PAGE_SIZE;
1526 } else {
1527 memory_region_unref(block->mr);
1528 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1529 g_free(entry);
1532 qemu_mutex_unlock(&rs->src_page_req_mutex);
1534 return block;
1538 * get_queued_page: unqueue a page from the postcopy requests
1540 * Skips pages that are already sent (!dirty)
1542 * Returns if a queued page is found
1544 * @rs: current RAM state
1545 * @pss: data about the state of the current dirty page scan
1547 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1549 RAMBlock *block;
1550 ram_addr_t offset;
1551 bool dirty;
1553 do {
1554 block = unqueue_page(rs, &offset);
1556 * We're sending this page, and since it's postcopy nothing else
1557 * will dirty it, and we must make sure it doesn't get sent again
1558 * even if this queue request was received after the background
1559 * search already sent it.
1561 if (block) {
1562 unsigned long page;
1564 page = offset >> TARGET_PAGE_BITS;
1565 dirty = test_bit(page, block->bmap);
1566 if (!dirty) {
1567 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1568 page, test_bit(page, block->unsentmap));
1569 } else {
1570 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1574 } while (block && !dirty);
1576 if (block) {
1578 * As soon as we start servicing pages out of order, then we have
1579 * to kill the bulk stage, since the bulk stage assumes
1580 * in (migration_bitmap_find_and_reset_dirty) that every page is
1581 * dirty, that's no longer true.
1583 rs->ram_bulk_stage = false;
1586 * We want the background search to continue from the queued page
1587 * since the guest is likely to want other pages near to the page
1588 * it just requested.
1590 pss->block = block;
1591 pss->page = offset >> TARGET_PAGE_BITS;
1594 return !!block;
1598 * migration_page_queue_free: drop any remaining pages in the ram
1599 * request queue
1601 * It should be empty at the end anyway, but in error cases there may
1602 * be some left. In case any pages are left, we drop them.
1605 static void migration_page_queue_free(RAMState *rs)
1607 struct RAMSrcPageRequest *mspr, *next_mspr;
1608 /* This queue generally should be empty - but in the case of a failed
1609 * migration it might have some entries left over.
1611 rcu_read_lock();
1612 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1613 memory_region_unref(mspr->rb->mr);
1614 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1615 g_free(mspr);
1617 rcu_read_unlock();
1621 * ram_save_queue_pages: queue the page for transmission
1623 * A request from postcopy destination for example.
1625 * Returns zero on success or negative on error
1627 * @rbname: Name of the RAMBlock of the request. NULL means the
1628 * same as the last one.
1629 * @start: starting address from the start of the RAMBlock
1630 * @len: length (in bytes) to send
1632 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1634 RAMBlock *ramblock;
1635 RAMState *rs = ram_state;
1637 ram_counters.postcopy_requests++;
1638 rcu_read_lock();
1639 if (!rbname) {
1640 /* Reuse last RAMBlock */
1641 ramblock = rs->last_req_rb;
1643 if (!ramblock) {
1645 * Shouldn't happen, we can't reuse the last RAMBlock if
1646 * it's the 1st request.
1648 error_report("ram_save_queue_pages no previous block");
1649 goto err;
1651 } else {
1652 ramblock = qemu_ram_block_by_name(rbname);
1654 if (!ramblock) {
1655 /* We shouldn't be asked for a non-existent RAMBlock */
1656 error_report("ram_save_queue_pages no block '%s'", rbname);
1657 goto err;
1659 rs->last_req_rb = ramblock;
1661 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1662 if (start+len > ramblock->used_length) {
1663 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1664 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1665 __func__, start, len, ramblock->used_length);
1666 goto err;
1669 struct RAMSrcPageRequest *new_entry =
1670 g_malloc0(sizeof(struct RAMSrcPageRequest));
1671 new_entry->rb = ramblock;
1672 new_entry->offset = start;
1673 new_entry->len = len;
1675 memory_region_ref(ramblock->mr);
1676 qemu_mutex_lock(&rs->src_page_req_mutex);
1677 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1678 qemu_mutex_unlock(&rs->src_page_req_mutex);
1679 rcu_read_unlock();
1681 return 0;
1683 err:
1684 rcu_read_unlock();
1685 return -1;
1688 static bool save_page_use_compression(RAMState *rs)
1690 if (!migrate_use_compression()) {
1691 return false;
1695 * If xbzrle is on, stop using the data compression after first
1696 * round of migration even if compression is enabled. In theory,
1697 * xbzrle can do better than compression.
1699 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1700 return true;
1703 return false;
1707 * ram_save_target_page: save one target page
1709 * Returns the number of pages written
1711 * @rs: current RAM state
1712 * @pss: data about the page we want to send
1713 * @last_stage: if we are at the completion stage
1715 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1716 bool last_stage)
1718 RAMBlock *block = pss->block;
1719 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1720 int res;
1722 if (control_save_page(rs, block, offset, &res)) {
1723 return res;
1727 * When starting the process of a new block, the first page of
1728 * the block should be sent out before other pages in the same
1729 * block, and all the pages in the last block should have been sent
1730 * out. Keeping this order is important, because the 'cont' flag
1731 * is used to avoid resending the block name.
1733 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1734 flush_compressed_data(rs);
1737 res = save_zero_page(rs, block, offset);
1738 if (res > 0) {
1739 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1740 * page would be stale
1742 if (!save_page_use_compression(rs)) {
1743 XBZRLE_cache_lock();
1744 xbzrle_cache_zero_page(rs, block->offset + offset);
1745 XBZRLE_cache_unlock();
1747 ram_release_pages(block->idstr, offset, res);
1748 return res;
1752 * Make sure the first page is sent out before other pages.
1754 * We post it as a normal page, as compression would take too much
1755 * CPU time.
1757 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1758 return compress_page_with_multi_thread(rs, block, offset);
1761 return ram_save_page(rs, pss, last_stage);
1765 * ram_save_host_page: save a whole host page
1767 * Starting at *offset send pages up to the end of the current host
1768 * page. It's valid for the initial offset to point into the middle of
1769 * a host page in which case the remainder of the hostpage is sent.
1770 * Only dirty target pages are sent. Note that the host page size may
1771 * be a huge page for this block.
1772 * The saving stops at the boundary of the used_length of the block
1773 * if the RAMBlock isn't a multiple of the host page size.
1775 * Returns the number of pages written or negative on error
1777 * @rs: current RAM state
1778 * @ms: current migration state
1779 * @pss: data about the page we want to send
1780 * @last_stage: if we are at the completion stage
1782 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1783 bool last_stage)
1785 int tmppages, pages = 0;
1786 size_t pagesize_bits =
1787 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1789 do {
1790 /* Check if the page is dirty and, if it is, send it */
1791 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1792 pss->page++;
1793 continue;
1796 tmppages = ram_save_target_page(rs, pss, last_stage);
1797 if (tmppages < 0) {
1798 return tmppages;
1801 pages += tmppages;
1802 if (pss->block->unsentmap) {
1803 clear_bit(pss->page, pss->block->unsentmap);
1806 pss->page++;
1807 } while ((pss->page & (pagesize_bits - 1)) &&
1808 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1810 /* The offset we leave with is the last one we looked at */
1811 pss->page--;
1812 return pages;
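/*
 * Example (sketch): with 2 MiB host huge pages and 4 KiB target pages,
 * pagesize_bits is 512, so a single call walks up to 512 consecutive
 * target pages; with normal 4 KiB host pages exactly one target page is
 * handled per call.
 */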
1816 * ram_find_and_save_block: finds a dirty page and sends it to f
1818 * Called within an RCU critical section.
1820 * Returns the number of pages written where zero means no dirty pages
1822 * @rs: current RAM state
1823 * @last_stage: if we are at the completion stage
1825 * On systems where host-page-size > target-page-size it will send all the
1826 * pages in a host page that are dirty.
1829 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1831 PageSearchStatus pss;
1832 int pages = 0;
1833 bool again, found;
1835 /* No dirty page as there is zero RAM */
1836 if (!ram_bytes_total()) {
1837 return pages;
1840 pss.block = rs->last_seen_block;
1841 pss.page = rs->last_page;
1842 pss.complete_round = false;
1844 if (!pss.block) {
1845 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1848 do {
1849 again = true;
1850 found = get_queued_page(rs, &pss);
1852 if (!found) {
1853 /* priority queue empty, so just search for something dirty */
1854 found = find_dirty_block(rs, &pss, &again);
1857 if (found) {
1858 pages = ram_save_host_page(rs, &pss, last_stage);
1860 } while (!pages && again);
1862 rs->last_seen_block = pss.block;
1863 rs->last_page = pss.page;
1865 return pages;
1868 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1870 uint64_t pages = size / TARGET_PAGE_SIZE;
1872 if (zero) {
1873 ram_counters.duplicate += pages;
1874 } else {
1875 ram_counters.normal += pages;
1876 ram_counters.transferred += size;
1877 qemu_update_position(f, size);
1881 uint64_t ram_bytes_total(void)
1883 RAMBlock *block;
1884 uint64_t total = 0;
1886 rcu_read_lock();
1887 RAMBLOCK_FOREACH(block) {
1888 total += block->used_length;
1890 rcu_read_unlock();
1891 return total;
1894 static void xbzrle_load_setup(void)
1896 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1899 static void xbzrle_load_cleanup(void)
1901 g_free(XBZRLE.decoded_buf);
1902 XBZRLE.decoded_buf = NULL;
1905 static void ram_state_cleanup(RAMState **rsp)
1907 if (*rsp) {
1908 migration_page_queue_free(*rsp);
1909 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1910 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1911 g_free(*rsp);
1912 *rsp = NULL;
1916 static void xbzrle_cleanup(void)
1918 XBZRLE_cache_lock();
1919 if (XBZRLE.cache) {
1920 cache_fini(XBZRLE.cache);
1921 g_free(XBZRLE.encoded_buf);
1922 g_free(XBZRLE.current_buf);
1923 g_free(XBZRLE.zero_target_page);
1924 XBZRLE.cache = NULL;
1925 XBZRLE.encoded_buf = NULL;
1926 XBZRLE.current_buf = NULL;
1927 XBZRLE.zero_target_page = NULL;
1929 XBZRLE_cache_unlock();
1932 static void ram_save_cleanup(void *opaque)
1934 RAMState **rsp = opaque;
1935 RAMBlock *block;
1937 /* the caller must hold the iothread lock or be in a bottom half, so there
1938 * is no write race against this migration_bitmap
1940 memory_global_dirty_log_stop();
1942 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1943 g_free(block->bmap);
1944 block->bmap = NULL;
1945 g_free(block->unsentmap);
1946 block->unsentmap = NULL;
1949 xbzrle_cleanup();
1950 compress_threads_save_cleanup();
1951 ram_state_cleanup(rsp);
1954 static void ram_state_reset(RAMState *rs)
1956 rs->last_seen_block = NULL;
1957 rs->last_sent_block = NULL;
1958 rs->last_page = 0;
1959 rs->last_version = ram_list.version;
1960 rs->ram_bulk_stage = true;
1963 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1966 * 'expected' is the value you expect the bitmap mostly to be full
1967 * of; it won't bother printing lines that are all this value.
1968 * If 'todump' is null the migration bitmap is dumped.
1970 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1971 unsigned long pages)
1973 int64_t cur;
1974 int64_t linelen = 128;
1975 char linebuf[129];
1977 for (cur = 0; cur < pages; cur += linelen) {
1978 int64_t curb;
1979 bool found = false;
1981 * Last line; catch the case where the line length
1982 * is longer than remaining ram
1984 if (cur + linelen > pages) {
1985 linelen = pages - cur;
1987 for (curb = 0; curb < linelen; curb++) {
1988 bool thisbit = test_bit(cur + curb, todump);
1989 linebuf[curb] = thisbit ? '1' : '.';
1990 found = found || (thisbit != expected);
1992 if (found) {
1993 linebuf[curb] = '\0';
1994 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1999 /* **** functions for postcopy ***** */
2001 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2003 struct RAMBlock *block;
2005 RAMBLOCK_FOREACH(block) {
2006 unsigned long *bitmap = block->bmap;
2007 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2008 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2010 while (run_start < range) {
2011 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2012 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2013 (run_end - run_start) << TARGET_PAGE_BITS);
2014 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2020 * postcopy_send_discard_bm_ram: discard a RAMBlock
2022 * Returns zero on success
2024 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2025 * Note: At this point the 'unsentmap' is the processed bitmap combined
2026 * with the dirtymap; so a '1' means it's either dirty or unsent.
2028 * @ms: current migration state
2029 * @pds: state for postcopy
2030 * @block: RAMBlock to discard
2033 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2034 PostcopyDiscardState *pds,
2035 RAMBlock *block)
2037 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2038 unsigned long current;
2039 unsigned long *unsentmap = block->unsentmap;
2041 for (current = 0; current < end; ) {
2042 unsigned long one = find_next_bit(unsentmap, end, current);
2044 if (one <= end) {
2045 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2046 unsigned long discard_length;
2048 if (zero >= end) {
2049 discard_length = end - one;
2050 } else {
2051 discard_length = zero - one;
2053 if (discard_length) {
2054 postcopy_discard_send_range(ms, pds, one, discard_length);
2056 current = one + discard_length;
2057 } else {
2058 current = one;
2062 return 0;
2066 * postcopy_each_ram_send_discard: discard all RAMBlocks
2068 * Returns 0 for success or negative for error
2070 * Utility for the outgoing postcopy code.
2071 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2072 * passing it bitmap indexes and name.
2073 * (qemu_ram_foreach_block ends up passing unscaled lengths
2074 * which would mean postcopy code would have to deal with target page)
2076 * @ms: current migration state
2078 static int postcopy_each_ram_send_discard(MigrationState *ms)
2080 struct RAMBlock *block;
2081 int ret;
2083 RAMBLOCK_FOREACH(block) {
2084 PostcopyDiscardState *pds =
2085 postcopy_discard_send_init(ms, block->idstr);
2088 * Postcopy sends chunks of bitmap over the wire, but it
2089 * just needs indexes at this point, avoids it having
2090 * target page specific code.
2092 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2093 postcopy_discard_send_finish(ms, pds);
2094 if (ret) {
2095 return ret;
2099 return 0;
2103 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2105 * Helper for postcopy_chunk_hostpages; it's called twice to
2106 * canonicalize the two bitmaps, that are similar, but one is
2107 * inverted.
2109 * Postcopy requires that all target pages in a hostpage are dirty or
2110 * clean, not a mix. This function canonicalizes the bitmaps.
2112 * @ms: current migration state
2113 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2114 * otherwise we need to canonicalize partially dirty host pages
2115 * @block: block that contains the page we want to canonicalize
2116 * @pds: state for postcopy
2118 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2119 RAMBlock *block,
2120 PostcopyDiscardState *pds)
2122 RAMState *rs = ram_state;
2123 unsigned long *bitmap = block->bmap;
2124 unsigned long *unsentmap = block->unsentmap;
2125 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2126 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2127 unsigned long run_start;
2129 if (block->page_size == TARGET_PAGE_SIZE) {
2130 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2131 return;
2134 if (unsent_pass) {
2135 /* Find a sent page */
2136 run_start = find_next_zero_bit(unsentmap, pages, 0);
2137 } else {
2138 /* Find a dirty page */
2139 run_start = find_next_bit(bitmap, pages, 0);
2142 while (run_start < pages) {
2143 bool do_fixup = false;
2144 unsigned long fixup_start_addr;
2145 unsigned long host_offset;
2148 * If the start of this run of pages is in the middle of a host
2149 * page, then we need to fixup this host page.
2151 host_offset = run_start % host_ratio;
2152 if (host_offset) {
2153 do_fixup = true;
2154 run_start -= host_offset;
2155 fixup_start_addr = run_start;
2156 /* For the next pass */
2157 run_start = run_start + host_ratio;
2158 } else {
2159 /* Find the end of this run */
2160 unsigned long run_end;
2161 if (unsent_pass) {
2162 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2163 } else {
2164 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2167 * If the end isn't at the start of a host page, then the
2168 * run doesn't finish at the end of a host page
2169 * and we need to discard.
2171 host_offset = run_end % host_ratio;
2172 if (host_offset) {
2173 do_fixup = true;
2174 fixup_start_addr = run_end - host_offset;
2176 * This host page has gone, the next loop iteration starts
2177 * from after the fixup
2179 run_start = fixup_start_addr + host_ratio;
2180 } else {
2182 * No discards on this iteration, next loop starts from
2183 * next sent/dirty page
2185 run_start = run_end + 1;
2189 if (do_fixup) {
2190 unsigned long page;
2192 /* Tell the destination to discard this page */
2193 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2194 /* For the unsent_pass we:
2195 * discard partially sent pages
2196 * For the !unsent_pass (dirty) we:
2197 * discard partially dirty pages that were sent
2198 * (any partially sent pages were already discarded
2199 * by the previous unsent_pass)
2201 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2202 host_ratio);
2205 /* Clean up the bitmap */
2206 for (page = fixup_start_addr;
2207 page < fixup_start_addr + host_ratio; page++) {
2208 /* All pages in this host page are now not sent */
2209 set_bit(page, unsentmap);
2212 * Remark them as dirty, updating the count for any pages
2213 * that weren't previously dirty.
2215 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2219 if (unsent_pass) {
2220 /* Find the next sent page for the next iteration */
2221 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2222 } else {
2223 /* Find the next dirty page for the next iteration */
2224 run_start = find_next_bit(bitmap, pages, run_start);
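/*
 * Worked example (illustrative, assuming 2MB host pages and 4KB target
 * pages, i.e. host_ratio == 512): if a run of dirty target pages starts
 * at page 513, it begins in the middle of the host page covering pages
 * 512..1023; the run start is rounded down to 512 and the fixup marks
 * all 512 target pages of that host page as unsent and dirty again,
 * sending a discard for the host page if part of it had been sent.
 */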
2230 * postcopy_chunk_hostpages: discard any partially sent host page
2232 * Utility for the outgoing postcopy code.
2234 * Discard any partially sent host-page size chunks, mark any partially
2235 * dirty host-page size chunks as all dirty. In this case the host-page
2236 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2238 * Returns zero on success
2240 * @ms: current migration state
2241 * @block: block we want to work with
2243 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2245 PostcopyDiscardState *pds =
2246 postcopy_discard_send_init(ms, block->idstr);
2248 /* First pass: Discard all partially sent host pages */
2249 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2251 * Second pass: Ensure that all partially dirty host pages are made
2252 * fully dirty.
2254 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2256 postcopy_discard_send_finish(ms, pds);
2257 return 0;
2261 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2263 * Returns zero on success
2265 * Transmit the set of pages to be discarded after precopy to the target;
2266 * these are pages that:
2267 * a) have previously been transmitted but are now dirty again, or
2268 * b) have never been transmitted; this ensures that any pages on the
2269 * destination that have been mapped by background tasks get discarded
2270 * (transparent huge pages are the specific concern).
2271 * Hopefully this set is pretty sparse.
2273 * @ms: current migration state
2275 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2277 RAMState *rs = ram_state;
2278 RAMBlock *block;
2279 int ret;
2281 rcu_read_lock();
2283 /* This should be our last sync, the src is now paused */
2284 migration_bitmap_sync(rs);
2286 /* Easiest way to make sure we don't resume in the middle of a host-page */
2287 rs->last_seen_block = NULL;
2288 rs->last_sent_block = NULL;
2289 rs->last_page = 0;
2291 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2292 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2293 unsigned long *bitmap = block->bmap;
2294 unsigned long *unsentmap = block->unsentmap;
2296 if (!unsentmap) {
2297 /* We don't have a safe way to resize the unsentmap, so
2298 * if the RAM block was resized it will be NULL at this
2299 * point.
2301 error_report("migration ram resized during precopy phase");
2302 rcu_read_unlock();
2303 return -EINVAL;
2305 /* Deal with TPS != HPS and huge pages */
2306 ret = postcopy_chunk_hostpages(ms, block);
2307 if (ret) {
2308 rcu_read_unlock();
2309 return ret;
2313 * Update the unsentmap to be unsentmap = unsentmap | dirty
2315 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2316 #ifdef DEBUG_POSTCOPY
2317 ram_debug_dump_bitmap(unsentmap, true, pages);
2318 #endif
2320 trace_ram_postcopy_send_discard_bitmap();
2322 ret = postcopy_each_ram_send_discard(ms);
2323 rcu_read_unlock();
2325 return ret;
2329 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2331 * Returns zero on success
2333 * @rbname: name of the RAMBlock of the request. NULL means the
2334 * same as the last one.
2335 * @start: starting offset within the RAMBlock, in bytes
2336 * @length: length of the range to discard, in bytes
2338 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2340 int ret = -1;
2342 trace_ram_discard_range(rbname, start, length);
2344 rcu_read_lock();
2345 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2347 if (!rb) {
2348 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2349 goto err;
2352 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2353 length >> qemu_target_page_bits());
2354 ret = ram_block_discard_range(rb, start, length);
2356 err:
2357 rcu_read_unlock();
2359 return ret;
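/*
 * Usage sketch (hypothetical values): discarding the first 2MB of a
 * block named "pc.ram" would look like
 *
 *     ram_discard_range("pc.ram", 0, 2 * 1024 * 1024);
 *
 * which clears the corresponding bits in rb->receivedmap and asks the
 * kernel to drop the backing pages via ram_block_discard_range().
 */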
2363 * For every allocation, we will try not to crash the VM if the
2364 * allocation fails.
2366 static int xbzrle_init(void)
2368 Error *local_err = NULL;
2370 if (!migrate_use_xbzrle()) {
2371 return 0;
2374 XBZRLE_cache_lock();
2376 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2377 if (!XBZRLE.zero_target_page) {
2378 error_report("%s: Error allocating zero page", __func__);
2379 goto err_out;
2382 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2383 TARGET_PAGE_SIZE, &local_err);
2384 if (!XBZRLE.cache) {
2385 error_report_err(local_err);
2386 goto free_zero_page;
2389 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2390 if (!XBZRLE.encoded_buf) {
2391 error_report("%s: Error allocating encoded_buf", __func__);
2392 goto free_cache;
2395 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2396 if (!XBZRLE.current_buf) {
2397 error_report("%s: Error allocating current_buf", __func__);
2398 goto free_encoded_buf;
2401 /* We are all good */
2402 XBZRLE_cache_unlock();
2403 return 0;
2405 free_encoded_buf:
2406 g_free(XBZRLE.encoded_buf);
2407 XBZRLE.encoded_buf = NULL;
2408 free_cache:
2409 cache_fini(XBZRLE.cache);
2410 XBZRLE.cache = NULL;
2411 free_zero_page:
2412 g_free(XBZRLE.zero_target_page);
2413 XBZRLE.zero_target_page = NULL;
2414 err_out:
2415 XBZRLE_cache_unlock();
2416 return -ENOMEM;
2419 static int ram_state_init(RAMState **rsp)
2421 *rsp = g_try_new0(RAMState, 1);
2423 if (!*rsp) {
2424 error_report("%s: Init ramstate fail", __func__);
2425 return -1;
2428 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2429 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2430 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2433 * Count the total number of pages used by ram blocks not including any
2434 * gaps due to alignment or unplugs.
2436 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2438 ram_state_reset(*rsp);
2440 return 0;
2443 static void ram_list_init_bitmaps(void)
2445 RAMBlock *block;
2446 unsigned long pages;
2448 /* Skip setting bitmap if there is no RAM */
2449 if (ram_bytes_total()) {
2450 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2451 pages = block->max_length >> TARGET_PAGE_BITS;
2452 block->bmap = bitmap_new(pages);
2453 bitmap_set(block->bmap, 0, pages);
2454 if (migrate_postcopy_ram()) {
2455 block->unsentmap = bitmap_new(pages);
2456 bitmap_set(block->unsentmap, 0, pages);
2462 static void ram_init_bitmaps(RAMState *rs)
2464 /* For memory_global_dirty_log_start below. */
2465 qemu_mutex_lock_iothread();
2466 qemu_mutex_lock_ramlist();
2467 rcu_read_lock();
2469 ram_list_init_bitmaps();
2470 memory_global_dirty_log_start();
2471 migration_bitmap_sync(rs);
2473 rcu_read_unlock();
2474 qemu_mutex_unlock_ramlist();
2475 qemu_mutex_unlock_iothread();
2478 static int ram_init_all(RAMState **rsp)
2480 if (ram_state_init(rsp)) {
2481 return -1;
2484 if (xbzrle_init()) {
2485 ram_state_cleanup(rsp);
2486 return -1;
2489 ram_init_bitmaps(*rsp);
2491 return 0;
2494 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2496 RAMBlock *block;
2497 uint64_t pages = 0;
2500 * Postcopy is not using xbzrle/compression, so no need for that.
2501 * Also, since the source is already halted, we don't need to care
2502 * about dirty page logging either.
2505 RAMBLOCK_FOREACH(block) {
2506 pages += bitmap_count_one(block->bmap,
2507 block->used_length >> TARGET_PAGE_BITS);
2510 /* This may not be aligned with current bitmaps. Recalculate. */
2511 rs->migration_dirty_pages = pages;
2513 rs->last_seen_block = NULL;
2514 rs->last_sent_block = NULL;
2515 rs->last_page = 0;
2516 rs->last_version = ram_list.version;
2518 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2519 * matter what we have sent.
2521 rs->ram_bulk_stage = false;
2523 /* Update RAMState cache of output QEMUFile */
2524 rs->f = out;
2526 trace_ram_state_resume_prepare(pages);
2530 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2531 * a long-running RCU critical section. When RCU reclaims in the code
2532 * start to become numerous it will be necessary to reduce the
2533 * granularity of these critical sections.
2537 * ram_save_setup: Setup RAM for migration
2539 * Returns zero to indicate success and negative for error
2541 * @f: QEMUFile where to send the data
2542 * @opaque: RAMState pointer
2544 static int ram_save_setup(QEMUFile *f, void *opaque)
2546 RAMState **rsp = opaque;
2547 RAMBlock *block;
2549 if (compress_threads_save_setup()) {
2550 return -1;
2553 /* migration has already setup the bitmap, reuse it. */
2554 if (!migration_in_colo_state()) {
2555 if (ram_init_all(rsp) != 0) {
2556 compress_threads_save_cleanup();
2557 return -1;
2560 (*rsp)->f = f;
2562 rcu_read_lock();
2564 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2566 RAMBLOCK_FOREACH(block) {
2567 qemu_put_byte(f, strlen(block->idstr));
2568 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2569 qemu_put_be64(f, block->used_length);
2570 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2571 qemu_put_be64(f, block->page_size);
2575 rcu_read_unlock();
2577 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2578 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2580 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2582 return 0;
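/*
 * Roughly, the setup section emitted above looks like this on the wire
 * (all integers big-endian):
 *
 *     be64: total ram size | RAM_SAVE_FLAG_MEM_SIZE
 *     for each RAMBlock:
 *         byte:  strlen(idstr)
 *         bytes: idstr (not NUL terminated)
 *         be64:  used_length
 *         be64:  page_size (only if postcopy is enabled and it differs
 *                from the host page size)
 *     be64: RAM_SAVE_FLAG_EOS
 */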
2586 * ram_save_iterate: iterative stage for migration
2588 * Returns zero to indicate success and negative for error
2590 * @f: QEMUFile where to send the data
2591 * @opaque: RAMState pointer
2593 static int ram_save_iterate(QEMUFile *f, void *opaque)
2595 RAMState **temp = opaque;
2596 RAMState *rs = *temp;
2597 int ret;
2598 int i;
2599 int64_t t0;
2600 int done = 0;
2602 if (blk_mig_bulk_active()) {
2603 /* Avoid transferring ram during bulk phase of block migration as
2604 * the bulk phase will usually take a long time and transferring
2605 * ram updates during that time is pointless. */
2606 goto out;
2609 rcu_read_lock();
2610 if (ram_list.version != rs->last_version) {
2611 ram_state_reset(rs);
2614 /* Read version before ram_list.blocks */
2615 smp_rmb();
2617 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2619 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2620 i = 0;
2621 while ((ret = qemu_file_rate_limit(f)) == 0) {
2622 int pages;
2624 pages = ram_find_and_save_block(rs, false);
2625 /* no more pages to send */
2626 if (pages == 0) {
2627 done = 1;
2628 break;
2630 rs->iterations++;
2632 /* we want to check in the 1st loop, just in case it was the 1st time
2633 and we had to sync the dirty bitmap.
2634 qemu_clock_get_ns() is a bit expensive, so we only check once every
2635 few iterations
2637 if ((i & 63) == 0) {
2638 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2639 if (t1 > MAX_WAIT) {
2640 trace_ram_save_iterate_big_wait(t1, i);
2641 break;
2644 i++;
2646 flush_compressed_data(rs);
2647 rcu_read_unlock();
2650 * Must occur before EOS (or any QEMUFile operation)
2651 * because of RDMA protocol.
2653 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2655 out:
2656 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2657 ram_counters.transferred += 8;
2659 ret = qemu_file_get_error(f);
2660 if (ret < 0) {
2661 return ret;
2664 return done;
2668 * ram_save_complete: function called to send the remaining amount of ram
2670 * Returns zero to indicate success
2672 * Called with iothread lock
2674 * @f: QEMUFile where to send the data
2675 * @opaque: RAMState pointer
2677 static int ram_save_complete(QEMUFile *f, void *opaque)
2679 RAMState **temp = opaque;
2680 RAMState *rs = *temp;
2682 rcu_read_lock();
2684 if (!migration_in_postcopy()) {
2685 migration_bitmap_sync(rs);
2688 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2690 /* try transferring iterative blocks of memory */
2692 /* flush all remaining blocks regardless of rate limiting */
2693 while (true) {
2694 int pages;
2696 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2697 /* no more blocks to send */
2698 if (pages == 0) {
2699 break;
2703 flush_compressed_data(rs);
2704 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2706 rcu_read_unlock();
2708 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2710 return 0;
2713 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2714 uint64_t *res_precopy_only,
2715 uint64_t *res_compatible,
2716 uint64_t *res_postcopy_only)
2718 RAMState **temp = opaque;
2719 RAMState *rs = *temp;
2720 uint64_t remaining_size;
2722 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2724 if (!migration_in_postcopy() &&
2725 remaining_size < max_size) {
2726 qemu_mutex_lock_iothread();
2727 rcu_read_lock();
2728 migration_bitmap_sync(rs);
2729 rcu_read_unlock();
2730 qemu_mutex_unlock_iothread();
2731 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2734 if (migrate_postcopy_ram()) {
2735 /* We can do postcopy, and all the data is postcopiable */
2736 *res_compatible += remaining_size;
2737 } else {
2738 *res_precopy_only += remaining_size;
2742 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2744 unsigned int xh_len;
2745 int xh_flags;
2746 uint8_t *loaded_data;
2748 /* extract RLE header */
2749 xh_flags = qemu_get_byte(f);
2750 xh_len = qemu_get_be16(f);
2752 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2753 error_report("Failed to load XBZRLE page - wrong compression!");
2754 return -1;
2757 if (xh_len > TARGET_PAGE_SIZE) {
2758 error_report("Failed to load XBZRLE page - len overflow!");
2759 return -1;
2761 loaded_data = XBZRLE.decoded_buf;
2762 /* load data and decode */
2763 /* it can change loaded_data to point to an internal buffer */
2764 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2766 /* decode RLE */
2767 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2768 TARGET_PAGE_SIZE) == -1) {
2769 error_report("Failed to load XBZRLE page - decode error!");
2770 return -1;
2773 return 0;
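/*
 * For reference, the XBZRLE payload parsed above is laid out as:
 *
 *     byte:  ENCODING_FLAG_XBZRLE
 *     be16:  encoded length (<= TARGET_PAGE_SIZE)
 *     bytes: encoded data, decoded against the current page contents
 */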
2777 * ram_block_from_stream: read a RAMBlock id from the migration stream
2779 * Must be called from within an RCU critical section.
2781 * Returns a pointer from within the RCU-protected ram_list.
2783 * @f: QEMUFile where to read the data from
2784 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2786 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2788 static RAMBlock *block = NULL;
2789 char id[256];
2790 uint8_t len;
2792 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2793 if (!block) {
2794 error_report("Ack, bad migration stream!");
2795 return NULL;
2797 return block;
2800 len = qemu_get_byte(f);
2801 qemu_get_buffer(f, (uint8_t *)id, len);
2802 id[len] = 0;
2804 block = qemu_ram_block_by_name(id);
2805 if (!block) {
2806 error_report("Can't find block %s", id);
2807 return NULL;
2810 return block;
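/*
 * In other words: a page header without RAM_SAVE_FLAG_CONTINUE
 * re-identifies the block as a one-byte length followed by that many
 * idstr bytes; with RAM_SAVE_FLAG_CONTINUE set, the previously cached
 * block is reused.
 */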
2813 static inline void *host_from_ram_block_offset(RAMBlock *block,
2814 ram_addr_t offset)
2816 if (!offset_in_ramblock(block, offset)) {
2817 return NULL;
2820 return block->host + offset;
2824 * ram_handle_compressed: handle the zero page case
2826 * If a page (or a whole RDMA chunk) has been
2827 * determined to be zero, then zap it.
2829 * @host: host address for the zero page
2830 * @ch: the byte the page is filled with; we only support zero
2831 * @size: size of the zero page
2833 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2835 if (ch != 0 || !is_zero_range(host, size)) {
2836 memset(host, ch, size);
2840 /* return the size after decompression, or a negative value on error */
2841 static int
2842 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2843 const uint8_t *source, size_t source_len)
2845 int err;
2847 err = inflateReset(stream);
2848 if (err != Z_OK) {
2849 return -1;
2852 stream->avail_in = source_len;
2853 stream->next_in = (uint8_t *)source;
2854 stream->avail_out = dest_len;
2855 stream->next_out = dest;
2857 err = inflate(stream, Z_NO_FLUSH);
2858 if (err != Z_STREAM_END) {
2859 return -1;
2862 return stream->total_out;
2865 static void *do_data_decompress(void *opaque)
2867 DecompressParam *param = opaque;
2868 unsigned long pagesize;
2869 uint8_t *des;
2870 int len, ret;
2872 qemu_mutex_lock(&param->mutex);
2873 while (!param->quit) {
2874 if (param->des) {
2875 des = param->des;
2876 len = param->len;
2877 param->des = 0;
2878 qemu_mutex_unlock(&param->mutex);
2880 pagesize = TARGET_PAGE_SIZE;
2882 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2883 param->compbuf, len);
2884 if (ret < 0) {
2885 error_report("decompress data failed");
2886 qemu_file_set_error(decomp_file, ret);
2889 qemu_mutex_lock(&decomp_done_lock);
2890 param->done = true;
2891 qemu_cond_signal(&decomp_done_cond);
2892 qemu_mutex_unlock(&decomp_done_lock);
2894 qemu_mutex_lock(&param->mutex);
2895 } else {
2896 qemu_cond_wait(&param->cond, &param->mutex);
2899 qemu_mutex_unlock(&param->mutex);
2901 return NULL;
2904 static int wait_for_decompress_done(void)
2906 int idx, thread_count;
2908 if (!migrate_use_compression()) {
2909 return 0;
2912 thread_count = migrate_decompress_threads();
2913 qemu_mutex_lock(&decomp_done_lock);
2914 for (idx = 0; idx < thread_count; idx++) {
2915 while (!decomp_param[idx].done) {
2916 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2919 qemu_mutex_unlock(&decomp_done_lock);
2920 return qemu_file_get_error(decomp_file);
2923 static void compress_threads_load_cleanup(void)
2925 int i, thread_count;
2927 if (!migrate_use_compression()) {
2928 return;
2930 thread_count = migrate_decompress_threads();
2931 for (i = 0; i < thread_count; i++) {
2933 * we use it as an indicator of whether the thread was
2934 * properly initialized or not
2936 if (!decomp_param[i].compbuf) {
2937 break;
2940 qemu_mutex_lock(&decomp_param[i].mutex);
2941 decomp_param[i].quit = true;
2942 qemu_cond_signal(&decomp_param[i].cond);
2943 qemu_mutex_unlock(&decomp_param[i].mutex);
2945 for (i = 0; i < thread_count; i++) {
2946 if (!decomp_param[i].compbuf) {
2947 break;
2950 qemu_thread_join(decompress_threads + i);
2951 qemu_mutex_destroy(&decomp_param[i].mutex);
2952 qemu_cond_destroy(&decomp_param[i].cond);
2953 inflateEnd(&decomp_param[i].stream);
2954 g_free(decomp_param[i].compbuf);
2955 decomp_param[i].compbuf = NULL;
2957 g_free(decompress_threads);
2958 g_free(decomp_param);
2959 decompress_threads = NULL;
2960 decomp_param = NULL;
2961 decomp_file = NULL;
2964 static int compress_threads_load_setup(QEMUFile *f)
2966 int i, thread_count;
2968 if (!migrate_use_compression()) {
2969 return 0;
2972 thread_count = migrate_decompress_threads();
2973 decompress_threads = g_new0(QemuThread, thread_count);
2974 decomp_param = g_new0(DecompressParam, thread_count);
2975 qemu_mutex_init(&decomp_done_lock);
2976 qemu_cond_init(&decomp_done_cond);
2977 decomp_file = f;
2978 for (i = 0; i < thread_count; i++) {
2979 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2980 goto exit;
2983 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2984 qemu_mutex_init(&decomp_param[i].mutex);
2985 qemu_cond_init(&decomp_param[i].cond);
2986 decomp_param[i].done = true;
2987 decomp_param[i].quit = false;
2988 qemu_thread_create(decompress_threads + i, "decompress",
2989 do_data_decompress, decomp_param + i,
2990 QEMU_THREAD_JOINABLE);
2992 return 0;
2993 exit:
2994 compress_threads_load_cleanup();
2995 return -1;
2998 static void decompress_data_with_multi_threads(QEMUFile *f,
2999 void *host, int len)
3001 int idx, thread_count;
3003 thread_count = migrate_decompress_threads();
3004 qemu_mutex_lock(&decomp_done_lock);
3005 while (true) {
3006 for (idx = 0; idx < thread_count; idx++) {
3007 if (decomp_param[idx].done) {
3008 decomp_param[idx].done = false;
3009 qemu_mutex_lock(&decomp_param[idx].mutex);
3010 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3011 decomp_param[idx].des = host;
3012 decomp_param[idx].len = len;
3013 qemu_cond_signal(&decomp_param[idx].cond);
3014 qemu_mutex_unlock(&decomp_param[idx].mutex);
3015 break;
3018 if (idx < thread_count) {
3019 break;
3020 } else {
3021 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3024 qemu_mutex_unlock(&decomp_done_lock);
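/*
 * The handshake with do_data_decompress() above is a simple
 * producer/consumer scheme: this function picks an idle worker
 * (done == true), clears done, fills compbuf/des/len under the param
 * mutex and signals param->cond; the worker decompresses the page and
 * then sets done and signals decomp_done_cond under decomp_done_lock,
 * which is what this function and wait_for_decompress_done() wait on.
 */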
3028 * ram_load_setup: Setup RAM for migration incoming side
3030 * Returns zero to indicate success and negative for error
3032 * @f: QEMUFile where to receive the data
3033 * @opaque: RAMState pointer
3035 static int ram_load_setup(QEMUFile *f, void *opaque)
3037 if (compress_threads_load_setup(f)) {
3038 return -1;
3041 xbzrle_load_setup();
3042 ramblock_recv_map_init();
3043 return 0;
3046 static int ram_load_cleanup(void *opaque)
3048 RAMBlock *rb;
3049 xbzrle_load_cleanup();
3050 compress_threads_load_cleanup();
3052 RAMBLOCK_FOREACH(rb) {
3053 g_free(rb->receivedmap);
3054 rb->receivedmap = NULL;
3056 return 0;
3060 * ram_postcopy_incoming_init: allocate postcopy data structures
3062 * Returns 0 for success and negative if there was an error
3064 * @mis: current migration incoming state
3066 * Allocate data structures etc needed by incoming migration with
3067 * postcopy-ram. postcopy-ram's similarly named
3068 * postcopy_ram_incoming_init does the work.
3070 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3072 unsigned long ram_pages = last_ram_page();
3074 return postcopy_ram_incoming_init(mis, ram_pages);
3078 * ram_load_postcopy: load a page in postcopy case
3080 * Returns 0 for success or -errno in case of error
3082 * Called in postcopy mode by ram_load().
3083 * rcu_read_lock is taken prior to this being called.
3085 * @f: QEMUFile where to receive the data
3087 static int ram_load_postcopy(QEMUFile *f)
3089 int flags = 0, ret = 0;
3090 bool place_needed = false;
3091 bool matching_page_sizes = false;
3092 MigrationIncomingState *mis = migration_incoming_get_current();
3093 /* Temporary page that is later 'placed' */
3094 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3095 void *last_host = NULL;
3096 bool all_zero = false;
3098 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3099 ram_addr_t addr;
3100 void *host = NULL;
3101 void *page_buffer = NULL;
3102 void *place_source = NULL;
3103 RAMBlock *block = NULL;
3104 uint8_t ch;
3106 addr = qemu_get_be64(f);
3109 * If the QEMUFile hit an error, we should stop here; "addr"
3110 * may be invalid in that case
3112 ret = qemu_file_get_error(f);
3113 if (ret) {
3114 break;
3117 flags = addr & ~TARGET_PAGE_MASK;
3118 addr &= TARGET_PAGE_MASK;
3120 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3121 place_needed = false;
3122 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3123 block = ram_block_from_stream(f, flags);
3125 host = host_from_ram_block_offset(block, addr);
3126 if (!host) {
3127 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3128 ret = -EINVAL;
3129 break;
3131 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3133 * Postcopy requires that we place whole host pages atomically;
3134 * these may be huge pages for RAMBlocks that are backed by
3135 * hugetlbfs.
3136 * To make it atomic, the data is read into a temporary page
3137 * that's moved into place later.
3138 * The migration protocol uses, possibly smaller, target-pages
3139 * however the source ensures it always sends all the components
3140 * of a host page in order.
3142 page_buffer = postcopy_host_page +
3143 ((uintptr_t)host & (block->page_size - 1));
3144 /* If all TP are zero then we can optimise the place */
3145 if (!((uintptr_t)host & (block->page_size - 1))) {
3146 all_zero = true;
3147 } else {
3148 /* not the 1st TP within the HP */
3149 if (host != (last_host + TARGET_PAGE_SIZE)) {
3150 error_report("Non-sequential target page %p/%p",
3151 host, last_host);
3152 ret = -EINVAL;
3153 break;
3159 * If it's the last part of a host page then we place the host
3160 * page
3162 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3163 (block->page_size - 1)) == 0;
3164 place_source = postcopy_host_page;
3166 last_host = host;
3168 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3169 case RAM_SAVE_FLAG_ZERO:
3170 ch = qemu_get_byte(f);
3171 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3172 if (ch) {
3173 all_zero = false;
3175 break;
3177 case RAM_SAVE_FLAG_PAGE:
3178 all_zero = false;
3179 if (!place_needed || !matching_page_sizes) {
3180 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3181 } else {
3182 /* Avoid the extra copy into the temporary page: postcopy is
3183 * going to copy the data into place later anyway; this only
3184 * works when the host page is read in one go (matching page sizes)
3186 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3187 TARGET_PAGE_SIZE);
3189 break;
3190 case RAM_SAVE_FLAG_EOS:
3191 /* normal exit */
3192 break;
3193 default:
3194 error_report("Unknown combination of migration flags: %#x"
3195 " (postcopy mode)", flags);
3196 ret = -EINVAL;
3197 break;
3200 /* Detect for any possible file errors */
3201 if (!ret && qemu_file_get_error(f)) {
3202 ret = qemu_file_get_error(f);
3205 if (!ret && place_needed) {
3206 /* This gets called at the last target page in the host page */
3207 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3209 if (all_zero) {
3210 ret = postcopy_place_page_zero(mis, place_dest,
3211 block);
3212 } else {
3213 ret = postcopy_place_page(mis, place_dest,
3214 place_source, block);
3219 return ret;
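/*
 * Illustrative example (assuming a 2MB hugetlbfs-backed RAMBlock and
 * 4KB target pages): the source sends the 512 target pages of one host
 * page back to back; each is copied into postcopy_host_page at its
 * offset, and only when the last one arrives is place_needed set and
 * the whole 2MB page atomically placed with postcopy_place_page() (or
 * postcopy_place_page_zero() if every target page of it was zero).
 */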
3222 static bool postcopy_is_advised(void)
3224 PostcopyState ps = postcopy_state_get();
3225 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3228 static bool postcopy_is_running(void)
3230 PostcopyState ps = postcopy_state_get();
3231 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3234 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3236 int flags = 0, ret = 0, invalid_flags = 0;
3237 static uint64_t seq_iter;
3238 int len = 0;
3240 * If system is running in postcopy mode, page inserts to host memory must
3241 * be atomic
3243 bool postcopy_running = postcopy_is_running();
3244 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3245 bool postcopy_advised = postcopy_is_advised();
3247 seq_iter++;
3249 if (version_id != 4) {
3250 ret = -EINVAL;
3253 if (!migrate_use_compression()) {
3254 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3256 /* This RCU critical section can be very long running.
3257 * When RCU reclaims in the code start to become numerous,
3258 * it will be necessary to reduce the granularity of this
3259 * critical section.
3261 rcu_read_lock();
3263 if (postcopy_running) {
3264 ret = ram_load_postcopy(f);
3267 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3268 ram_addr_t addr, total_ram_bytes;
3269 void *host = NULL;
3270 uint8_t ch;
3272 addr = qemu_get_be64(f);
3273 flags = addr & ~TARGET_PAGE_MASK;
3274 addr &= TARGET_PAGE_MASK;
3276 if (flags & invalid_flags) {
3277 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3278 error_report("Received an unexpected compressed page");
3281 ret = -EINVAL;
3282 break;
3285 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3286 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3287 RAMBlock *block = ram_block_from_stream(f, flags);
3289 host = host_from_ram_block_offset(block, addr);
3290 if (!host) {
3291 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3292 ret = -EINVAL;
3293 break;
3295 ramblock_recv_bitmap_set(block, host);
3296 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3299 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3300 case RAM_SAVE_FLAG_MEM_SIZE:
3301 /* Synchronize RAM block list */
3302 total_ram_bytes = addr;
3303 while (!ret && total_ram_bytes) {
3304 RAMBlock *block;
3305 char id[256];
3306 ram_addr_t length;
3308 len = qemu_get_byte(f);
3309 qemu_get_buffer(f, (uint8_t *)id, len);
3310 id[len] = 0;
3311 length = qemu_get_be64(f);
3313 block = qemu_ram_block_by_name(id);
3314 if (block) {
3315 if (length != block->used_length) {
3316 Error *local_err = NULL;
3318 ret = qemu_ram_resize(block, length,
3319 &local_err);
3320 if (local_err) {
3321 error_report_err(local_err);
3324 /* For postcopy we need to check hugepage sizes match */
3325 if (postcopy_advised &&
3326 block->page_size != qemu_host_page_size) {
3327 uint64_t remote_page_size = qemu_get_be64(f);
3328 if (remote_page_size != block->page_size) {
3329 error_report("Mismatched RAM page size %s "
3330 "(local) %zd != %" PRId64,
3331 id, block->page_size,
3332 remote_page_size);
3333 ret = -EINVAL;
3336 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3337 block->idstr);
3338 } else {
3339 error_report("Unknown ramblock \"%s\", cannot "
3340 "accept migration", id);
3341 ret = -EINVAL;
3344 total_ram_bytes -= length;
3346 break;
3348 case RAM_SAVE_FLAG_ZERO:
3349 ch = qemu_get_byte(f);
3350 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3351 break;
3353 case RAM_SAVE_FLAG_PAGE:
3354 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3355 break;
3357 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3358 len = qemu_get_be32(f);
3359 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3360 error_report("Invalid compressed data length: %d", len);
3361 ret = -EINVAL;
3362 break;
3364 decompress_data_with_multi_threads(f, host, len);
3365 break;
3367 case RAM_SAVE_FLAG_XBZRLE:
3368 if (load_xbzrle(f, addr, host) < 0) {
3369 error_report("Failed to decompress XBZRLE page at "
3370 RAM_ADDR_FMT, addr);
3371 ret = -EINVAL;
3372 break;
3374 break;
3375 case RAM_SAVE_FLAG_EOS:
3376 /* normal exit */
3377 break;
3378 default:
3379 if (flags & RAM_SAVE_FLAG_HOOK) {
3380 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3381 } else {
3382 error_report("Unknown combination of migration flags: %#x",
3383 flags);
3384 ret = -EINVAL;
3387 if (!ret) {
3388 ret = qemu_file_get_error(f);
3392 ret |= wait_for_decompress_done();
3393 rcu_read_unlock();
3394 trace_ram_load_complete(ret, seq_iter);
3395 return ret;
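/*
 * Sketch of the precopy stream parsed above: each record starts with a
 * be64 whose sub-page bits are RAM_SAVE_FLAG_* and whose upper bits are
 * the page address (or, for MEM_SIZE, the total RAM size), followed by
 * a flag-dependent payload:
 *
 *     MEM_SIZE:       per-block idstr/length list (see ram_save_setup)
 *     ZERO:           one fill byte (in practice zero)
 *     PAGE:           TARGET_PAGE_SIZE raw bytes
 *     COMPRESS_PAGE:  be32 length + zlib data
 *     XBZRLE:         see load_xbzrle()
 *     EOS:            end of section
 */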
3398 static bool ram_has_postcopy(void *opaque)
3400 return migrate_postcopy_ram();
3403 /* Sync all the dirty bitmaps with the destination VM. */
3404 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3406 RAMBlock *block;
3407 QEMUFile *file = s->to_dst_file;
3408 int ramblock_count = 0;
3410 trace_ram_dirty_bitmap_sync_start();
3412 RAMBLOCK_FOREACH(block) {
3413 qemu_savevm_send_recv_bitmap(file, block->idstr);
3414 trace_ram_dirty_bitmap_request(block->idstr);
3415 ramblock_count++;
3418 trace_ram_dirty_bitmap_sync_wait();
3420 /* Wait until all the ramblocks' dirty bitmaps are synced */
3421 while (ramblock_count--) {
3422 qemu_sem_wait(&s->rp_state.rp_sem);
3425 trace_ram_dirty_bitmap_sync_complete();
3427 return 0;
3430 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3432 qemu_sem_post(&s->rp_state.rp_sem);
3436 * Read the received bitmap, revert it as the initial dirty bitmap.
3437 * This is only used when the postcopy migration is paused but wants
3438 * to resume from a middle point.
3440 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3442 int ret = -EINVAL;
3443 QEMUFile *file = s->rp_state.from_dst_file;
3444 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3445 uint64_t local_size = nbits / 8;
3446 uint64_t size, end_mark;
3448 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3450 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3451 error_report("%s: incorrect state %s", __func__,
3452 MigrationStatus_str(s->state));
3453 return -EINVAL;
3457 * Note: see comments in ramblock_recv_bitmap_send() on why we
3458 * need the endianness conversion, and the padding.
3460 local_size = ROUND_UP(local_size, 8);
3462 /* Add paddings */
3463 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3465 size = qemu_get_be64(file);
3467 /* The size of the bitmap should match that of our ramblock */
3468 if (size != local_size) {
3469 error_report("%s: ramblock '%s' bitmap size mismatch "
3470 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3471 block->idstr, size, local_size);
3472 ret = -EINVAL;
3473 goto out;
3476 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3477 end_mark = qemu_get_be64(file);
3479 ret = qemu_file_get_error(file);
3480 if (ret || size != local_size) {
3481 error_report("%s: read bitmap failed for ramblock '%s': %d"
3482 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3483 __func__, block->idstr, ret, local_size, size);
3484 ret = -EIO;
3485 goto out;
3488 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3489 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3490 __func__, block->idstr, end_mark);
3491 ret = -EINVAL;
3492 goto out;
3496 * Endianness conversion. We are in postcopy (though paused).
3497 * The dirty bitmap won't change, so we can modify it directly.
3499 bitmap_from_le(block->bmap, le_bitmap, nbits);
3502 * What we received is "received bitmap". Revert it as the initial
3503 * dirty bitmap for this ramblock.
3505 bitmap_complement(block->bmap, block->bmap, nbits);
3507 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3510 * We successfully synced the bitmap for the current ramblock. If
3511 * this is the last one to sync, we need to notify the main send thread.
3513 ram_dirty_bitmap_reload_notify(s);
3515 ret = 0;
3516 out:
3517 g_free(le_bitmap);
3518 return ret;
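/*
 * For reference, the bitmap message consumed above is expected to be:
 *
 *     be64:  bitmap size in bytes (the block's bitmap size rounded up
 *            to a multiple of 8 bytes)
 *     bytes: the received bitmap, little-endian, padded
 *     be64:  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * as produced by ramblock_recv_bitmap_send() on the destination side.
 */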
3521 static int ram_resume_prepare(MigrationState *s, void *opaque)
3523 RAMState *rs = *(RAMState **)opaque;
3524 int ret;
3526 ret = ram_dirty_bitmap_sync_all(s, rs);
3527 if (ret) {
3528 return ret;
3531 ram_state_resume_prepare(rs, s->to_dst_file);
3533 return 0;
3536 static SaveVMHandlers savevm_ram_handlers = {
3537 .save_setup = ram_save_setup,
3538 .save_live_iterate = ram_save_iterate,
3539 .save_live_complete_postcopy = ram_save_complete,
3540 .save_live_complete_precopy = ram_save_complete,
3541 .has_postcopy = ram_has_postcopy,
3542 .save_live_pending = ram_save_pending,
3543 .load_state = ram_load,
3544 .save_cleanup = ram_save_cleanup,
3545 .load_setup = ram_load_setup,
3546 .load_cleanup = ram_load_cleanup,
3547 .resume_prepare = ram_resume_prepare,
3550 void ram_mig_init(void)
3552 qemu_mutex_init(&XBZRLE.lock);
3553 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);