migration: discard non-migratable RAMBlocks
[qemu/ar7.git] / migration / ram.c
blob 290b281446753e34df454bf840adafdeef79a92c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "migration/block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
57 #include "savevm.h"
59 /***********************************************************/
60 /* ram save/restore */
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value. And to avoid confusion with
65 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
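/*
 * Rough sketch (illustrative only, not part of the original file): the flags
 * above are OR'ed into the low bits of the target-page-aligned offset before
 * the combined value is written to the stream, as save_page_header() below
 * does:
 *
 *     offset |= RAM_SAVE_FLAG_CONTINUE;
 *     qemu_put_be64(f, offset);
 *
 * On the receive side the low bits are masked back out to recover the flags
 * and the page offset separately.
 */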
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 return buffer_is_zero(p, size);
83 XBZRLECacheStats xbzrle_counters;
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
101 static void XBZRLE_cache_lock(void)
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
107 static void XBZRLE_cache_unlock(void)
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
114 * xbzrle_cache_resize: resize the xbzrle cache
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by the XBZRLE.lock mutex.
121 * Returns 0 for success or -1 for error
123 * @new_size: new cache size
124 * @errp: set *errp with the reason if the check fails
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 PageCache *new_cache;
129 int64_t ret = 0;
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
143 XBZRLE_cache_lock();
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
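/*
 * Usage sketch (illustrative only; exact monitor syntax depends on the QEMU
 * version): this path is normally reached through the cache-size commands
 * mentioned above, e.g. from the HMP monitor:
 *
 *     (qemu) migrate_set_cache_size 512M
 *
 * which typically ends up in qmp_migrate_set_cache_size() and then here.
 */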
160 /* Should be holding either ram_list.mutex, or the RCU lock. */
161 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
162 RAMBLOCK_FOREACH(block) \
163 if (!qemu_ram_is_migratable(block)) {} else
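/*
 * Rough usage sketch (mirrors ramblock_recv_map_init() just below): the
 * macro is meant to be used like a plain foreach, e.g.
 *
 *     RAMBlock *rb;
 *     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 *         ... only blocks accepted by qemu_ram_is_migratable() ...
 *     }
 *
 * The trailing "if (!...) {} else" keeps the macro a single statement so the
 * caller's braced body binds to the else branch.
 */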
165 static void ramblock_recv_map_init(void)
167 RAMBlock *rb;
169 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
170 assert(!rb->receivedmap);
171 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
175 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
177 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
178 rb->receivedmap);
181 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
183 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
186 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
188 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
191 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
192 size_t nr)
194 bitmap_set_atomic(rb->receivedmap,
195 ramblock_recv_bitmap_offset(host_addr, rb),
196 nr);
199 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
202 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
204 * Returns the number of bytes sent (>0) on success, or <0 on error.
206 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
207 const char *block_name)
209 RAMBlock *block = qemu_ram_block_by_name(block_name);
210 unsigned long *le_bitmap, nbits;
211 uint64_t size;
213 if (!block) {
214 error_report("%s: invalid block name: %s", __func__, block_name);
215 return -1;
218 nbits = block->used_length >> TARGET_PAGE_BITS;
221 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
222 * machines we may need 4 more bytes for padding (see the comment
223 * below). So extend it a bit beforehand.
225 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
228 * Always use little endian when sending the bitmap. This is
229 * required when source and destination VMs are not using the
230 * same endianness. (Note: big endian won't work.)
232 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
234 /* Size of the bitmap, in bytes */
235 size = nbits / 8;
238 * size is always aligned to 8 bytes for 64bit machines, but that
239 * may not be true for 32bit machines. We need this padding to
240 * make sure the migration can survive even between 32bit and
241 * 64bit machines.
243 size = ROUND_UP(size, 8);
245 qemu_put_be64(file, size);
246 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
248 * Mark as an end, in case the middle part is screwed up for
249 * some mysterious reason.
251 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
252 qemu_fflush(file);
254 g_free(le_bitmap);
256 if (qemu_file_get_error(file)) {
257 return qemu_file_get_error(file);
260 return size + sizeof(size);
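/*
 * Rough picture of what the code above puts on the wire (sizes in bytes):
 *
 *     [ size : 8, be64 ][ le_bitmap : size ][ RAMBLOCK_RECV_BITMAP_ENDING : 8 ]
 *
 * where size is rounded up to a multiple of 8 so that 32-bit and 64-bit
 * peers agree on the length.
 */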
264 * An outstanding page request, on the source, having been received
265 * and queued
267 struct RAMSrcPageRequest {
268 RAMBlock *rb;
269 hwaddr offset;
270 hwaddr len;
272 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
275 /* State of RAM for migration */
276 struct RAMState {
277 /* QEMUFile used for this migration */
278 QEMUFile *f;
279 /* Last block that we have visited searching for dirty pages */
280 RAMBlock *last_seen_block;
281 /* Last block from where we have sent data */
282 RAMBlock *last_sent_block;
283 /* Last dirty target page we have sent */
284 ram_addr_t last_page;
285 /* last ram version we have seen */
286 uint32_t last_version;
287 /* We are in the first round */
288 bool ram_bulk_stage;
289 /* How many times we have dirty too many pages */
290 int dirty_rate_high_cnt;
291 /* these variables are used for bitmap sync */
292 /* last time we did a full bitmap_sync */
293 int64_t time_last_bitmap_sync;
294 /* bytes transferred at start_time */
295 uint64_t bytes_xfer_prev;
296 /* number of dirty pages since start_time */
297 uint64_t num_dirty_pages_period;
298 /* xbzrle misses since the beginning of the period */
299 uint64_t xbzrle_cache_miss_prev;
300 /* number of iterations at the beginning of period */
301 uint64_t iterations_prev;
302 /* Iterations since start */
303 uint64_t iterations;
304 /* number of dirty bits in the bitmap */
305 uint64_t migration_dirty_pages;
306 /* protects modification of the bitmap */
307 QemuMutex bitmap_mutex;
308 /* The RAMBlock used in the last src_page_requests */
309 RAMBlock *last_req_rb;
310 /* Queue of outstanding page requests from the destination */
311 QemuMutex src_page_req_mutex;
312 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
314 typedef struct RAMState RAMState;
316 static RAMState *ram_state;
318 uint64_t ram_bytes_remaining(void)
320 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
324 MigrationStats ram_counters;
326 /* used by the search for pages to send */
327 struct PageSearchStatus {
328 /* Current block being searched */
329 RAMBlock *block;
330 /* Current page to search from */
331 unsigned long page;
332 /* Set once we wrap around */
333 bool complete_round;
335 typedef struct PageSearchStatus PageSearchStatus;
337 struct CompressParam {
338 bool done;
339 bool quit;
340 QEMUFile *file;
341 QemuMutex mutex;
342 QemuCond cond;
343 RAMBlock *block;
344 ram_addr_t offset;
346 /* internally used fields */
347 z_stream stream;
348 uint8_t *originbuf;
350 typedef struct CompressParam CompressParam;
352 struct DecompressParam {
353 bool done;
354 bool quit;
355 QemuMutex mutex;
356 QemuCond cond;
357 void *des;
358 uint8_t *compbuf;
359 int len;
360 z_stream stream;
362 typedef struct DecompressParam DecompressParam;
364 static CompressParam *comp_param;
365 static QemuThread *compress_threads;
366 /* comp_done_cond is used to wake up the migration thread when
367 * one of the compression threads has finished the compression.
368 * comp_done_lock is used together with comp_done_cond.
370 static QemuMutex comp_done_lock;
371 static QemuCond comp_done_cond;
372 /* The empty QEMUFileOps will be used by file in CompressParam */
373 static const QEMUFileOps empty_ops = { };
375 static QEMUFile *decomp_file;
376 static DecompressParam *decomp_param;
377 static QemuThread *decompress_threads;
378 static QemuMutex decomp_done_lock;
379 static QemuCond decomp_done_cond;
381 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
382 ram_addr_t offset, uint8_t *source_buf);
384 static void *do_data_compress(void *opaque)
386 CompressParam *param = opaque;
387 RAMBlock *block;
388 ram_addr_t offset;
390 qemu_mutex_lock(&param->mutex);
391 while (!param->quit) {
392 if (param->block) {
393 block = param->block;
394 offset = param->offset;
395 param->block = NULL;
396 qemu_mutex_unlock(&param->mutex);
398 do_compress_ram_page(param->file, &param->stream, block, offset,
399 param->originbuf);
401 qemu_mutex_lock(&comp_done_lock);
402 param->done = true;
403 qemu_cond_signal(&comp_done_cond);
404 qemu_mutex_unlock(&comp_done_lock);
406 qemu_mutex_lock(&param->mutex);
407 } else {
408 qemu_cond_wait(&param->cond, &param->mutex);
411 qemu_mutex_unlock(&param->mutex);
413 return NULL;
416 static inline void terminate_compression_threads(void)
418 int idx, thread_count;
420 thread_count = migrate_compress_threads();
422 for (idx = 0; idx < thread_count; idx++) {
423 qemu_mutex_lock(&comp_param[idx].mutex);
424 comp_param[idx].quit = true;
425 qemu_cond_signal(&comp_param[idx].cond);
426 qemu_mutex_unlock(&comp_param[idx].mutex);
430 static void compress_threads_save_cleanup(void)
432 int i, thread_count;
434 if (!migrate_use_compression()) {
435 return;
437 terminate_compression_threads();
438 thread_count = migrate_compress_threads();
439 for (i = 0; i < thread_count; i++) {
441 * we use it as an indicator of whether the thread has been
442 * properly initialized or not
444 if (!comp_param[i].file) {
445 break;
447 qemu_thread_join(compress_threads + i);
448 qemu_mutex_destroy(&comp_param[i].mutex);
449 qemu_cond_destroy(&comp_param[i].cond);
450 deflateEnd(&comp_param[i].stream);
451 g_free(comp_param[i].originbuf);
452 qemu_fclose(comp_param[i].file);
453 comp_param[i].file = NULL;
455 qemu_mutex_destroy(&comp_done_lock);
456 qemu_cond_destroy(&comp_done_cond);
457 g_free(compress_threads);
458 g_free(comp_param);
459 compress_threads = NULL;
460 comp_param = NULL;
463 static int compress_threads_save_setup(void)
465 int i, thread_count;
467 if (!migrate_use_compression()) {
468 return 0;
470 thread_count = migrate_compress_threads();
471 compress_threads = g_new0(QemuThread, thread_count);
472 comp_param = g_new0(CompressParam, thread_count);
473 qemu_cond_init(&comp_done_cond);
474 qemu_mutex_init(&comp_done_lock);
475 for (i = 0; i < thread_count; i++) {
476 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
477 if (!comp_param[i].originbuf) {
478 goto exit;
481 if (deflateInit(&comp_param[i].stream,
482 migrate_compress_level()) != Z_OK) {
483 g_free(comp_param[i].originbuf);
484 goto exit;
487 /* comp_param[i].file is just used as a dummy buffer to save data,
488 * set its ops to empty.
490 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
491 comp_param[i].done = true;
492 comp_param[i].quit = false;
493 qemu_mutex_init(&comp_param[i].mutex);
494 qemu_cond_init(&comp_param[i].cond);
495 qemu_thread_create(compress_threads + i, "compress",
496 do_data_compress, comp_param + i,
497 QEMU_THREAD_JOINABLE);
499 return 0;
501 exit:
502 compress_threads_save_cleanup();
503 return -1;
506 /* Multiple fd's */
508 #define MULTIFD_MAGIC 0x11223344U
509 #define MULTIFD_VERSION 1
511 typedef struct {
512 uint32_t magic;
513 uint32_t version;
514 unsigned char uuid[16]; /* QemuUUID */
515 uint8_t id;
516 } __attribute__((packed)) MultiFDInit_t;
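/*
 * Rough on-the-wire layout of the packed struct above (25 bytes total):
 *
 *     magic   : 4 bytes, big endian (MULTIFD_MAGIC)
 *     version : 4 bytes, big endian (MULTIFD_VERSION)
 *     uuid    : 16 bytes (QemuUUID of the source)
 *     id      : 1 byte  (channel number)
 *
 * This is the handshake packet written by multifd_send_initial_packet()
 * and parsed by multifd_recv_initial_packet() below.
 */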
518 typedef struct {
519 /* these fields are not changed once the thread is created */
520 /* channel number */
521 uint8_t id;
522 /* channel thread name */
523 char *name;
524 /* channel thread id */
525 QemuThread thread;
526 /* communication channel */
527 QIOChannel *c;
528 /* sem where to wait for more work */
529 QemuSemaphore sem;
530 /* this mutex protects the following parameters */
531 QemuMutex mutex;
532 /* is this channel thread running */
533 bool running;
534 /* should this thread finish */
535 bool quit;
536 } MultiFDSendParams;
538 typedef struct {
539 /* these fields are not changed once the thread is created */
540 /* channel number */
541 uint8_t id;
542 /* channel thread name */
543 char *name;
544 /* channel thread id */
545 QemuThread thread;
546 /* communication channel */
547 QIOChannel *c;
548 /* sem where to wait for more work */
549 QemuSemaphore sem;
550 /* this mutex protects the following parameters */
551 QemuMutex mutex;
552 /* is this channel thread running */
553 bool running;
554 /* should this thread finish */
555 bool quit;
556 } MultiFDRecvParams;
558 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
560 MultiFDInit_t msg;
561 int ret;
563 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
564 msg.version = cpu_to_be32(MULTIFD_VERSION);
565 msg.id = p->id;
566 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
568 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
569 if (ret != 0) {
570 return -1;
572 return 0;
575 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
577 MultiFDInit_t msg;
578 int ret;
580 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
581 if (ret != 0) {
582 return -1;
585 be32_to_cpus(&msg.magic);
586 be32_to_cpus(&msg.version);
588 if (msg.magic != MULTIFD_MAGIC) {
589 error_setg(errp, "multifd: received packet magic %x "
590 "expected %x", msg.magic, MULTIFD_MAGIC);
591 return -1;
594 if (msg.version != MULTIFD_VERSION) {
595 error_setg(errp, "multifd: received packet version %d "
596 "expected %d", msg.version, MULTIFD_VERSION);
597 return -1;
600 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
601 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
602 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
604 error_setg(errp, "multifd: received uuid '%s' and expected "
605 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
606 g_free(uuid);
607 g_free(msg_uuid);
608 return -1;
611 if (msg.id > migrate_multifd_channels()) {
612 error_setg(errp, "multifd: received channel id %d is greater "
613 "than number of channels %d", msg.id, migrate_multifd_channels());
614 return -1;
617 return msg.id;
620 struct {
621 MultiFDSendParams *params;
622 /* number of created threads */
623 int count;
624 } *multifd_send_state;
626 static void multifd_send_terminate_threads(Error *err)
628 int i;
630 if (err) {
631 MigrationState *s = migrate_get_current();
632 migrate_set_error(s, err);
633 if (s->state == MIGRATION_STATUS_SETUP ||
634 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
635 s->state == MIGRATION_STATUS_DEVICE ||
636 s->state == MIGRATION_STATUS_ACTIVE) {
637 migrate_set_state(&s->state, s->state,
638 MIGRATION_STATUS_FAILED);
642 for (i = 0; i < migrate_multifd_channels(); i++) {
643 MultiFDSendParams *p = &multifd_send_state->params[i];
645 qemu_mutex_lock(&p->mutex);
646 p->quit = true;
647 qemu_sem_post(&p->sem);
648 qemu_mutex_unlock(&p->mutex);
652 int multifd_save_cleanup(Error **errp)
654 int i;
655 int ret = 0;
657 if (!migrate_use_multifd()) {
658 return 0;
660 multifd_send_terminate_threads(NULL);
661 for (i = 0; i < migrate_multifd_channels(); i++) {
662 MultiFDSendParams *p = &multifd_send_state->params[i];
664 if (p->running) {
665 qemu_thread_join(&p->thread);
667 socket_send_channel_destroy(p->c);
668 p->c = NULL;
669 qemu_mutex_destroy(&p->mutex);
670 qemu_sem_destroy(&p->sem);
671 g_free(p->name);
672 p->name = NULL;
674 g_free(multifd_send_state->params);
675 multifd_send_state->params = NULL;
676 g_free(multifd_send_state);
677 multifd_send_state = NULL;
678 return ret;
681 static void *multifd_send_thread(void *opaque)
683 MultiFDSendParams *p = opaque;
684 Error *local_err = NULL;
686 if (multifd_send_initial_packet(p, &local_err) < 0) {
687 goto out;
690 while (true) {
691 qemu_mutex_lock(&p->mutex);
692 if (p->quit) {
693 qemu_mutex_unlock(&p->mutex);
694 break;
696 qemu_mutex_unlock(&p->mutex);
697 qemu_sem_wait(&p->sem);
700 out:
701 if (local_err) {
702 multifd_send_terminate_threads(local_err);
705 qemu_mutex_lock(&p->mutex);
706 p->running = false;
707 qemu_mutex_unlock(&p->mutex);
709 return NULL;
712 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
714 MultiFDSendParams *p = opaque;
715 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
716 Error *local_err = NULL;
718 if (qio_task_propagate_error(task, &local_err)) {
719 if (multifd_save_cleanup(&local_err) != 0) {
720 migrate_set_error(migrate_get_current(), local_err);
722 } else {
723 p->c = QIO_CHANNEL(sioc);
724 qio_channel_set_delay(p->c, false);
725 p->running = true;
726 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
727 QEMU_THREAD_JOINABLE);
729 atomic_inc(&multifd_send_state->count);
733 int multifd_save_setup(void)
735 int thread_count;
736 uint8_t i;
738 if (!migrate_use_multifd()) {
739 return 0;
741 thread_count = migrate_multifd_channels();
742 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
743 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
744 atomic_set(&multifd_send_state->count, 0);
745 for (i = 0; i < thread_count; i++) {
746 MultiFDSendParams *p = &multifd_send_state->params[i];
748 qemu_mutex_init(&p->mutex);
749 qemu_sem_init(&p->sem, 0);
750 p->quit = false;
751 p->id = i;
752 p->name = g_strdup_printf("multifdsend_%d", i);
753 socket_send_channel_create(multifd_new_send_channel_async, p);
755 return 0;
758 struct {
759 MultiFDRecvParams *params;
760 /* number of created threads */
761 int count;
762 } *multifd_recv_state;
764 static void multifd_recv_terminate_threads(Error *err)
766 int i;
768 if (err) {
769 MigrationState *s = migrate_get_current();
770 migrate_set_error(s, err);
771 if (s->state == MIGRATION_STATUS_SETUP ||
772 s->state == MIGRATION_STATUS_ACTIVE) {
773 migrate_set_state(&s->state, s->state,
774 MIGRATION_STATUS_FAILED);
778 for (i = 0; i < migrate_multifd_channels(); i++) {
779 MultiFDRecvParams *p = &multifd_recv_state->params[i];
781 qemu_mutex_lock(&p->mutex);
782 p->quit = true;
783 qemu_sem_post(&p->sem);
784 qemu_mutex_unlock(&p->mutex);
788 int multifd_load_cleanup(Error **errp)
790 int i;
791 int ret = 0;
793 if (!migrate_use_multifd()) {
794 return 0;
796 multifd_recv_terminate_threads(NULL);
797 for (i = 0; i < migrate_multifd_channels(); i++) {
798 MultiFDRecvParams *p = &multifd_recv_state->params[i];
800 if (p->running) {
801 qemu_thread_join(&p->thread);
803 object_unref(OBJECT(p->c));
804 p->c = NULL;
805 qemu_mutex_destroy(&p->mutex);
806 qemu_sem_destroy(&p->sem);
807 g_free(p->name);
808 p->name = NULL;
810 g_free(multifd_recv_state->params);
811 multifd_recv_state->params = NULL;
812 g_free(multifd_recv_state);
813 multifd_recv_state = NULL;
815 return ret;
818 static void *multifd_recv_thread(void *opaque)
820 MultiFDRecvParams *p = opaque;
822 while (true) {
823 qemu_mutex_lock(&p->mutex);
824 if (p->quit) {
825 qemu_mutex_unlock(&p->mutex);
826 break;
828 qemu_mutex_unlock(&p->mutex);
829 qemu_sem_wait(&p->sem);
832 qemu_mutex_lock(&p->mutex);
833 p->running = false;
834 qemu_mutex_unlock(&p->mutex);
836 return NULL;
839 int multifd_load_setup(void)
841 int thread_count;
842 uint8_t i;
844 if (!migrate_use_multifd()) {
845 return 0;
847 thread_count = migrate_multifd_channels();
848 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
849 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
850 atomic_set(&multifd_recv_state->count, 0);
851 for (i = 0; i < thread_count; i++) {
852 MultiFDRecvParams *p = &multifd_recv_state->params[i];
854 qemu_mutex_init(&p->mutex);
855 qemu_sem_init(&p->sem, 0);
856 p->quit = false;
857 p->id = i;
858 p->name = g_strdup_printf("multifdrecv_%d", i);
860 return 0;
863 bool multifd_recv_all_channels_created(void)
865 int thread_count = migrate_multifd_channels();
867 if (!migrate_use_multifd()) {
868 return true;
871 return thread_count == atomic_read(&multifd_recv_state->count);
874 void multifd_recv_new_channel(QIOChannel *ioc)
876 MultiFDRecvParams *p;
877 Error *local_err = NULL;
878 int id;
880 id = multifd_recv_initial_packet(ioc, &local_err);
881 if (id < 0) {
882 multifd_recv_terminate_threads(local_err);
883 return;
886 p = &multifd_recv_state->params[id];
887 if (p->c != NULL) {
888 error_setg(&local_err, "multifd: received id '%d' already setup",
889 id);
890 multifd_recv_terminate_threads(local_err);
891 return;
893 p->c = ioc;
894 object_ref(OBJECT(ioc));
896 p->running = true;
897 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
898 QEMU_THREAD_JOINABLE);
899 atomic_inc(&multifd_recv_state->count);
900 if (multifd_recv_state->count == migrate_multifd_channels()) {
901 migration_incoming_process();
906 * save_page_header: write page header to wire
908 * If this is not a continuation of the previous block, it also writes the block identification
910 * Returns the number of bytes written
912 * @f: QEMUFile where to send the data
913 * @block: block that contains the page we want to send
914 * @offset: offset inside the block for the page;
915 * the lower bits contain flags
917 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
918 ram_addr_t offset)
920 size_t size, len;
922 if (block == rs->last_sent_block) {
923 offset |= RAM_SAVE_FLAG_CONTINUE;
925 qemu_put_be64(f, offset);
926 size = 8;
928 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
929 len = strlen(block->idstr);
930 qemu_put_byte(f, len);
931 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
932 size += 1 + len;
933 rs->last_sent_block = block;
935 return size;
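/*
 * Rough sketch of the record header written above:
 *
 *     [ offset | flags : 8 bytes, be64 ]
 *     [ idstr length : 1 byte ][ idstr : length bytes ]  (only when the
 *                               RAM_SAVE_FLAG_CONTINUE flag is not set)
 *
 * so pages that continue the previous block cost only the 8-byte header.
 */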
939 * mig_throttle_guest_down: throttle down the guest
941 * Reduce amount of guest cpu execution to hopefully slow down memory
942 * writes. If guest dirty memory rate is reduced below the rate at
943 * which we can transfer pages to the destination then we should be
944 * able to complete migration. Some workloads dirty memory way too
945 * fast and will not effectively converge, even with auto-converge.
947 static void mig_throttle_guest_down(void)
949 MigrationState *s = migrate_get_current();
950 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
951 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
953 /* We have not started throttling yet. Let's start it. */
954 if (!cpu_throttle_active()) {
955 cpu_throttle_set(pct_initial);
956 } else {
957 /* Throttling already on, just increase the rate */
958 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
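/*
 * Worked example (assuming the default migration parameters of a 20%
 * initial throttle and 10% increments; both are tunable): the first call
 * sets the throttle to 20%, and each later call bumps it by 10%, giving
 * 20%, 30%, 40%, ... until the dirty rate drops below the transfer rate
 * or migration completes.
 */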
963 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
965 * @rs: current RAM state
966 * @current_addr: address for the zero page
968 * Update the xbzrle cache to reflect a page that's been sent as all 0.
969 * The important thing is that a stale (not-yet-0'd) page be replaced
970 * by the new data.
971 * As a bonus, if the page wasn't in the cache it gets added so that
972 * when a small write is made into the 0'd page it gets XBZRLE sent.
974 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
976 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
977 return;
980 /* We don't care if this fails to allocate a new cache page
981 * as long as it updated an old one */
982 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
983 ram_counters.dirty_sync_count);
986 #define ENCODING_FLAG_XBZRLE 0x1
989 * save_xbzrle_page: compress and send current page
991 * Returns: 1 means that we wrote the page
992 * 0 means that page is identical to the one already sent
993 * -1 means that xbzrle would be longer than normal
995 * @rs: current RAM state
996 * @current_data: pointer to the address of the page contents
997 * @current_addr: addr of the page
998 * @block: block that contains the page we want to send
999 * @offset: offset inside the block for the page
1000 * @last_stage: if we are at the completion stage
1002 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1003 ram_addr_t current_addr, RAMBlock *block,
1004 ram_addr_t offset, bool last_stage)
1006 int encoded_len = 0, bytes_xbzrle;
1007 uint8_t *prev_cached_page;
1009 if (!cache_is_cached(XBZRLE.cache, current_addr,
1010 ram_counters.dirty_sync_count)) {
1011 xbzrle_counters.cache_miss++;
1012 if (!last_stage) {
1013 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1014 ram_counters.dirty_sync_count) == -1) {
1015 return -1;
1016 } else {
1017 /* update *current_data when the page has been
1018 inserted into cache */
1019 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1022 return -1;
1025 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1027 /* save current buffer into memory */
1028 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1030 /* XBZRLE encoding (if there is no overflow) */
1031 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1032 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1033 TARGET_PAGE_SIZE);
1034 if (encoded_len == 0) {
1035 trace_save_xbzrle_page_skipping();
1036 return 0;
1037 } else if (encoded_len == -1) {
1038 trace_save_xbzrle_page_overflow();
1039 xbzrle_counters.overflow++;
1040 /* update data in the cache */
1041 if (!last_stage) {
1042 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1043 *current_data = prev_cached_page;
1045 return -1;
1048 /* Update the data in the cache so future deltas are computed against the page we just sent */
1049 if (!last_stage) {
1050 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1053 /* Send XBZRLE based compressed page */
1054 bytes_xbzrle = save_page_header(rs, rs->f, block,
1055 offset | RAM_SAVE_FLAG_XBZRLE);
1056 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1057 qemu_put_be16(rs->f, encoded_len);
1058 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1059 bytes_xbzrle += encoded_len + 1 + 2;
1060 xbzrle_counters.pages++;
1061 xbzrle_counters.bytes += bytes_xbzrle;
1062 ram_counters.transferred += bytes_xbzrle;
1064 return 1;
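/*
 * Rough sketch of the XBZRLE record sent above:
 *
 *     [ page header (save_page_header, RAM_SAVE_FLAG_XBZRLE set) ]
 *     [ ENCODING_FLAG_XBZRLE : 1 byte ]
 *     [ encoded_len : 2 bytes, be16 ]
 *     [ encoded delta : encoded_len bytes ]
 *
 * which is why bytes_xbzrle adds "encoded_len + 1 + 2" to the header size.
 */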
1068 * migration_bitmap_find_dirty: find the next dirty page from start
1070 * Called with rcu_read_lock() to protect migration_bitmap
1072 * Returns the byte offset within memory region of the start of a dirty page
1074 * @rs: current RAM state
1075 * @rb: RAMBlock where to search for dirty pages
1076 * @start: page where we start the search
1078 static inline
1079 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1080 unsigned long start)
1082 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1083 unsigned long *bitmap = rb->bmap;
1084 unsigned long next;
1086 if (!qemu_ram_is_migratable(rb)) {
1087 return size;
1090 if (rs->ram_bulk_stage && start > 0) {
1091 next = start + 1;
1092 } else {
1093 next = find_next_bit(bitmap, size, start);
1096 return next;
1099 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1100 RAMBlock *rb,
1101 unsigned long page)
1103 bool ret;
1105 ret = test_and_clear_bit(page, rb->bmap);
1107 if (ret) {
1108 rs->migration_dirty_pages--;
1110 return ret;
1113 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1114 ram_addr_t start, ram_addr_t length)
1116 rs->migration_dirty_pages +=
1117 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1118 &rs->num_dirty_pages_period);
1122 * ram_pagesize_summary: calculate all the pagesizes of a VM
1124 * Returns a summary bitmap of the page sizes of all RAMBlocks
1126 * For VMs with just normal pages this is equivalent to the host page
1127 * size. If it's got some huge pages then it's the OR of all the
1128 * different page sizes.
1130 uint64_t ram_pagesize_summary(void)
1132 RAMBlock *block;
1133 uint64_t summary = 0;
1135 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1136 summary |= block->page_size;
1139 return summary;
1142 static void migration_bitmap_sync(RAMState *rs)
1144 RAMBlock *block;
1145 int64_t end_time;
1146 uint64_t bytes_xfer_now;
1148 ram_counters.dirty_sync_count++;
1150 if (!rs->time_last_bitmap_sync) {
1151 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1154 trace_migration_bitmap_sync_start();
1155 memory_global_dirty_log_sync();
1157 qemu_mutex_lock(&rs->bitmap_mutex);
1158 rcu_read_lock();
1159 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1160 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1162 rcu_read_unlock();
1163 qemu_mutex_unlock(&rs->bitmap_mutex);
1165 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1167 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1169 /* more than 1 second = 1000 milliseconds */
1170 if (end_time > rs->time_last_bitmap_sync + 1000) {
1171 /* calculate period counters */
1172 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1173 / (end_time - rs->time_last_bitmap_sync);
1174 bytes_xfer_now = ram_counters.transferred;
1176 /* During block migration the auto-converge logic incorrectly detects
1177 * that ram migration makes no progress. Avoid this by disabling the
1178 * throttling logic during the bulk phase of block migration. */
1179 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1180 /* The following detection logic can be refined later. For now:
1181 Check to see if the dirtied bytes exceed 50% of the approx.
1182 amount of bytes that just got transferred since the last time we
1183 were in this routine. If that happens twice, start or increase
1184 throttling */
1186 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1187 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1188 (++rs->dirty_rate_high_cnt >= 2)) {
1189 trace_migration_throttle();
1190 rs->dirty_rate_high_cnt = 0;
1191 mig_throttle_guest_down();
1195 if (migrate_use_xbzrle()) {
1196 if (rs->iterations_prev != rs->iterations) {
1197 xbzrle_counters.cache_miss_rate =
1198 (double)(xbzrle_counters.cache_miss -
1199 rs->xbzrle_cache_miss_prev) /
1200 (rs->iterations - rs->iterations_prev);
1202 rs->iterations_prev = rs->iterations;
1203 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1206 /* reset period counters */
1207 rs->time_last_bitmap_sync = end_time;
1208 rs->num_dirty_pages_period = 0;
1209 rs->bytes_xfer_prev = bytes_xfer_now;
1211 if (migrate_use_events()) {
1212 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
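/*
 * Illustrative reading of the throttling check in migration_bitmap_sync()
 * above (with auto-converge enabled): if, say, 600MB were dirtied during a
 * sync period in which only 1GB was transferred, the dirtied bytes exceed
 * half of the transferred bytes; the second time that happens,
 * dirty_rate_high_cnt is reset and mig_throttle_guest_down() is called.
 */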
1217 * save_zero_page: send the zero page to the stream
1219 * Returns the number of pages written.
1221 * @rs: current RAM state
1222 * @block: block that contains the page we want to send
1223 * @offset: offset inside the block for the page
1225 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1227 uint8_t *p = block->host + offset;
1228 int pages = -1;
1230 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1231 ram_counters.duplicate++;
1232 ram_counters.transferred +=
1233 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1234 qemu_put_byte(rs->f, 0);
1235 ram_counters.transferred += 1;
1236 pages = 1;
1239 return pages;
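/*
 * Rough sketch of the zero-page record sent above: the usual page header
 * with RAM_SAVE_FLAG_ZERO set, followed by a single zero byte, so an
 * all-zero page costs roughly 9 bytes on the wire instead of a full page.
 */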
1242 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1244 if (!migrate_release_ram() || !migration_in_postcopy()) {
1245 return;
1248 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1252 * @pages: the number of pages written by the control path,
1253 * < 0 - error
1254 * > 0 - number of pages written
1256 * Return true if the page has been saved, otherwise false.
1258 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1259 int *pages)
1261 uint64_t bytes_xmit = 0;
1262 int ret;
1264 *pages = -1;
1265 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1266 &bytes_xmit);
1267 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1268 return false;
1271 if (bytes_xmit) {
1272 ram_counters.transferred += bytes_xmit;
1273 *pages = 1;
1276 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1277 return true;
1280 if (bytes_xmit > 0) {
1281 ram_counters.normal++;
1282 } else if (bytes_xmit == 0) {
1283 ram_counters.duplicate++;
1286 return true;
1290 * directly send the page to the stream
1292 * Returns the number of pages written.
1294 * @rs: current RAM state
1295 * @block: block that contains the page we want to send
1296 * @offset: offset inside the block for the page
1297 * @buf: the page to be sent
1298 * @async: send the page asynchronously
1300 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1301 uint8_t *buf, bool async)
1303 ram_counters.transferred += save_page_header(rs, rs->f, block,
1304 offset | RAM_SAVE_FLAG_PAGE);
1305 if (async) {
1306 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1307 migrate_release_ram() &
1308 migration_in_postcopy());
1309 } else {
1310 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1312 ram_counters.transferred += TARGET_PAGE_SIZE;
1313 ram_counters.normal++;
1314 return 1;
1318 * ram_save_page: send the given page to the stream
1320 * Returns the number of pages written.
1321 * < 0 - error
1322 * >=0 - Number of pages written - this might legally be 0
1323 * if xbzrle noticed the page was the same.
1325 * @rs: current RAM state
1326 * @block: block that contains the page we want to send
1327 * @offset: offset inside the block for the page
1328 * @last_stage: if we are at the completion stage
1330 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1332 int pages = -1;
1333 uint8_t *p;
1334 bool send_async = true;
1335 RAMBlock *block = pss->block;
1336 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1337 ram_addr_t current_addr = block->offset + offset;
1339 p = block->host + offset;
1340 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1342 XBZRLE_cache_lock();
1343 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1344 migrate_use_xbzrle()) {
1345 pages = save_xbzrle_page(rs, &p, current_addr, block,
1346 offset, last_stage);
1347 if (!last_stage) {
1348 /* Can't send this cached data async, since the cache page
1349 * might get updated before it gets to the wire
1351 send_async = false;
1355 /* XBZRLE overflow or normal page */
1356 if (pages == -1) {
1357 pages = save_normal_page(rs, block, offset, p, send_async);
1360 XBZRLE_cache_unlock();
1362 return pages;
1365 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1366 ram_addr_t offset, uint8_t *source_buf)
1368 RAMState *rs = ram_state;
1369 int bytes_sent, blen;
1370 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1372 bytes_sent = save_page_header(rs, f, block, offset |
1373 RAM_SAVE_FLAG_COMPRESS_PAGE);
1376 * copy it to an internal buffer to avoid it being modified by the VM,
1377 * so that we can catch errors during compression and
1378 * decompression
1380 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1381 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1382 if (blen < 0) {
1383 bytes_sent = 0;
1384 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1385 error_report("compressed data failed!");
1386 } else {
1387 bytes_sent += blen;
1388 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1391 return bytes_sent;
1394 static void flush_compressed_data(RAMState *rs)
1396 int idx, len, thread_count;
1398 if (!migrate_use_compression()) {
1399 return;
1401 thread_count = migrate_compress_threads();
1403 qemu_mutex_lock(&comp_done_lock);
1404 for (idx = 0; idx < thread_count; idx++) {
1405 while (!comp_param[idx].done) {
1406 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1409 qemu_mutex_unlock(&comp_done_lock);
1411 for (idx = 0; idx < thread_count; idx++) {
1412 qemu_mutex_lock(&comp_param[idx].mutex);
1413 if (!comp_param[idx].quit) {
1414 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1415 ram_counters.transferred += len;
1417 qemu_mutex_unlock(&comp_param[idx].mutex);
1421 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1422 ram_addr_t offset)
1424 param->block = block;
1425 param->offset = offset;
1428 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1429 ram_addr_t offset)
1431 int idx, thread_count, bytes_xmit = -1, pages = -1;
1433 thread_count = migrate_compress_threads();
1434 qemu_mutex_lock(&comp_done_lock);
1435 while (true) {
1436 for (idx = 0; idx < thread_count; idx++) {
1437 if (comp_param[idx].done) {
1438 comp_param[idx].done = false;
1439 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1440 qemu_mutex_lock(&comp_param[idx].mutex);
1441 set_compress_params(&comp_param[idx], block, offset);
1442 qemu_cond_signal(&comp_param[idx].cond);
1443 qemu_mutex_unlock(&comp_param[idx].mutex);
1444 pages = 1;
1445 ram_counters.normal++;
1446 ram_counters.transferred += bytes_xmit;
1447 break;
1450 if (pages > 0) {
1451 break;
1452 } else {
1453 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1456 qemu_mutex_unlock(&comp_done_lock);
1458 return pages;
1462 * find_dirty_block: find the next dirty page and update any state
1463 * associated with the search process.
1465 * Returns true if a page is found
1467 * @rs: current RAM state
1468 * @pss: data about the state of the current dirty page scan
1469 * @again: set to false if the search has scanned the whole of RAM
1471 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1473 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1474 if (pss->complete_round && pss->block == rs->last_seen_block &&
1475 pss->page >= rs->last_page) {
1477 * We've been once around the RAM and haven't found anything.
1478 * Give up.
1480 *again = false;
1481 return false;
1483 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1484 /* Didn't find anything in this RAM Block */
1485 pss->page = 0;
1486 pss->block = QLIST_NEXT_RCU(pss->block, next);
1487 if (!pss->block) {
1488 /* Hit the end of the list */
1489 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1490 /* Flag that we've looped */
1491 pss->complete_round = true;
1492 rs->ram_bulk_stage = false;
1493 if (migrate_use_xbzrle()) {
1494 /* If xbzrle is on, stop using the data compression at this
1495 * point. In theory, xbzrle can do better than compression.
1497 flush_compressed_data(rs);
1500 /* Didn't find anything this time, but try again on the new block */
1501 *again = true;
1502 return false;
1503 } else {
1504 /* Can go around again, but... */
1505 *again = true;
1506 /* We've found something so probably don't need to */
1507 return true;
1512 * unqueue_page: gets a page of the queue
1514 * Helper for 'get_queued_page' - gets a page off the queue
1516 * Returns the block of the page (or NULL if none available)
1518 * @rs: current RAM state
1519 * @offset: used to return the offset within the RAMBlock
1521 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1523 RAMBlock *block = NULL;
1525 qemu_mutex_lock(&rs->src_page_req_mutex);
1526 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1527 struct RAMSrcPageRequest *entry =
1528 QSIMPLEQ_FIRST(&rs->src_page_requests);
1529 block = entry->rb;
1530 *offset = entry->offset;
1532 if (entry->len > TARGET_PAGE_SIZE) {
1533 entry->len -= TARGET_PAGE_SIZE;
1534 entry->offset += TARGET_PAGE_SIZE;
1535 } else {
1536 memory_region_unref(block->mr);
1537 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1538 g_free(entry);
1541 qemu_mutex_unlock(&rs->src_page_req_mutex);
1543 return block;
1547 * get_queued_page: unqueue a page from the postcopy requests
1549 * Skips pages that are already sent (!dirty)
1551 * Returns true if a queued page is found
1553 * @rs: current RAM state
1554 * @pss: data about the state of the current dirty page scan
1556 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1558 RAMBlock *block;
1559 ram_addr_t offset;
1560 bool dirty;
1562 do {
1563 block = unqueue_page(rs, &offset);
1565 * We're sending this page, and since it's postcopy nothing else
1566 * will dirty it, and we must make sure it doesn't get sent again
1567 * even if this queue request was received after the background
1568 * search already sent it.
1570 if (block) {
1571 unsigned long page;
1573 page = offset >> TARGET_PAGE_BITS;
1574 dirty = test_bit(page, block->bmap);
1575 if (!dirty) {
1576 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1577 page, test_bit(page, block->unsentmap));
1578 } else {
1579 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1583 } while (block && !dirty);
1585 if (block) {
1587 * As soon as we start servicing pages out of order, then we have
1588 * to kill the bulk stage, since the bulk stage assumes
1589 * (in migration_bitmap_find_dirty) that every page is
1590 * dirty, which is no longer true.
1592 rs->ram_bulk_stage = false;
1595 * We want the background search to continue from the queued page
1596 * since the guest is likely to want other pages near to the page
1597 * it just requested.
1599 pss->block = block;
1600 pss->page = offset >> TARGET_PAGE_BITS;
1603 return !!block;
1607 * migration_page_queue_free: drop any remaining pages in the ram
1608 * request queue
1610 * It should be empty at the end anyway, but in error cases there may
1611 * be some left; in that case we drop them.
1614 static void migration_page_queue_free(RAMState *rs)
1616 struct RAMSrcPageRequest *mspr, *next_mspr;
1617 /* This queue generally should be empty - but in the case of a failed
1618 * migration it might have some entries left over.
1620 rcu_read_lock();
1621 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1622 memory_region_unref(mspr->rb->mr);
1623 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1624 g_free(mspr);
1626 rcu_read_unlock();
1630 * ram_save_queue_pages: queue the page for transmission
1632 * A request from postcopy destination for example.
1634 * Returns zero on success or negative on error
1636 * @rbname: Name of the RAMBlock of the request. NULL means the
1637 * same as the last one.
1638 * @start: starting address from the start of the RAMBlock
1639 * @len: length (in bytes) to send
1641 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1643 RAMBlock *ramblock;
1644 RAMState *rs = ram_state;
1646 ram_counters.postcopy_requests++;
1647 rcu_read_lock();
1648 if (!rbname) {
1649 /* Reuse last RAMBlock */
1650 ramblock = rs->last_req_rb;
1652 if (!ramblock) {
1654 * Shouldn't happen, we can't reuse the last RAMBlock if
1655 * it's the 1st request.
1657 error_report("ram_save_queue_pages no previous block");
1658 goto err;
1660 } else {
1661 ramblock = qemu_ram_block_by_name(rbname);
1663 if (!ramblock) {
1664 /* We shouldn't be asked for a non-existent RAMBlock */
1665 error_report("ram_save_queue_pages no block '%s'", rbname);
1666 goto err;
1668 rs->last_req_rb = ramblock;
1670 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1671 if (start+len > ramblock->used_length) {
1672 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1673 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1674 __func__, start, len, ramblock->used_length);
1675 goto err;
1678 struct RAMSrcPageRequest *new_entry =
1679 g_malloc0(sizeof(struct RAMSrcPageRequest));
1680 new_entry->rb = ramblock;
1681 new_entry->offset = start;
1682 new_entry->len = len;
1684 memory_region_ref(ramblock->mr);
1685 qemu_mutex_lock(&rs->src_page_req_mutex);
1686 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1687 qemu_mutex_unlock(&rs->src_page_req_mutex);
1688 rcu_read_unlock();
1690 return 0;
1692 err:
1693 rcu_read_unlock();
1694 return -1;
1697 static bool save_page_use_compression(RAMState *rs)
1699 if (!migrate_use_compression()) {
1700 return false;
1704 * If xbzrle is on, stop using the data compression after first
1705 * round of migration even if compression is enabled. In theory,
1706 * xbzrle can do better than compression.
1708 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1709 return true;
1712 return false;
1716 * ram_save_target_page: save one target page
1718 * Returns the number of pages written
1720 * @rs: current RAM state
1721 * @pss: data about the page we want to send
1722 * @last_stage: if we are at the completion stage
1724 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1725 bool last_stage)
1727 RAMBlock *block = pss->block;
1728 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1729 int res;
1731 if (control_save_page(rs, block, offset, &res)) {
1732 return res;
1736 * When starting the process of a new block, the first page of
1737 * the block should be sent out before other pages in the same
1738 * block, and all the pages in last block should have been sent
1739 * out, keeping this order is important, because the 'cont' flag
1740 * is used to avoid resending the block name.
1742 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1743 flush_compressed_data(rs);
1746 res = save_zero_page(rs, block, offset);
1747 if (res > 0) {
1748 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1749 * page would be stale
1751 if (!save_page_use_compression(rs)) {
1752 XBZRLE_cache_lock();
1753 xbzrle_cache_zero_page(rs, block->offset + offset);
1754 XBZRLE_cache_unlock();
1756 ram_release_pages(block->idstr, offset, res);
1757 return res;
1761 * Make sure the first page is sent out before other pages.
1763 * We post it as a normal page since compression would take a lot
1764 * of CPU time.
1766 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1767 return compress_page_with_multi_thread(rs, block, offset);
1770 return ram_save_page(rs, pss, last_stage);
1774 * ram_save_host_page: save a whole host page
1776 * Starting at *offset send pages up to the end of the current host
1777 * page. It's valid for the initial offset to point into the middle of
1778 * a host page in which case the remainder of the hostpage is sent.
1779 * Only dirty target pages are sent. Note that the host page size may
1780 * be a huge page for this block.
1781 * The saving stops at the boundary of the used_length of the block
1782 * if the RAMBlock isn't a multiple of the host page size.
1784 * Returns the number of pages written or negative on error
1786 * @rs: current RAM state
1787 * @ms: current migration state
1788 * @pss: data about the page we want to send
1789 * @last_stage: if we are at the completion stage
1791 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1792 bool last_stage)
1794 int tmppages, pages = 0;
1795 size_t pagesize_bits =
1796 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1798 if (!qemu_ram_is_migratable(pss->block)) {
1799 error_report("block %s should not be migrated !", pss->block->idstr);
1800 return 0;
1803 do {
1804 /* Check if the page is dirty and if so send it */
1805 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1806 pss->page++;
1807 continue;
1810 tmppages = ram_save_target_page(rs, pss, last_stage);
1811 if (tmppages < 0) {
1812 return tmppages;
1815 pages += tmppages;
1816 if (pss->block->unsentmap) {
1817 clear_bit(pss->page, pss->block->unsentmap);
1820 pss->page++;
1821 } while ((pss->page & (pagesize_bits - 1)) &&
1822 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1824 /* The offset we leave with is the last one we looked at */
1825 pss->page--;
1826 return pages;
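/*
 * Worked example for ram_save_host_page() above (assuming 2MB hugepage-backed
 * guest RAM and 4KB target pages; both depend on the configuration):
 * pagesize_bits is 512, so one call walks up to 512 target pages and sends
 * the dirty ones, stopping early at used_length if the block is not a
 * multiple of the host page size.
 */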
1830 * ram_find_and_save_block: finds a dirty page and sends it to f
1832 * Called within an RCU critical section.
1834 * Returns the number of pages written where zero means no dirty pages
1836 * @rs: current RAM state
1837 * @last_stage: if we are at the completion stage
1839 * On systems where host-page-size > target-page-size it will send all the
1840 * pages in a host page that are dirty.
1843 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1845 PageSearchStatus pss;
1846 int pages = 0;
1847 bool again, found;
1849 /* No dirty page as there is zero RAM */
1850 if (!ram_bytes_total()) {
1851 return pages;
1854 pss.block = rs->last_seen_block;
1855 pss.page = rs->last_page;
1856 pss.complete_round = false;
1858 if (!pss.block) {
1859 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1862 do {
1863 again = true;
1864 found = get_queued_page(rs, &pss);
1866 if (!found) {
1867 /* priority queue empty, so just search for something dirty */
1868 found = find_dirty_block(rs, &pss, &again);
1871 if (found) {
1872 pages = ram_save_host_page(rs, &pss, last_stage);
1874 } while (!pages && again);
1876 rs->last_seen_block = pss.block;
1877 rs->last_page = pss.page;
1879 return pages;
1882 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1884 uint64_t pages = size / TARGET_PAGE_SIZE;
1886 if (zero) {
1887 ram_counters.duplicate += pages;
1888 } else {
1889 ram_counters.normal += pages;
1890 ram_counters.transferred += size;
1891 qemu_update_position(f, size);
1895 uint64_t ram_bytes_total(void)
1897 RAMBlock *block;
1898 uint64_t total = 0;
1900 rcu_read_lock();
1901 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1902 total += block->used_length;
1904 rcu_read_unlock();
1905 return total;
1908 static void xbzrle_load_setup(void)
1910 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1913 static void xbzrle_load_cleanup(void)
1915 g_free(XBZRLE.decoded_buf);
1916 XBZRLE.decoded_buf = NULL;
1919 static void ram_state_cleanup(RAMState **rsp)
1921 if (*rsp) {
1922 migration_page_queue_free(*rsp);
1923 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1924 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1925 g_free(*rsp);
1926 *rsp = NULL;
1930 static void xbzrle_cleanup(void)
1932 XBZRLE_cache_lock();
1933 if (XBZRLE.cache) {
1934 cache_fini(XBZRLE.cache);
1935 g_free(XBZRLE.encoded_buf);
1936 g_free(XBZRLE.current_buf);
1937 g_free(XBZRLE.zero_target_page);
1938 XBZRLE.cache = NULL;
1939 XBZRLE.encoded_buf = NULL;
1940 XBZRLE.current_buf = NULL;
1941 XBZRLE.zero_target_page = NULL;
1943 XBZRLE_cache_unlock();
1946 static void ram_save_cleanup(void *opaque)
1948 RAMState **rsp = opaque;
1949 RAMBlock *block;
1951 /* the caller must hold the iothread lock or be in a bottom half, so
1952 * there is no writing race against this migration_bitmap
1954 memory_global_dirty_log_stop();
1956 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1957 g_free(block->bmap);
1958 block->bmap = NULL;
1959 g_free(block->unsentmap);
1960 block->unsentmap = NULL;
1963 xbzrle_cleanup();
1964 compress_threads_save_cleanup();
1965 ram_state_cleanup(rsp);
1968 static void ram_state_reset(RAMState *rs)
1970 rs->last_seen_block = NULL;
1971 rs->last_sent_block = NULL;
1972 rs->last_page = 0;
1973 rs->last_version = ram_list.version;
1974 rs->ram_bulk_stage = true;
1977 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1980 * 'expected' is the value you expect the bitmap mostly to be full
1981 * of; it won't bother printing lines that are all this value.
1982 * If 'todump' is null the migration bitmap is dumped.
1984 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1985 unsigned long pages)
1987 int64_t cur;
1988 int64_t linelen = 128;
1989 char linebuf[129];
1991 for (cur = 0; cur < pages; cur += linelen) {
1992 int64_t curb;
1993 bool found = false;
1995 * Last line; catch the case where the line length
1996 * is longer than remaining ram
1998 if (cur + linelen > pages) {
1999 linelen = pages - cur;
2001 for (curb = 0; curb < linelen; curb++) {
2002 bool thisbit = test_bit(cur + curb, todump);
2003 linebuf[curb] = thisbit ? '1' : '.';
2004 found = found || (thisbit != expected);
2006 if (found) {
2007 linebuf[curb] = '\0';
2008 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2013 /* **** functions for postcopy ***** */
2015 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2017 struct RAMBlock *block;
2019 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2020 unsigned long *bitmap = block->bmap;
2021 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2022 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2024 while (run_start < range) {
2025 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2026 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2027 (run_end - run_start) << TARGET_PAGE_BITS);
2028 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2034 * postcopy_send_discard_bm_ram: discard a RAMBlock
2036 * Returns zero on success
2038 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2039 * Note: At this point the 'unsentmap' is the processed bitmap combined
2040 * with the dirtymap; so a '1' means it's either dirty or unsent.
2042 * @ms: current migration state
2043 * @pds: state for postcopy
2044 * @start: RAMBlock starting page
2045 * @length: RAMBlock size
2047 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2048 PostcopyDiscardState *pds,
2049 RAMBlock *block)
2051 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2052 unsigned long current;
2053 unsigned long *unsentmap = block->unsentmap;
2055 for (current = 0; current < end; ) {
2056 unsigned long one = find_next_bit(unsentmap, end, current);
2058 if (one <= end) {
2059 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2060 unsigned long discard_length;
2062 if (zero >= end) {
2063 discard_length = end - one;
2064 } else {
2065 discard_length = zero - one;
2067 if (discard_length) {
2068 postcopy_discard_send_range(ms, pds, one, discard_length);
2070 current = one + discard_length;
2071 } else {
2072 current = one;
2076 return 0;
2080 * postcopy_each_ram_send_discard: discard all RAMBlocks
2082 * Returns 0 for success or negative for error
2084 * Utility for the outgoing postcopy code.
2085 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2086 * passing it bitmap indexes and name.
2087 * (qemu_ram_foreach_block ends up passing unscaled lengths
2088 * which would mean postcopy code would have to deal with target page)
2090 * @ms: current migration state
2092 static int postcopy_each_ram_send_discard(MigrationState *ms)
2094 struct RAMBlock *block;
2095 int ret;
2097 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2098 PostcopyDiscardState *pds =
2099 postcopy_discard_send_init(ms, block->idstr);
2102  * Postcopy sends chunks of bitmap over the wire, but it
2103  * just needs indexes at this point, which avoids it having
2104  * target-page-specific code.
2106 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2107 postcopy_discard_send_finish(ms, pds);
2108 if (ret) {
2109 return ret;
2113 return 0;
2117  * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
2119  * Helper for postcopy_chunk_hostpages; it's called twice to
2120  * canonicalize the two bitmaps, which are similar but one is
2121  * inverted.
2123 * Postcopy requires that all target pages in a hostpage are dirty or
2124 * clean, not a mix. This function canonicalizes the bitmaps.
2126 * @ms: current migration state
2127 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2128 * otherwise we need to canonicalize partially dirty host pages
2129 * @block: block that contains the page we want to canonicalize
2130 * @pds: state for postcopy
2132 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2133 RAMBlock *block,
2134 PostcopyDiscardState *pds)
2136 RAMState *rs = ram_state;
2137 unsigned long *bitmap = block->bmap;
2138 unsigned long *unsentmap = block->unsentmap;
2139 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2140 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2141 unsigned long run_start;
2143 if (block->page_size == TARGET_PAGE_SIZE) {
2144 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2145 return;
2148 if (unsent_pass) {
2149 /* Find a sent page */
2150 run_start = find_next_zero_bit(unsentmap, pages, 0);
2151 } else {
2152 /* Find a dirty page */
2153 run_start = find_next_bit(bitmap, pages, 0);
2156 while (run_start < pages) {
2157 bool do_fixup = false;
2158 unsigned long fixup_start_addr;
2159 unsigned long host_offset;
2162 * If the start of this run of pages is in the middle of a host
2163 * page, then we need to fixup this host page.
2165 host_offset = run_start % host_ratio;
2166 if (host_offset) {
2167 do_fixup = true;
2168 run_start -= host_offset;
2169 fixup_start_addr = run_start;
2170 /* For the next pass */
2171 run_start = run_start + host_ratio;
2172 } else {
2173 /* Find the end of this run */
2174 unsigned long run_end;
2175 if (unsent_pass) {
2176 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2177 } else {
2178 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2181 * If the end isn't at the start of a host page, then the
2182 * run doesn't finish at the end of a host page
2183 * and we need to discard.
2185 host_offset = run_end % host_ratio;
2186 if (host_offset) {
2187 do_fixup = true;
2188 fixup_start_addr = run_end - host_offset;
2190 * This host page has gone, the next loop iteration starts
2191 * from after the fixup
2193 run_start = fixup_start_addr + host_ratio;
2194 } else {
2196 * No discards on this iteration, next loop starts from
2197 * next sent/dirty page
2199 run_start = run_end + 1;
2203 if (do_fixup) {
2204 unsigned long page;
2206 /* Tell the destination to discard this page */
2207 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2208 /* For the unsent_pass we:
2209 * discard partially sent pages
2210 * For the !unsent_pass (dirty) we:
2211 * discard partially dirty pages that were sent
2212 * (any partially sent pages were already discarded
2213 * by the previous unsent_pass)
2215 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2216 host_ratio);
2219 /* Clean up the bitmap */
2220 for (page = fixup_start_addr;
2221 page < fixup_start_addr + host_ratio; page++) {
2222 /* All pages in this host page are now not sent */
2223 set_bit(page, unsentmap);
2226 * Remark them as dirty, updating the count for any pages
2227 * that weren't previously dirty.
2229 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2233 if (unsent_pass) {
2234 /* Find the next sent page for the next iteration */
2235 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2236 } else {
2237 /* Find the next dirty page for the next iteration */
2238 run_start = find_next_bit(bitmap, pages, run_start);
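
/*
 * Illustrative example for the pass above: with a host_ratio of 4, a dirty
 * run starting at target page 6 begins in the middle of the host page
 * covering pages 4-7, so that whole host page is discarded (if it had been
 * sent) and pages 4-7 are re-marked as unsent and dirty.
 */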
2244  * postcopy_chunk_hostpages: discard any partially sent host page
2246 * Utility for the outgoing postcopy code.
2248 * Discard any partially sent host-page size chunks, mark any partially
2249 * dirty host-page size chunks as all dirty. In this case the host-page
2250 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2252 * Returns zero on success
2254 * @ms: current migration state
2255 * @block: block we want to work with
2257 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2259 PostcopyDiscardState *pds =
2260 postcopy_discard_send_init(ms, block->idstr);
2262 /* First pass: Discard all partially sent host pages */
2263 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2265 * Second pass: Ensure that all partially dirty host pages are made
2266 * fully dirty.
2268 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2270 postcopy_discard_send_finish(ms, pds);
2271 return 0;
2275 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2277 * Returns zero on success
2279  * Transmit the set of pages to be discarded after precopy to the target;
2280  * these are pages that:
2281  * a) Have been previously transmitted but are now dirty again
2282  * b) Have never been transmitted; this ensures that
2283  * any pages on the destination that have been mapped by background
2284  * tasks get discarded (transparent huge pages are the specific concern)
2285 * Hopefully this is pretty sparse
2287 * @ms: current migration state
2289 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2291 RAMState *rs = ram_state;
2292 RAMBlock *block;
2293 int ret;
2295 rcu_read_lock();
2297 /* This should be our last sync, the src is now paused */
2298 migration_bitmap_sync(rs);
2300 /* Easiest way to make sure we don't resume in the middle of a host-page */
2301 rs->last_seen_block = NULL;
2302 rs->last_sent_block = NULL;
2303 rs->last_page = 0;
2305 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2306 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2307 unsigned long *bitmap = block->bmap;
2308 unsigned long *unsentmap = block->unsentmap;
2310 if (!unsentmap) {
2311 /* We don't have a safe way to resize the sentmap, so
2312 * if the bitmap was resized it will be NULL at this
2313 * point.
2315 error_report("migration ram resized during precopy phase");
2316 rcu_read_unlock();
2317 return -EINVAL;
2319 /* Deal with TPS != HPS and huge pages */
2320 ret = postcopy_chunk_hostpages(ms, block);
2321 if (ret) {
2322 rcu_read_unlock();
2323 return ret;
2327 * Update the unsentmap to be unsentmap = unsentmap | dirty
2329 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2330 #ifdef DEBUG_POSTCOPY
2331 ram_debug_dump_bitmap(unsentmap, true, pages);
2332 #endif
2334 trace_ram_postcopy_send_discard_bitmap();
2336 ret = postcopy_each_ram_send_discard(ms);
2337 rcu_read_unlock();
2339 return ret;
2343 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2345 * Returns zero on success
2347  * @rbname: name of the RAMBlock of the request. NULL means the
2348  * same block as the last one.
2349  * @start: start offset (in bytes) within the RAMBlock
2350  * @length: length (in bytes) to discard
2352 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2354 int ret = -1;
2356 trace_ram_discard_range(rbname, start, length);
2358 rcu_read_lock();
2359 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2361 if (!rb) {
2362 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2363 goto err;
2366 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2367 length >> qemu_target_page_bits());
2368 ret = ram_block_discard_range(rb, start, length);
2370 err:
2371 rcu_read_unlock();
2373 return ret;
2377 * For every allocation, we will try not to crash the VM if the
2378 * allocation failed.
2380 static int xbzrle_init(void)
2382 Error *local_err = NULL;
2384 if (!migrate_use_xbzrle()) {
2385 return 0;
2388 XBZRLE_cache_lock();
2390 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2391 if (!XBZRLE.zero_target_page) {
2392 error_report("%s: Error allocating zero page", __func__);
2393 goto err_out;
2396 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2397 TARGET_PAGE_SIZE, &local_err);
2398 if (!XBZRLE.cache) {
2399 error_report_err(local_err);
2400 goto free_zero_page;
2403 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2404 if (!XBZRLE.encoded_buf) {
2405 error_report("%s: Error allocating encoded_buf", __func__);
2406 goto free_cache;
2409 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2410 if (!XBZRLE.current_buf) {
2411 error_report("%s: Error allocating current_buf", __func__);
2412 goto free_encoded_buf;
2415 /* We are all good */
2416 XBZRLE_cache_unlock();
2417 return 0;
2419 free_encoded_buf:
2420 g_free(XBZRLE.encoded_buf);
2421 XBZRLE.encoded_buf = NULL;
2422 free_cache:
2423 cache_fini(XBZRLE.cache);
2424 XBZRLE.cache = NULL;
2425 free_zero_page:
2426 g_free(XBZRLE.zero_target_page);
2427 XBZRLE.zero_target_page = NULL;
2428 err_out:
2429 XBZRLE_cache_unlock();
2430 return -ENOMEM;
2433 static int ram_state_init(RAMState **rsp)
2435 *rsp = g_try_new0(RAMState, 1);
2437 if (!*rsp) {
2438 error_report("%s: Init ramstate fail", __func__);
2439 return -1;
2442 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2443 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2444 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2447 * Count the total number of pages used by ram blocks not including any
2448 * gaps due to alignment or unplugs.
2450 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2452 ram_state_reset(*rsp);
2454 return 0;
2457 static void ram_list_init_bitmaps(void)
2459 RAMBlock *block;
2460 unsigned long pages;
2462 /* Skip setting bitmap if there is no RAM */
2463 if (ram_bytes_total()) {
2464 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2465 pages = block->max_length >> TARGET_PAGE_BITS;
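/*
 * Both bitmaps start fully set: every page is initially considered
 * dirty (and, for postcopy, unsent), so the first pass covers all of RAM.
 */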
2466 block->bmap = bitmap_new(pages);
2467 bitmap_set(block->bmap, 0, pages);
2468 if (migrate_postcopy_ram()) {
2469 block->unsentmap = bitmap_new(pages);
2470 bitmap_set(block->unsentmap, 0, pages);
2476 static void ram_init_bitmaps(RAMState *rs)
2478 /* For memory_global_dirty_log_start below. */
2479 qemu_mutex_lock_iothread();
2480 qemu_mutex_lock_ramlist();
2481 rcu_read_lock();
2483 ram_list_init_bitmaps();
2484 memory_global_dirty_log_start();
2485 migration_bitmap_sync(rs);
2487 rcu_read_unlock();
2488 qemu_mutex_unlock_ramlist();
2489 qemu_mutex_unlock_iothread();
2492 static int ram_init_all(RAMState **rsp)
2494 if (ram_state_init(rsp)) {
2495 return -1;
2498 if (xbzrle_init()) {
2499 ram_state_cleanup(rsp);
2500 return -1;
2503 ram_init_bitmaps(*rsp);
2505 return 0;
2508 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2510 RAMBlock *block;
2511 uint64_t pages = 0;
2514  * Postcopy is not using xbzrle/compression, so no need for that.
2515  * Also, since the source is already halted, we don't need to care
2516  * about dirty page logging either.
2519 RAMBLOCK_FOREACH(block) {
2520 pages += bitmap_count_one(block->bmap,
2521 block->used_length >> TARGET_PAGE_BITS);
2524 /* This may not be aligned with current bitmaps. Recalculate. */
2525 rs->migration_dirty_pages = pages;
2527 rs->last_seen_block = NULL;
2528 rs->last_sent_block = NULL;
2529 rs->last_page = 0;
2530 rs->last_version = ram_list.version;
2532 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2533 * matter what we have sent.
2535 rs->ram_bulk_stage = false;
2537 /* Update RAMState cache of output QEMUFile */
2538 rs->f = out;
2540 trace_ram_state_resume_prepare(pages);
2544  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2545  * a long-running RCU critical section.  When RCU reclaims in the code
2546  * start to become numerous, it will be necessary to reduce the
2547  * granularity of these critical sections.
2551 * ram_save_setup: Setup RAM for migration
2553 * Returns zero to indicate success and negative for error
2555 * @f: QEMUFile where to send the data
2556 * @opaque: RAMState pointer
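 *
 * Stream layout written below: a be64 of ram_bytes_total() with
 * RAM_SAVE_FLAG_MEM_SIZE set, then for each migratable block its idstr
 * length (one byte), the idstr itself and a be64 used_length; when
 * postcopy is enabled and the block's page size differs from the host
 * page size, a be64 page size follows.  The setup section ends with a
 * RAM_SAVE_FLAG_EOS marker.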
2558 static int ram_save_setup(QEMUFile *f, void *opaque)
2560 RAMState **rsp = opaque;
2561 RAMBlock *block;
2563 if (compress_threads_save_setup()) {
2564 return -1;
2567 /* migration has already setup the bitmap, reuse it. */
2568 if (!migration_in_colo_state()) {
2569 if (ram_init_all(rsp) != 0) {
2570 compress_threads_save_cleanup();
2571 return -1;
2574 (*rsp)->f = f;
2576 rcu_read_lock();
2578 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2580 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2581 qemu_put_byte(f, strlen(block->idstr));
2582 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2583 qemu_put_be64(f, block->used_length);
2584 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2585 qemu_put_be64(f, block->page_size);
2589 rcu_read_unlock();
2591 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2592 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2594 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2596 return 0;
2600 * ram_save_iterate: iterative stage for migration
2602  * Returns 1 when there is nothing left to send, 0 when iteration stopped early (rate or time limit), and negative on error
2604 * @f: QEMUFile where to send the data
2605 * @opaque: RAMState pointer
2607 static int ram_save_iterate(QEMUFile *f, void *opaque)
2609 RAMState **temp = opaque;
2610 RAMState *rs = *temp;
2611 int ret;
2612 int i;
2613 int64_t t0;
2614 int done = 0;
2616 if (blk_mig_bulk_active()) {
2617 /* Avoid transferring ram during bulk phase of block migration as
2618 * the bulk phase will usually take a long time and transferring
2619 * ram updates during that time is pointless. */
2620 goto out;
2623 rcu_read_lock();
2624 if (ram_list.version != rs->last_version) {
2625 ram_state_reset(rs);
2628 /* Read version before ram_list.blocks */
2629 smp_rmb();
2631 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2633 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2634 i = 0;
2635 while ((ret = qemu_file_rate_limit(f)) == 0) {
2636 int pages;
2638 pages = ram_find_and_save_block(rs, false);
2639 /* no more pages to send */
2640 if (pages == 0) {
2641 done = 1;
2642 break;
2644 rs->iterations++;
2646 /* we want to check in the 1st loop, just in case it was the 1st time
2647 and we had to sync the dirty bitmap.
2648 qemu_clock_get_ns() is a bit expensive, so we only check every few
2649 iterations
2651 if ((i & 63) == 0) {
2652 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2653 if (t1 > MAX_WAIT) {
2654 trace_ram_save_iterate_big_wait(t1, i);
2655 break;
2658 i++;
2660 flush_compressed_data(rs);
2661 rcu_read_unlock();
2664 * Must occur before EOS (or any QEMUFile operation)
2665 * because of RDMA protocol.
2667 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2669 out:
2670 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2671 ram_counters.transferred += 8;
2673 ret = qemu_file_get_error(f);
2674 if (ret < 0) {
2675 return ret;
2678 return done;
2682 * ram_save_complete: function called to send the remaining amount of ram
2684 * Returns zero to indicate success
2686  * Called with the iothread lock held
2688 * @f: QEMUFile where to send the data
2689 * @opaque: RAMState pointer
2691 static int ram_save_complete(QEMUFile *f, void *opaque)
2693 RAMState **temp = opaque;
2694 RAMState *rs = *temp;
2696 rcu_read_lock();
2698 if (!migration_in_postcopy()) {
2699 migration_bitmap_sync(rs);
2702 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2704 /* try transferring iterative blocks of memory */
2706 /* flush all remaining blocks regardless of rate limiting */
2707 while (true) {
2708 int pages;
2710 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2711 /* no more blocks to sent */
2712 if (pages == 0) {
2713 break;
2717 flush_compressed_data(rs);
2718 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2720 rcu_read_unlock();
2722 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2724 return 0;
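
/*
 * ram_save_pending: estimate how much RAM is still left to send
 *
 * The estimate is the dirty page count times the target page size.  When
 * it drops below @max_size (and we are not already in postcopy), the dirty
 * bitmap is re-synced under the iothread lock to refine it.  With
 * postcopy-ram enabled all remaining RAM is postcopiable and is reported
 * via @res_compatible; otherwise it is reported as precopy-only.
 */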
2727 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2728 uint64_t *res_precopy_only,
2729 uint64_t *res_compatible,
2730 uint64_t *res_postcopy_only)
2732 RAMState **temp = opaque;
2733 RAMState *rs = *temp;
2734 uint64_t remaining_size;
2736 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2738 if (!migration_in_postcopy() &&
2739 remaining_size < max_size) {
2740 qemu_mutex_lock_iothread();
2741 rcu_read_lock();
2742 migration_bitmap_sync(rs);
2743 rcu_read_unlock();
2744 qemu_mutex_unlock_iothread();
2745 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2748 if (migrate_postcopy_ram()) {
2749 /* We can do postcopy, and all the data is postcopiable */
2750 *res_compatible += remaining_size;
2751 } else {
2752 *res_precopy_only += remaining_size;
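
/*
 * load_xbzrle: decode one XBZRLE-encoded page
 *
 * Wire format read here: one flags byte (must be ENCODING_FLAG_XBZRLE),
 * a be16 encoded length (at most TARGET_PAGE_SIZE), then the encoded
 * delta, which is applied on top of the current contents of @host.
 */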
2756 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2758 unsigned int xh_len;
2759 int xh_flags;
2760 uint8_t *loaded_data;
2762 /* extract RLE header */
2763 xh_flags = qemu_get_byte(f);
2764 xh_len = qemu_get_be16(f);
2766 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2767 error_report("Failed to load XBZRLE page - wrong compression!");
2768 return -1;
2771 if (xh_len > TARGET_PAGE_SIZE) {
2772 error_report("Failed to load XBZRLE page - len overflow!");
2773 return -1;
2775 loaded_data = XBZRLE.decoded_buf;
2776 /* load data and decode */
2777 /* it can change loaded_data to point to an internal buffer */
2778 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2780 /* decode RLE */
2781 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2782 TARGET_PAGE_SIZE) == -1) {
2783 error_report("Failed to load XBZRLE page - decode error!");
2784 return -1;
2787 return 0;
2791 * ram_block_from_stream: read a RAMBlock id from the migration stream
2793 * Must be called from within a rcu critical section.
2795 * Returns a pointer from within the RCU-protected ram_list.
2797 * @f: QEMUFile where to read the data from
2798 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2800 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2802 static RAMBlock *block = NULL;
2803 char id[256];
2804 uint8_t len;
2806 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2807 if (!block) {
2808 error_report("Ack, bad migration stream!");
2809 return NULL;
2811 return block;
2814 len = qemu_get_byte(f);
2815 qemu_get_buffer(f, (uint8_t *)id, len);
2816 id[len] = 0;
2818 block = qemu_ram_block_by_name(id);
2819 if (!block) {
2820 error_report("Can't find block %s", id);
2821 return NULL;
2824 if (!qemu_ram_is_migratable(block)) {
2825 error_report("block %s should not be migrated !", id);
2826 return NULL;
2829 return block;
2832 static inline void *host_from_ram_block_offset(RAMBlock *block,
2833 ram_addr_t offset)
2835 if (!offset_in_ramblock(block, offset)) {
2836 return NULL;
2839 return block->host + offset;
2843 * ram_handle_compressed: handle the zero page case
2845 * If a page (or a whole RDMA chunk) has been
2846 * determined to be zero, then zap it.
2848 * @host: host address for the zero page
2849  * @ch: the byte the page is filled with; only zero is supported
2850 * @size: size of the zero page
2852 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2854 if (ch != 0 || !is_zero_range(host, size)) {
2855 memset(host, ch, size);
2859 /* return the size after decompression, or a negative value on error */
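/* The whole page must decompress in a single inflate() call: anything
 * other than Z_STREAM_END is treated as an error. */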
2860 static int
2861 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2862 const uint8_t *source, size_t source_len)
2864 int err;
2866 err = inflateReset(stream);
2867 if (err != Z_OK) {
2868 return -1;
2871 stream->avail_in = source_len;
2872 stream->next_in = (uint8_t *)source;
2873 stream->avail_out = dest_len;
2874 stream->next_out = dest;
2876 err = inflate(stream, Z_NO_FLUSH);
2877 if (err != Z_STREAM_END) {
2878 return -1;
2881 return stream->total_out;
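
/*
 * do_data_decompress: decompression worker thread
 *
 * The feeder (decompress_data_with_multi_threads) hands over work by
 * filling param->compbuf and setting param->des/param->len under
 * param->mutex, then signalling param->cond.  After decompressing into
 * the destination page, the worker marks itself idle by setting
 * param->done and signalling decomp_done_cond under decomp_done_lock.
 */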
2884 static void *do_data_decompress(void *opaque)
2886 DecompressParam *param = opaque;
2887 unsigned long pagesize;
2888 uint8_t *des;
2889 int len, ret;
2891 qemu_mutex_lock(&param->mutex);
2892 while (!param->quit) {
2893 if (param->des) {
2894 des = param->des;
2895 len = param->len;
2896 param->des = 0;
2897 qemu_mutex_unlock(&param->mutex);
2899 pagesize = TARGET_PAGE_SIZE;
2901 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2902 param->compbuf, len);
2903 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2904 error_report("decompress data failed");
2905 qemu_file_set_error(decomp_file, ret);
2908 qemu_mutex_lock(&decomp_done_lock);
2909 param->done = true;
2910 qemu_cond_signal(&decomp_done_cond);
2911 qemu_mutex_unlock(&decomp_done_lock);
2913 qemu_mutex_lock(&param->mutex);
2914 } else {
2915 qemu_cond_wait(&param->cond, &param->mutex);
2918 qemu_mutex_unlock(&param->mutex);
2920 return NULL;
2923 static int wait_for_decompress_done(void)
2925 int idx, thread_count;
2927 if (!migrate_use_compression()) {
2928 return 0;
2931 thread_count = migrate_decompress_threads();
2932 qemu_mutex_lock(&decomp_done_lock);
2933 for (idx = 0; idx < thread_count; idx++) {
2934 while (!decomp_param[idx].done) {
2935 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2938 qemu_mutex_unlock(&decomp_done_lock);
2939 return qemu_file_get_error(decomp_file);
2942 static void compress_threads_load_cleanup(void)
2944 int i, thread_count;
2946 if (!migrate_use_compression()) {
2947 return;
2949 thread_count = migrate_decompress_threads();
2950 for (i = 0; i < thread_count; i++) {
2952  * we use it as an indicator of whether the thread has been
2953  * properly initialized or not
2955 if (!decomp_param[i].compbuf) {
2956 break;
2959 qemu_mutex_lock(&decomp_param[i].mutex);
2960 decomp_param[i].quit = true;
2961 qemu_cond_signal(&decomp_param[i].cond);
2962 qemu_mutex_unlock(&decomp_param[i].mutex);
2964 for (i = 0; i < thread_count; i++) {
2965 if (!decomp_param[i].compbuf) {
2966 break;
2969 qemu_thread_join(decompress_threads + i);
2970 qemu_mutex_destroy(&decomp_param[i].mutex);
2971 qemu_cond_destroy(&decomp_param[i].cond);
2972 inflateEnd(&decomp_param[i].stream);
2973 g_free(decomp_param[i].compbuf);
2974 decomp_param[i].compbuf = NULL;
2976 g_free(decompress_threads);
2977 g_free(decomp_param);
2978 decompress_threads = NULL;
2979 decomp_param = NULL;
2980 decomp_file = NULL;
2983 static int compress_threads_load_setup(QEMUFile *f)
2985 int i, thread_count;
2987 if (!migrate_use_compression()) {
2988 return 0;
2991 thread_count = migrate_decompress_threads();
2992 decompress_threads = g_new0(QemuThread, thread_count);
2993 decomp_param = g_new0(DecompressParam, thread_count);
2994 qemu_mutex_init(&decomp_done_lock);
2995 qemu_cond_init(&decomp_done_cond);
2996 decomp_file = f;
2997 for (i = 0; i < thread_count; i++) {
2998 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2999 goto exit;
3002 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3003 qemu_mutex_init(&decomp_param[i].mutex);
3004 qemu_cond_init(&decomp_param[i].cond);
3005 decomp_param[i].done = true;
3006 decomp_param[i].quit = false;
3007 qemu_thread_create(decompress_threads + i, "decompress",
3008 do_data_decompress, decomp_param + i,
3009 QEMU_THREAD_JOINABLE);
3011 return 0;
3012 exit:
3013 compress_threads_load_cleanup();
3014 return -1;
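
/*
 * decompress_data_with_multi_threads: queue one compressed page
 *
 * Pick an idle worker (done == true), read the compressed data into its
 * buffer and wake it up; if every worker is busy, wait on
 * decomp_done_cond until one becomes free.
 */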
3017 static void decompress_data_with_multi_threads(QEMUFile *f,
3018 void *host, int len)
3020 int idx, thread_count;
3022 thread_count = migrate_decompress_threads();
3023 qemu_mutex_lock(&decomp_done_lock);
3024 while (true) {
3025 for (idx = 0; idx < thread_count; idx++) {
3026 if (decomp_param[idx].done) {
3027 decomp_param[idx].done = false;
3028 qemu_mutex_lock(&decomp_param[idx].mutex);
3029 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3030 decomp_param[idx].des = host;
3031 decomp_param[idx].len = len;
3032 qemu_cond_signal(&decomp_param[idx].cond);
3033 qemu_mutex_unlock(&decomp_param[idx].mutex);
3034 break;
3037 if (idx < thread_count) {
3038 break;
3039 } else {
3040 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3043 qemu_mutex_unlock(&decomp_done_lock);
3047 * ram_load_setup: Setup RAM for migration incoming side
3049 * Returns zero to indicate success and negative for error
3051 * @f: QEMUFile where to receive the data
3052 * @opaque: RAMState pointer
3054 static int ram_load_setup(QEMUFile *f, void *opaque)
3056 if (compress_threads_load_setup(f)) {
3057 return -1;
3060 xbzrle_load_setup();
3061 ramblock_recv_map_init();
3062 return 0;
3065 static int ram_load_cleanup(void *opaque)
3067 RAMBlock *rb;
3068 xbzrle_load_cleanup();
3069 compress_threads_load_cleanup();
3071 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3072 g_free(rb->receivedmap);
3073 rb->receivedmap = NULL;
3075 return 0;
3079 * ram_postcopy_incoming_init: allocate postcopy data structures
3081  * Returns 0 for success and negative on error
3083 * @mis: current migration incoming state
3085 * Allocate data structures etc needed by incoming migration with
3086  * postcopy-ram. postcopy-ram's similarly named
3087 * postcopy_ram_incoming_init does the work.
3089 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3091 unsigned long ram_pages = last_ram_page();
3093 return postcopy_ram_incoming_init(mis, ram_pages);
3097 * ram_load_postcopy: load a page in postcopy case
3099 * Returns 0 for success or -errno in case of error
3101 * Called in postcopy mode by ram_load().
3102 * rcu_read_lock is taken prior to this being called.
3104  * @f: QEMUFile to read the data from
3106 static int ram_load_postcopy(QEMUFile *f)
3108 int flags = 0, ret = 0;
3109 bool place_needed = false;
3110 bool matching_page_sizes = false;
3111 MigrationIncomingState *mis = migration_incoming_get_current();
3112 /* Temporary page that is later 'placed' */
3113 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3114 void *last_host = NULL;
3115 bool all_zero = false;
3117 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3118 ram_addr_t addr;
3119 void *host = NULL;
3120 void *page_buffer = NULL;
3121 void *place_source = NULL;
3122 RAMBlock *block = NULL;
3123 uint8_t ch;
3125 addr = qemu_get_be64(f);
3128 * If qemu file error, we should stop here, and then "addr"
3129 * may be invalid
3131 ret = qemu_file_get_error(f);
3132 if (ret) {
3133 break;
3136 flags = addr & ~TARGET_PAGE_MASK;
3137 addr &= TARGET_PAGE_MASK;
3139 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3140 place_needed = false;
3141 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3142 block = ram_block_from_stream(f, flags);
3144 host = host_from_ram_block_offset(block, addr);
3145 if (!host) {
3146 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3147 ret = -EINVAL;
3148 break;
3150 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3152 * Postcopy requires that we place whole host pages atomically;
3153 * these may be huge pages for RAMBlocks that are backed by
3154 * hugetlbfs.
3155 * To make it atomic, the data is read into a temporary page
3156 * that's moved into place later.
3157  * The migration protocol uses possibly smaller target pages;
3158  * however, the source ensures it always sends all the components
3159  * of a host page in order.
3161 page_buffer = postcopy_host_page +
3162 ((uintptr_t)host & (block->page_size - 1));
3163 /* If all TP are zero then we can optimise the place */
3164 if (!((uintptr_t)host & (block->page_size - 1))) {
3165 all_zero = true;
3166 } else {
3167 /* not the 1st TP within the HP */
3168 if (host != (last_host + TARGET_PAGE_SIZE)) {
3169 error_report("Non-sequential target page %p/%p",
3170 host, last_host);
3171 ret = -EINVAL;
3172 break;
3178 * If it's the last part of a host page then we place the host
3179 * page
3181 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3182 (block->page_size - 1)) == 0;
3183 place_source = postcopy_host_page;
3185 last_host = host;
3187 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3188 case RAM_SAVE_FLAG_ZERO:
3189 ch = qemu_get_byte(f);
3190 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3191 if (ch) {
3192 all_zero = false;
3194 break;
3196 case RAM_SAVE_FLAG_PAGE:
3197 all_zero = false;
3198 if (!place_needed || !matching_page_sizes) {
3199 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3200 } else {
3201 /* Avoids the qemu_file copy during postcopy, which is
3202 * going to do a copy later; can only do it when we
3203 * do this read in one go (matching page sizes)
3205 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3206 TARGET_PAGE_SIZE);
3208 break;
3209 case RAM_SAVE_FLAG_EOS:
3210 /* normal exit */
3211 break;
3212 default:
3213 error_report("Unknown combination of migration flags: %#x"
3214 " (postcopy mode)", flags);
3215 ret = -EINVAL;
3216 break;
3219 /* Detect any possible file errors */
3220 if (!ret && qemu_file_get_error(f)) {
3221 ret = qemu_file_get_error(f);
3224 if (!ret && place_needed) {
3225 /* This gets called at the last target page in the host page */
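/* 'host' points at that last target page, so stepping back by
 * (page_size - TARGET_PAGE_SIZE) yields the start of the host page. */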
3226 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3228 if (all_zero) {
3229 ret = postcopy_place_page_zero(mis, place_dest,
3230 block);
3231 } else {
3232 ret = postcopy_place_page(mis, place_dest,
3233 place_source, block);
3238 return ret;
3241 static bool postcopy_is_advised(void)
3243 PostcopyState ps = postcopy_state_get();
3244 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3247 static bool postcopy_is_running(void)
3249 PostcopyState ps = postcopy_state_get();
3250 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3253 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3255 int flags = 0, ret = 0, invalid_flags = 0;
3256 static uint64_t seq_iter;
3257 int len = 0;
3259  * If the system is running in postcopy mode, page inserts into host memory
3260  * must be atomic
3262 bool postcopy_running = postcopy_is_running();
3263 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3264 bool postcopy_advised = postcopy_is_advised();
3266 seq_iter++;
3268 if (version_id != 4) {
3269 ret = -EINVAL;
3272 if (!migrate_use_compression()) {
3273 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3275 /* This RCU critical section can be very long running.
3276 * When RCU reclaims in the code start to become numerous,
3277 * it will be necessary to reduce the granularity of this
3278 * critical section.
3280 rcu_read_lock();
3282 if (postcopy_running) {
3283 ret = ram_load_postcopy(f);
3286 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3287 ram_addr_t addr, total_ram_bytes;
3288 void *host = NULL;
3289 uint8_t ch;
3291 addr = qemu_get_be64(f);
3292 flags = addr & ~TARGET_PAGE_MASK;
3293 addr &= TARGET_PAGE_MASK;
3295 if (flags & invalid_flags) {
3296 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3297 error_report("Received an unexpected compressed page");
3300 ret = -EINVAL;
3301 break;
3304 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3305 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3306 RAMBlock *block = ram_block_from_stream(f, flags);
3308 host = host_from_ram_block_offset(block, addr);
3309 if (!host) {
3310 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3311 ret = -EINVAL;
3312 break;
3314 ramblock_recv_bitmap_set(block, host);
3315 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3318 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3319 case RAM_SAVE_FLAG_MEM_SIZE:
3320 /* Synchronize RAM block list */
3321 total_ram_bytes = addr;
3322 while (!ret && total_ram_bytes) {
3323 RAMBlock *block;
3324 char id[256];
3325 ram_addr_t length;
3327 len = qemu_get_byte(f);
3328 qemu_get_buffer(f, (uint8_t *)id, len);
3329 id[len] = 0;
3330 length = qemu_get_be64(f);
3332 block = qemu_ram_block_by_name(id);
3333 if (block && !qemu_ram_is_migratable(block)) {
3334 error_report("block %s should not be migrated !", id);
3335 ret = -EINVAL;
3336 } else if (block) {
3337 if (length != block->used_length) {
3338 Error *local_err = NULL;
3340 ret = qemu_ram_resize(block, length,
3341 &local_err);
3342 if (local_err) {
3343 error_report_err(local_err);
3346 /* For postcopy we need to check hugepage sizes match */
3347 if (postcopy_advised &&
3348 block->page_size != qemu_host_page_size) {
3349 uint64_t remote_page_size = qemu_get_be64(f);
3350 if (remote_page_size != block->page_size) {
3351 error_report("Mismatched RAM page size %s "
3352 "(local) %zu != %" PRIu64,
3353 id, block->page_size,
3354 remote_page_size);
3355 ret = -EINVAL;
3358 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3359 block->idstr);
3360 } else {
3361 error_report("Unknown ramblock \"%s\", cannot "
3362 "accept migration", id);
3363 ret = -EINVAL;
3366 total_ram_bytes -= length;
3368 break;
3370 case RAM_SAVE_FLAG_ZERO:
3371 ch = qemu_get_byte(f);
3372 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3373 break;
3375 case RAM_SAVE_FLAG_PAGE:
3376 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3377 break;
3379 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3380 len = qemu_get_be32(f);
3381 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3382 error_report("Invalid compressed data length: %d", len);
3383 ret = -EINVAL;
3384 break;
3386 decompress_data_with_multi_threads(f, host, len);
3387 break;
3389 case RAM_SAVE_FLAG_XBZRLE:
3390 if (load_xbzrle(f, addr, host) < 0) {
3391 error_report("Failed to decompress XBZRLE page at "
3392 RAM_ADDR_FMT, addr);
3393 ret = -EINVAL;
3394 break;
3396 break;
3397 case RAM_SAVE_FLAG_EOS:
3398 /* normal exit */
3399 break;
3400 default:
3401 if (flags & RAM_SAVE_FLAG_HOOK) {
3402 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3403 } else {
3404 error_report("Unknown combination of migration flags: %#x",
3405 flags);
3406 ret = -EINVAL;
3409 if (!ret) {
3410 ret = qemu_file_get_error(f);
3414 ret |= wait_for_decompress_done();
3415 rcu_read_unlock();
3416 trace_ram_load_complete(ret, seq_iter);
3417 return ret;
3420 static bool ram_has_postcopy(void *opaque)
3422 return migrate_postcopy_ram();
3425 /* Sync all the dirty bitmaps with the destination VM. */
3426 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3428 RAMBlock *block;
3429 QEMUFile *file = s->to_dst_file;
3430 int ramblock_count = 0;
3432 trace_ram_dirty_bitmap_sync_start();
3434 RAMBLOCK_FOREACH(block) {
3435 qemu_savevm_send_recv_bitmap(file, block->idstr);
3436 trace_ram_dirty_bitmap_request(block->idstr);
3437 ramblock_count++;
3440 trace_ram_dirty_bitmap_sync_wait();
3442 /* Wait until all the ramblocks' dirty bitmaps are synced */
3443 while (ramblock_count--) {
3444 qemu_sem_wait(&s->rp_state.rp_sem);
3447 trace_ram_dirty_bitmap_sync_complete();
3449 return 0;
3452 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3454 qemu_sem_post(&s->rp_state.rp_sem);
3458  * Read the received bitmap and invert it to form the initial dirty bitmap.
3459  * This is only used when a postcopy migration is paused and wants
3460  * to resume from a middle point.
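 *
 * Expected stream layout (see ramblock_recv_bitmap_send()): a be64 size,
 * then 'size' bytes of little-endian bitmap (padded up to a multiple of
 * 8 bytes), followed by a be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).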
3462 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3464 int ret = -EINVAL;
3465 QEMUFile *file = s->rp_state.from_dst_file;
3466 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3467 uint64_t local_size = nbits / 8;
3468 uint64_t size, end_mark;
3470 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3472 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3473 error_report("%s: incorrect state %s", __func__,
3474 MigrationStatus_str(s->state));
3475 return -EINVAL;
3479  * Note: see comments in ramblock_recv_bitmap_send() on why we
3480  * need the endianness conversion and the padding.
3482 local_size = ROUND_UP(local_size, 8);
3484 /* Add paddings */
3485 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3487 size = qemu_get_be64(file);
3489 /* The size of the bitmap should match that of our ramblock */
3490 if (size != local_size) {
3491 error_report("%s: ramblock '%s' bitmap size mismatch "
3492 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3493 block->idstr, size, local_size);
3494 ret = -EINVAL;
3495 goto out;
3498 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3499 end_mark = qemu_get_be64(file);
3501 ret = qemu_file_get_error(file);
3502 if (ret || size != local_size) {
3503 error_report("%s: read bitmap failed for ramblock '%s': %d"
3504 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3505 __func__, block->idstr, ret, local_size, size);
3506 ret = -EIO;
3507 goto out;
3510 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3511 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3512 __func__, block->idstr, end_mark);
3513 ret = -EINVAL;
3514 goto out;
3518  * Endianness conversion.  We are in postcopy (though paused), so
3519  * the dirty bitmap won't change and we can modify it directly.
3521 bitmap_from_le(block->bmap, le_bitmap, nbits);
3524  * What we received is the "received bitmap".  Invert it to form the
3525  * initial dirty bitmap for this ramblock.
3527 bitmap_complement(block->bmap, block->bmap, nbits);
3529 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3532  * We have successfully synced the bitmap for the current ramblock.  If
3533  * this is the last one to sync, we need to notify the main send thread.
3535 ram_dirty_bitmap_reload_notify(s);
3537 ret = 0;
3538 out:
3539 g_free(le_bitmap);
3540 return ret;
3543 static int ram_resume_prepare(MigrationState *s, void *opaque)
3545 RAMState *rs = *(RAMState **)opaque;
3546 int ret;
3548 ret = ram_dirty_bitmap_sync_all(s, rs);
3549 if (ret) {
3550 return ret;
3553 ram_state_resume_prepare(rs, s->to_dst_file);
3555 return 0;
3558 static SaveVMHandlers savevm_ram_handlers = {
3559 .save_setup = ram_save_setup,
3560 .save_live_iterate = ram_save_iterate,
3561 .save_live_complete_postcopy = ram_save_complete,
3562 .save_live_complete_precopy = ram_save_complete,
3563 .has_postcopy = ram_has_postcopy,
3564 .save_live_pending = ram_save_pending,
3565 .load_state = ram_load,
3566 .save_cleanup = ram_save_cleanup,
3567 .load_setup = ram_load_setup,
3568 .load_cleanup = ram_load_cleanup,
3569 .resume_prepare = ram_resume_prepare,
3572 void ram_mig_init(void)
3574 qemu_mutex_init(&XBZRLE.lock);
3575 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);