migration/ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
57 #include "savevm.h"
59 /***********************************************************/
60 /* ram save/restore */
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h start with 0x100 next */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 return buffer_is_zero(p, size);
83 XBZRLECacheStats xbzrle_counters;
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
87 static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
99 } XBZRLE;
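/*
 * Take/release XBZRLE.lock. The lock is only needed (and only taken)
 * when XBZRLE is actually in use for this migration.
 */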
101 static void XBZRLE_cache_lock(void)
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
107 static void XBZRLE_cache_unlock(void)
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
114 * xbzrle_cache_resize: resize the xbzrle cache
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by the XBZRLE.lock mutex.
121 * Returns 0 for success or -1 for error
123 * @new_size: new cache size
124 * @errp: set *errp with the reason if the check failed
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 PageCache *new_cache;
129 int64_t ret = 0;
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
140 return 0;
143 XBZRLE_cache_lock();
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147 if (!new_cache) {
148 ret = -1;
149 goto out;
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
155 out:
156 XBZRLE_cache_unlock();
157 return ret;
160 /* Should be holding either ram_list.mutex, or the RCU lock. */
161 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
162 INTERNAL_RAMBLOCK_FOREACH(block) \
163 if (!qemu_ram_is_migratable(block)) {} else
165 #undef RAMBLOCK_FOREACH
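/* Allocate a receivedmap bitmap for every migratable RAMBlock */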
167 static void ramblock_recv_map_init(void)
169 RAMBlock *rb;
171 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
172 assert(!rb->receivedmap);
173 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
177 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
179 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
180 rb->receivedmap);
183 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
185 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
188 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
190 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
193 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
194 size_t nr)
196 bitmap_set_atomic(rb->receivedmap,
197 ramblock_recv_bitmap_offset(host_addr, rb),
198 nr);
201 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
204 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
206 * Returns >0 if success with sent bytes, or <0 if error.
208 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
209 const char *block_name)
211 RAMBlock *block = qemu_ram_block_by_name(block_name);
212 unsigned long *le_bitmap, nbits;
213 uint64_t size;
215 if (!block) {
216 error_report("%s: invalid block name: %s", __func__, block_name);
217 return -1;
220 nbits = block->used_length >> TARGET_PAGE_BITS;
223 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
224 * machines we may need 4 more bytes for padding (see below
225 * comment). So extend it a bit beforehand.
227 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
230 * Always use little endian when sending the bitmap. This is
231 * required so that it works when source and destination VMs are not
232 * using the same endianness. (Note: big endian won't work.)
234 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
236 /* Size of the bitmap, in bytes */
237 size = nbits / 8;
240 * size is always aligned to 8 bytes for 64bit machines, but it
241 * may not be true for 32bit machines. We need this padding to
242 * make sure the migration can survive even between 32bit and
243 * 64bit machines.
245 size = ROUND_UP(size, 8);
247 qemu_put_be64(file, size);
248 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
250 * Mark as an end, in case the middle part is screwed up due to
251 * some "mysterious" reason.
253 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
254 qemu_fflush(file);
256 g_free(le_bitmap);
258 if (qemu_file_get_error(file)) {
259 return qemu_file_get_error(file);
262 return size + sizeof(size);
266 * An outstanding page request, on the source, having been received
267 * and queued
269 struct RAMSrcPageRequest {
270 RAMBlock *rb;
271 hwaddr offset;
272 hwaddr len;
274 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
277 /* State of RAM for migration */
278 struct RAMState {
279 /* QEMUFile used for this migration */
280 QEMUFile *f;
281 /* Last block that we have visited searching for dirty pages */
282 RAMBlock *last_seen_block;
283 /* Last block from where we have sent data */
284 RAMBlock *last_sent_block;
285 /* Last dirty target page we have sent */
286 ram_addr_t last_page;
287 /* last ram version we have seen */
288 uint32_t last_version;
289 /* We are in the first round */
290 bool ram_bulk_stage;
291 /* How many times we have dirty too many pages */
292 int dirty_rate_high_cnt;
293 /* these variables are used for bitmap sync */
294 /* last time we did a full bitmap_sync */
295 int64_t time_last_bitmap_sync;
296 /* bytes transferred at start_time */
297 uint64_t bytes_xfer_prev;
298 /* number of dirty pages since start_time */
299 uint64_t num_dirty_pages_period;
300 /* xbzrle misses since the beginning of the period */
301 uint64_t xbzrle_cache_miss_prev;
302 /* number of iterations at the beginning of period */
303 uint64_t iterations_prev;
304 /* Iterations since start */
305 uint64_t iterations;
306 /* number of dirty bits in the bitmap */
307 uint64_t migration_dirty_pages;
308 /* protects modification of the bitmap */
309 QemuMutex bitmap_mutex;
310 /* The RAMBlock used in the last src_page_requests */
311 RAMBlock *last_req_rb;
312 /* Queue of outstanding page requests from the destination */
313 QemuMutex src_page_req_mutex;
314 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
316 typedef struct RAMState RAMState;
318 static RAMState *ram_state;
320 uint64_t ram_bytes_remaining(void)
322 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
326 MigrationStats ram_counters;
328 /* used by the search for pages to send */
329 struct PageSearchStatus {
330 /* Current block being searched */
331 RAMBlock *block;
332 /* Current page to search from */
333 unsigned long page;
334 /* Set once we wrap around */
335 bool complete_round;
337 typedef struct PageSearchStatus PageSearchStatus;
339 struct CompressParam {
340 bool done;
341 bool quit;
342 QEMUFile *file;
343 QemuMutex mutex;
344 QemuCond cond;
345 RAMBlock *block;
346 ram_addr_t offset;
348 /* internally used fields */
349 z_stream stream;
350 uint8_t *originbuf;
352 typedef struct CompressParam CompressParam;
354 struct DecompressParam {
355 bool done;
356 bool quit;
357 QemuMutex mutex;
358 QemuCond cond;
359 void *des;
360 uint8_t *compbuf;
361 int len;
362 z_stream stream;
364 typedef struct DecompressParam DecompressParam;
366 static CompressParam *comp_param;
367 static QemuThread *compress_threads;
368 /* comp_done_cond is used to wake up the migration thread when
369 * one of the compression threads has finished the compression.
370 * comp_done_lock is used to co-work with comp_done_cond.
372 static QemuMutex comp_done_lock;
373 static QemuCond comp_done_cond;
374 /* The empty QEMUFileOps will be used by file in CompressParam */
375 static const QEMUFileOps empty_ops = { };
377 static QEMUFile *decomp_file;
378 static DecompressParam *decomp_param;
379 static QemuThread *decompress_threads;
380 static QemuMutex decomp_done_lock;
381 static QemuCond decomp_done_cond;
383 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
384 ram_addr_t offset, uint8_t *source_buf);
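/*
 * Compression worker thread: wait until a (block, offset) pair is
 * posted, compress that page via do_compress_ram_page(), mark the
 * work as done and signal comp_done_cond, until asked to quit.
 */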
386 static void *do_data_compress(void *opaque)
388 CompressParam *param = opaque;
389 RAMBlock *block;
390 ram_addr_t offset;
392 qemu_mutex_lock(&param->mutex);
393 while (!param->quit) {
394 if (param->block) {
395 block = param->block;
396 offset = param->offset;
397 param->block = NULL;
398 qemu_mutex_unlock(&param->mutex);
400 do_compress_ram_page(param->file, &param->stream, block, offset,
401 param->originbuf);
403 qemu_mutex_lock(&comp_done_lock);
404 param->done = true;
405 qemu_cond_signal(&comp_done_cond);
406 qemu_mutex_unlock(&comp_done_lock);
408 qemu_mutex_lock(&param->mutex);
409 } else {
410 qemu_cond_wait(&param->cond, &param->mutex);
413 qemu_mutex_unlock(&param->mutex);
415 return NULL;
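/* Set the quit flag on every compression thread and wake it up */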
418 static inline void terminate_compression_threads(void)
420 int idx, thread_count;
422 thread_count = migrate_compress_threads();
424 for (idx = 0; idx < thread_count; idx++) {
425 qemu_mutex_lock(&comp_param[idx].mutex);
426 comp_param[idx].quit = true;
427 qemu_cond_signal(&comp_param[idx].cond);
428 qemu_mutex_unlock(&comp_param[idx].mutex);
432 static void compress_threads_save_cleanup(void)
434 int i, thread_count;
436 if (!migrate_use_compression()) {
437 return;
439 terminate_compression_threads();
440 thread_count = migrate_compress_threads();
441 for (i = 0; i < thread_count; i++) {
443 * we use it as an indicator which shows if the thread is
444 * properly initialized or not
446 if (!comp_param[i].file) {
447 break;
449 qemu_thread_join(compress_threads + i);
450 qemu_mutex_destroy(&comp_param[i].mutex);
451 qemu_cond_destroy(&comp_param[i].cond);
452 deflateEnd(&comp_param[i].stream);
453 g_free(comp_param[i].originbuf);
454 qemu_fclose(comp_param[i].file);
455 comp_param[i].file = NULL;
457 qemu_mutex_destroy(&comp_done_lock);
458 qemu_cond_destroy(&comp_done_cond);
459 g_free(compress_threads);
460 g_free(comp_param);
461 compress_threads = NULL;
462 comp_param = NULL;
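/*
 * Create one compression thread per migrate_compress_threads().
 * Returns 0 on success, -1 on failure (partially created state is
 * torn down via compress_threads_save_cleanup()).
 */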
465 static int compress_threads_save_setup(void)
467 int i, thread_count;
469 if (!migrate_use_compression()) {
470 return 0;
472 thread_count = migrate_compress_threads();
473 compress_threads = g_new0(QemuThread, thread_count);
474 comp_param = g_new0(CompressParam, thread_count);
475 qemu_cond_init(&comp_done_cond);
476 qemu_mutex_init(&comp_done_lock);
477 for (i = 0; i < thread_count; i++) {
478 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
479 if (!comp_param[i].originbuf) {
480 goto exit;
483 if (deflateInit(&comp_param[i].stream,
484 migrate_compress_level()) != Z_OK) {
485 g_free(comp_param[i].originbuf);
486 goto exit;
489 /* comp_param[i].file is just used as a dummy buffer to save data,
490 * set its ops to empty.
492 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
493 comp_param[i].done = true;
494 comp_param[i].quit = false;
495 qemu_mutex_init(&comp_param[i].mutex);
496 qemu_cond_init(&comp_param[i].cond);
497 qemu_thread_create(compress_threads + i, "compress",
498 do_data_compress, comp_param + i,
499 QEMU_THREAD_JOINABLE);
501 return 0;
503 exit:
504 compress_threads_save_cleanup();
505 return -1;
508 /* Multiple fd's */
510 #define MULTIFD_MAGIC 0x11223344U
511 #define MULTIFD_VERSION 1
513 typedef struct {
514 uint32_t magic;
515 uint32_t version;
516 unsigned char uuid[16]; /* QemuUUID */
517 uint8_t id;
518 } __attribute__((packed)) MultiFDInit_t;
520 typedef struct {
521 /* these fields are not changed once the thread is created */
522 /* channel number */
523 uint8_t id;
524 /* channel thread name */
525 char *name;
526 /* channel thread id */
527 QemuThread thread;
528 /* communication channel */
529 QIOChannel *c;
530 /* sem where to wait for more work */
531 QemuSemaphore sem;
532 /* this mutex protects the following parameters */
533 QemuMutex mutex;
534 /* is this channel thread running */
535 bool running;
536 /* should this thread finish */
537 bool quit;
538 } MultiFDSendParams;
540 typedef struct {
541 /* these fields are not changed once the thread is created */
542 /* channel number */
543 uint8_t id;
544 /* channel thread name */
545 char *name;
546 /* channel thread id */
547 QemuThread thread;
548 /* communication channel */
549 QIOChannel *c;
550 /* sem where to wait for more work */
551 QemuSemaphore sem;
552 /* this mutex protects the following parameters */
553 QemuMutex mutex;
554 /* is this channel thread running */
555 bool running;
556 /* should this thread finish */
557 bool quit;
558 } MultiFDRecvParams;
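/*
 * Send the per-channel handshake packet: magic, version, the source
 * VM uuid and the channel id.
 */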
560 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
562 MultiFDInit_t msg;
563 int ret;
565 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
566 msg.version = cpu_to_be32(MULTIFD_VERSION);
567 msg.id = p->id;
568 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
570 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
571 if (ret != 0) {
572 return -1;
574 return 0;
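/*
 * Read and validate the handshake packet from one incoming channel.
 * Returns the channel id on success, or -1 (with *errp set) on error.
 */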
577 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
579 MultiFDInit_t msg;
580 int ret;
582 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
583 if (ret != 0) {
584 return -1;
587 be32_to_cpus(&msg.magic);
588 be32_to_cpus(&msg.version);
590 if (msg.magic != MULTIFD_MAGIC) {
591 error_setg(errp, "multifd: received packet magic %x "
592 "expected %x", msg.magic, MULTIFD_MAGIC);
593 return -1;
596 if (msg.version != MULTIFD_VERSION) {
597 error_setg(errp, "multifd: received packet version %d "
598 "expected %d", msg.version, MULTIFD_VERSION);
599 return -1;
602 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
603 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
604 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
606 error_setg(errp, "multifd: received uuid '%s' and expected "
607 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
608 g_free(uuid);
609 g_free(msg_uuid);
610 return -1;
613 if (msg.id > migrate_multifd_channels()) {
614 error_setg(errp, "multifd: received channel id %d is greater "
615 "than the number of channels %d", msg.id, migrate_multifd_channels());
616 return -1;
619 return msg.id;
622 struct {
623 MultiFDSendParams *params;
624 /* number of created threads */
625 int count;
626 } *multifd_send_state;
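/*
 * If @err is set, record it and mark an in-progress migration as
 * failed, then ask every multifd send thread to quit and wake it up.
 */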
628 static void multifd_send_terminate_threads(Error *err)
630 int i;
632 if (err) {
633 MigrationState *s = migrate_get_current();
634 migrate_set_error(s, err);
635 if (s->state == MIGRATION_STATUS_SETUP ||
636 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
637 s->state == MIGRATION_STATUS_DEVICE ||
638 s->state == MIGRATION_STATUS_ACTIVE) {
639 migrate_set_state(&s->state, s->state,
640 MIGRATION_STATUS_FAILED);
644 for (i = 0; i < migrate_multifd_channels(); i++) {
645 MultiFDSendParams *p = &multifd_send_state->params[i];
647 qemu_mutex_lock(&p->mutex);
648 p->quit = true;
649 qemu_sem_post(&p->sem);
650 qemu_mutex_unlock(&p->mutex);
654 int multifd_save_cleanup(Error **errp)
656 int i;
657 int ret = 0;
659 if (!migrate_use_multifd()) {
660 return 0;
662 multifd_send_terminate_threads(NULL);
663 for (i = 0; i < migrate_multifd_channels(); i++) {
664 MultiFDSendParams *p = &multifd_send_state->params[i];
666 if (p->running) {
667 qemu_thread_join(&p->thread);
669 socket_send_channel_destroy(p->c);
670 p->c = NULL;
671 qemu_mutex_destroy(&p->mutex);
672 qemu_sem_destroy(&p->sem);
673 g_free(p->name);
674 p->name = NULL;
676 g_free(multifd_send_state->params);
677 multifd_send_state->params = NULL;
678 g_free(multifd_send_state);
679 multifd_send_state = NULL;
680 return ret;
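/*
 * Send thread body: transmit the initial packet, then sleep on the
 * semaphore until woken, exiting once the quit flag is set.
 */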
683 static void *multifd_send_thread(void *opaque)
685 MultiFDSendParams *p = opaque;
686 Error *local_err = NULL;
688 if (multifd_send_initial_packet(p, &local_err) < 0) {
689 goto out;
692 while (true) {
693 qemu_mutex_lock(&p->mutex);
694 if (p->quit) {
695 qemu_mutex_unlock(&p->mutex);
696 break;
698 qemu_mutex_unlock(&p->mutex);
699 qemu_sem_wait(&p->sem);
702 out:
703 if (local_err) {
704 multifd_send_terminate_threads(local_err);
707 qemu_mutex_lock(&p->mutex);
708 p->running = false;
709 qemu_mutex_unlock(&p->mutex);
711 return NULL;
714 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
716 MultiFDSendParams *p = opaque;
717 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
718 Error *local_err = NULL;
720 if (qio_task_propagate_error(task, &local_err)) {
721 if (multifd_save_cleanup(&local_err) != 0) {
722 migrate_set_error(migrate_get_current(), local_err);
724 } else {
725 p->c = QIO_CHANNEL(sioc);
726 qio_channel_set_delay(p->c, false);
727 p->running = true;
728 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
729 QEMU_THREAD_JOINABLE);
731 atomic_inc(&multifd_send_state->count);
735 int multifd_save_setup(void)
737 int thread_count;
738 uint8_t i;
740 if (!migrate_use_multifd()) {
741 return 0;
743 thread_count = migrate_multifd_channels();
744 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
745 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
746 atomic_set(&multifd_send_state->count, 0);
747 for (i = 0; i < thread_count; i++) {
748 MultiFDSendParams *p = &multifd_send_state->params[i];
750 qemu_mutex_init(&p->mutex);
751 qemu_sem_init(&p->sem, 0);
752 p->quit = false;
753 p->id = i;
754 p->name = g_strdup_printf("multifdsend_%d", i);
755 socket_send_channel_create(multifd_new_send_channel_async, p);
757 return 0;
760 struct {
761 MultiFDRecvParams *params;
762 /* number of created threads */
763 int count;
764 } *multifd_recv_state;
766 static void multifd_recv_terminate_threads(Error *err)
768 int i;
770 if (err) {
771 MigrationState *s = migrate_get_current();
772 migrate_set_error(s, err);
773 if (s->state == MIGRATION_STATUS_SETUP ||
774 s->state == MIGRATION_STATUS_ACTIVE) {
775 migrate_set_state(&s->state, s->state,
776 MIGRATION_STATUS_FAILED);
780 for (i = 0; i < migrate_multifd_channels(); i++) {
781 MultiFDRecvParams *p = &multifd_recv_state->params[i];
783 qemu_mutex_lock(&p->mutex);
784 p->quit = true;
785 qemu_sem_post(&p->sem);
786 qemu_mutex_unlock(&p->mutex);
790 int multifd_load_cleanup(Error **errp)
792 int i;
793 int ret = 0;
795 if (!migrate_use_multifd()) {
796 return 0;
798 multifd_recv_terminate_threads(NULL);
799 for (i = 0; i < migrate_multifd_channels(); i++) {
800 MultiFDRecvParams *p = &multifd_recv_state->params[i];
802 if (p->running) {
803 qemu_thread_join(&p->thread);
805 object_unref(OBJECT(p->c));
806 p->c = NULL;
807 qemu_mutex_destroy(&p->mutex);
808 qemu_sem_destroy(&p->sem);
809 g_free(p->name);
810 p->name = NULL;
812 g_free(multifd_recv_state->params);
813 multifd_recv_state->params = NULL;
814 g_free(multifd_recv_state);
815 multifd_recv_state = NULL;
817 return ret;
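/* Receive thread body: sleep on the semaphore until asked to quit */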
820 static void *multifd_recv_thread(void *opaque)
822 MultiFDRecvParams *p = opaque;
824 while (true) {
825 qemu_mutex_lock(&p->mutex);
826 if (p->quit) {
827 qemu_mutex_unlock(&p->mutex);
828 break;
830 qemu_mutex_unlock(&p->mutex);
831 qemu_sem_wait(&p->sem);
834 qemu_mutex_lock(&p->mutex);
835 p->running = false;
836 qemu_mutex_unlock(&p->mutex);
838 return NULL;
841 int multifd_load_setup(void)
843 int thread_count;
844 uint8_t i;
846 if (!migrate_use_multifd()) {
847 return 0;
849 thread_count = migrate_multifd_channels();
850 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
851 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
852 atomic_set(&multifd_recv_state->count, 0);
853 for (i = 0; i < thread_count; i++) {
854 MultiFDRecvParams *p = &multifd_recv_state->params[i];
856 qemu_mutex_init(&p->mutex);
857 qemu_sem_init(&p->sem, 0);
858 p->quit = false;
859 p->id = i;
860 p->name = g_strdup_printf("multifdrecv_%d", i);
862 return 0;
865 bool multifd_recv_all_channels_created(void)
867 int thread_count = migrate_multifd_channels();
869 if (!migrate_use_multifd()) {
870 return true;
873 return thread_count == atomic_read(&multifd_recv_state->count);
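/*
 * Attach a newly accepted channel to its MultiFDRecvParams slot (as
 * identified by the handshake packet) and start its receive thread;
 * once all channels are up, kick off the incoming migration.
 */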
876 void multifd_recv_new_channel(QIOChannel *ioc)
878 MultiFDRecvParams *p;
879 Error *local_err = NULL;
880 int id;
882 id = multifd_recv_initial_packet(ioc, &local_err);
883 if (id < 0) {
884 multifd_recv_terminate_threads(local_err);
885 return;
888 p = &multifd_recv_state->params[id];
889 if (p->c != NULL) {
890 error_setg(&local_err, "multifd: received id '%d' already setup",
891 id);
892 multifd_recv_terminate_threads(local_err);
893 return;
895 p->c = ioc;
896 object_ref(OBJECT(ioc));
898 p->running = true;
899 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
900 QEMU_THREAD_JOINABLE);
901 atomic_inc(&multifd_recv_state->count);
902 if (multifd_recv_state->count == migrate_multifd_channels()) {
903 migration_incoming_process();
908 * save_page_header: write page header to wire
910 * If this is the 1st block, it also writes the block identification
912 * Returns the number of bytes written
914 * @f: QEMUFile where to send the data
915 * @block: block that contains the page we want to send
916 * @offset: offset inside the block for the page
917 * in the lower bits, it contains flags
919 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
920 ram_addr_t offset)
922 size_t size, len;
924 if (block == rs->last_sent_block) {
925 offset |= RAM_SAVE_FLAG_CONTINUE;
927 qemu_put_be64(f, offset);
928 size = 8;
930 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
931 len = strlen(block->idstr);
932 qemu_put_byte(f, len);
933 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
934 size += 1 + len;
935 rs->last_sent_block = block;
937 return size;
941 * mig_throttle_guest_down: throttle down the guest
943 * Reduce amount of guest cpu execution to hopefully slow down memory
944 * writes. If guest dirty memory rate is reduced below the rate at
945 * which we can transfer pages to the destination then we should be
946 * able to complete migration. Some workloads dirty memory way too
947 * fast and will not effectively converge, even with auto-converge.
949 static void mig_throttle_guest_down(void)
951 MigrationState *s = migrate_get_current();
952 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
953 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
955 /* We have not started throttling yet. Let's start it. */
956 if (!cpu_throttle_active()) {
957 cpu_throttle_set(pct_initial);
958 } else {
959 /* Throttling already on, just increase the rate */
960 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
965 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
967 * @rs: current RAM state
968 * @current_addr: address for the zero page
970 * Update the xbzrle cache to reflect a page that's been sent as all 0.
971 * The important thing is that a stale (not-yet-0'd) page be replaced
972 * by the new data.
973 * As a bonus, if the page wasn't in the cache it gets added so that
974 * when a small write is made into the 0'd page it gets XBZRLE sent.
976 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
978 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
979 return;
982 /* We don't care if this fails to allocate a new cache page
983 * as long as it updated an old one */
984 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
985 ram_counters.dirty_sync_count);
988 #define ENCODING_FLAG_XBZRLE 0x1
991 * save_xbzrle_page: compress and send current page
993 * Returns: 1 means that we wrote the page
994 * 0 means that page is identical to the one already sent
995 * -1 means that xbzrle would be longer than normal
997 * @rs: current RAM state
998 * @current_data: pointer to the address of the page contents
999 * @current_addr: addr of the page
1000 * @block: block that contains the page we want to send
1001 * @offset: offset inside the block for the page
1002 * @last_stage: if we are at the completion stage
1004 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1005 ram_addr_t current_addr, RAMBlock *block,
1006 ram_addr_t offset, bool last_stage)
1008 int encoded_len = 0, bytes_xbzrle;
1009 uint8_t *prev_cached_page;
1011 if (!cache_is_cached(XBZRLE.cache, current_addr,
1012 ram_counters.dirty_sync_count)) {
1013 xbzrle_counters.cache_miss++;
1014 if (!last_stage) {
1015 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1016 ram_counters.dirty_sync_count) == -1) {
1017 return -1;
1018 } else {
1019 /* update *current_data when the page has been
1020 inserted into cache */
1021 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1024 return -1;
1027 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1029 /* save current buffer into memory */
1030 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1032 /* XBZRLE encoding (if there is no overflow) */
1033 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1034 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1035 TARGET_PAGE_SIZE);
1036 if (encoded_len == 0) {
1037 trace_save_xbzrle_page_skipping();
1038 return 0;
1039 } else if (encoded_len == -1) {
1040 trace_save_xbzrle_page_overflow();
1041 xbzrle_counters.overflow++;
1042 /* update data in the cache */
1043 if (!last_stage) {
1044 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1045 *current_data = prev_cached_page;
1047 return -1;
1050 /* we need to update the data in the cache, in order to get the same data */
1051 if (!last_stage) {
1052 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1055 /* Send XBZRLE based compressed page */
1056 bytes_xbzrle = save_page_header(rs, rs->f, block,
1057 offset | RAM_SAVE_FLAG_XBZRLE);
1058 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1059 qemu_put_be16(rs->f, encoded_len);
1060 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1061 bytes_xbzrle += encoded_len + 1 + 2;
1062 xbzrle_counters.pages++;
1063 xbzrle_counters.bytes += bytes_xbzrle;
1064 ram_counters.transferred += bytes_xbzrle;
1066 return 1;
1070 * migration_bitmap_find_dirty: find the next dirty page from start
1072 * Called with rcu_read_lock() to protect migration_bitmap
1074 * Returns the byte offset within memory region of the start of a dirty page
1076 * @rs: current RAM state
1077 * @rb: RAMBlock where to search for dirty pages
1078 * @start: page where we start the search
1080 static inline
1081 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1082 unsigned long start)
1084 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1085 unsigned long *bitmap = rb->bmap;
1086 unsigned long next;
1088 if (!qemu_ram_is_migratable(rb)) {
1089 return size;
1092 if (rs->ram_bulk_stage && start > 0) {
1093 next = start + 1;
1094 } else {
1095 next = find_next_bit(bitmap, size, start);
1098 return next;
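/*
 * Clear the dirty bit for @page in @rb's bitmap; if it was set,
 * decrement the count of remaining dirty pages. Returns the old value.
 */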
1101 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1102 RAMBlock *rb,
1103 unsigned long page)
1105 bool ret;
1107 ret = test_and_clear_bit(page, rb->bmap);
1109 if (ret) {
1110 rs->migration_dirty_pages--;
1112 return ret;
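/*
 * Pull the dirty bits for [start, start + length) from the memory API
 * into @rb's migration bitmap and account the newly dirtied pages.
 */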
1115 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1116 ram_addr_t start, ram_addr_t length)
1118 rs->migration_dirty_pages +=
1119 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1120 &rs->num_dirty_pages_period);
1124 * ram_pagesize_summary: calculate all the pagesizes of a VM
1126 * Returns a summary bitmap of the page sizes of all RAMBlocks
1128 * For VMs with just normal pages this is equivalent to the host page
1129 * size. If it's got some huge pages then it's the OR of all the
1130 * different page sizes.
1132 uint64_t ram_pagesize_summary(void)
1134 RAMBlock *block;
1135 uint64_t summary = 0;
1137 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1138 summary |= block->page_size;
1141 return summary;
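/*
 * Recompute the per-period counters: dirty pages per second and, when
 * XBZRLE is in use, the cache miss rate.
 */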
1144 static void migration_update_rates(RAMState *rs, int64_t end_time)
1146 uint64_t iter_count = rs->iterations - rs->iterations_prev;
1148 /* calculate period counters */
1149 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1150 / (end_time - rs->time_last_bitmap_sync);
1152 if (!iter_count) {
1153 return;
1156 if (migrate_use_xbzrle()) {
1157 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1158 rs->xbzrle_cache_miss_prev) / iter_count;
1159 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
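/*
 * Sync the dirty bitmap of every migratable RAMBlock with the memory
 * API, and (at most once per second) update the rate counters and the
 * auto-converge throttling decision.
 */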
1163 static void migration_bitmap_sync(RAMState *rs)
1165 RAMBlock *block;
1166 int64_t end_time;
1167 uint64_t bytes_xfer_now;
1169 ram_counters.dirty_sync_count++;
1171 if (!rs->time_last_bitmap_sync) {
1172 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1175 trace_migration_bitmap_sync_start();
1176 memory_global_dirty_log_sync();
1178 qemu_mutex_lock(&rs->bitmap_mutex);
1179 rcu_read_lock();
1180 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1181 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1183 ram_counters.remaining = ram_bytes_remaining();
1184 rcu_read_unlock();
1185 qemu_mutex_unlock(&rs->bitmap_mutex);
1187 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1189 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1191 /* more than 1 second = 1000 milliseconds */
1192 if (end_time > rs->time_last_bitmap_sync + 1000) {
1193 bytes_xfer_now = ram_counters.transferred;
1195 /* During block migration the auto-converge logic incorrectly detects
1196 * that ram migration makes no progress. Avoid this by disabling the
1197 * throttling logic during the bulk phase of block migration. */
1198 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1199 /* The following detection logic can be refined later. For now:
1200 Check to see if the dirtied bytes are 50% more than the approx.
1201 amount of bytes that just got transferred since the last time we
1202 were in this routine. If that happens twice, start or increase
1203 throttling */
1205 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1206 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1207 (++rs->dirty_rate_high_cnt >= 2)) {
1208 trace_migration_throttle();
1209 rs->dirty_rate_high_cnt = 0;
1210 mig_throttle_guest_down();
1214 migration_update_rates(rs, end_time);
1216 rs->iterations_prev = rs->iterations;
1218 /* reset period counters */
1219 rs->time_last_bitmap_sync = end_time;
1220 rs->num_dirty_pages_period = 0;
1221 rs->bytes_xfer_prev = bytes_xfer_now;
1223 if (migrate_use_events()) {
1224 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1229 * save_zero_page: send the zero page to the stream
1231 * Returns the number of pages written.
1233 * @rs: current RAM state
1234 * @block: block that contains the page we want to send
1235 * @offset: offset inside the block for the page
1237 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1239 uint8_t *p = block->host + offset;
1240 int pages = -1;
1242 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1243 ram_counters.duplicate++;
1244 ram_counters.transferred +=
1245 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1246 qemu_put_byte(rs->f, 0);
1247 ram_counters.transferred += 1;
1248 pages = 1;
1251 return pages;
1254 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1256 if (!migrate_release_ram() || !migration_in_postcopy()) {
1257 return;
1260 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1264 * @pages: the number of pages written by the control path,
1265 * < 0 - error
1266 * > 0 - number of pages written
1268 * Return true if the page has been saved, otherwise false is returned.
1270 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1271 int *pages)
1273 uint64_t bytes_xmit = 0;
1274 int ret;
1276 *pages = -1;
1277 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1278 &bytes_xmit);
1279 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1280 return false;
1283 if (bytes_xmit) {
1284 ram_counters.transferred += bytes_xmit;
1285 *pages = 1;
1288 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1289 return true;
1292 if (bytes_xmit > 0) {
1293 ram_counters.normal++;
1294 } else if (bytes_xmit == 0) {
1295 ram_counters.duplicate++;
1298 return true;
1302 * directly send the page to the stream
1304 * Returns the number of pages written.
1306 * @rs: current RAM state
1307 * @block: block that contains the page we want to send
1308 * @offset: offset inside the block for the page
1309 * @buf: the page to be sent
1310 * @async: send the page asynchronously
1312 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1313 uint8_t *buf, bool async)
1315 ram_counters.transferred += save_page_header(rs, rs->f, block,
1316 offset | RAM_SAVE_FLAG_PAGE);
1317 if (async) {
1318 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1319 migrate_release_ram() &&
1320 migration_in_postcopy());
1321 } else {
1322 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1324 ram_counters.transferred += TARGET_PAGE_SIZE;
1325 ram_counters.normal++;
1326 return 1;
1330 * ram_save_page: send the given page to the stream
1332 * Returns the number of pages written.
1333 * < 0 - error
1334 * >=0 - Number of pages written - this might legally be 0
1335 * if xbzrle noticed the page was the same.
1337 * @rs: current RAM state
1338 * @block: block that contains the page we want to send
1339 * @offset: offset inside the block for the page
1340 * @last_stage: if we are at the completion stage
1342 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1344 int pages = -1;
1345 uint8_t *p;
1346 bool send_async = true;
1347 RAMBlock *block = pss->block;
1348 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1349 ram_addr_t current_addr = block->offset + offset;
1351 p = block->host + offset;
1352 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1354 XBZRLE_cache_lock();
1355 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1356 migrate_use_xbzrle()) {
1357 pages = save_xbzrle_page(rs, &p, current_addr, block,
1358 offset, last_stage);
1359 if (!last_stage) {
1360 /* Can't send this cached data async, since the cache page
1361 * might get updated before it gets to the wire
1363 send_async = false;
1367 /* XBZRLE overflow or normal page */
1368 if (pages == -1) {
1369 pages = save_normal_page(rs, block, offset, p, send_async);
1372 XBZRLE_cache_unlock();
1374 return pages;
1377 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1378 ram_addr_t offset, uint8_t *source_buf)
1380 RAMState *rs = ram_state;
1381 int bytes_sent, blen;
1382 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1384 bytes_sent = save_page_header(rs, f, block, offset |
1385 RAM_SAVE_FLAG_COMPRESS_PAGE);
1388 * copy it to an internal buffer to avoid it being modified by the VM,
1389 * so that we can catch any error during compression and
1390 * decompression
1392 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1393 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1394 if (blen < 0) {
1395 bytes_sent = 0;
1396 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1397 error_report("compressed data failed!");
1398 } else {
1399 bytes_sent += blen;
1400 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1403 return bytes_sent;
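/*
 * Wait for every compression thread to finish its current page, then
 * flush each thread's buffered output into the migration stream.
 */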
1406 static void flush_compressed_data(RAMState *rs)
1408 int idx, len, thread_count;
1410 if (!migrate_use_compression()) {
1411 return;
1413 thread_count = migrate_compress_threads();
1415 qemu_mutex_lock(&comp_done_lock);
1416 for (idx = 0; idx < thread_count; idx++) {
1417 while (!comp_param[idx].done) {
1418 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1421 qemu_mutex_unlock(&comp_done_lock);
1423 for (idx = 0; idx < thread_count; idx++) {
1424 qemu_mutex_lock(&comp_param[idx].mutex);
1425 if (!comp_param[idx].quit) {
1426 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1427 ram_counters.transferred += len;
1429 qemu_mutex_unlock(&comp_param[idx].mutex);
1433 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1434 ram_addr_t offset)
1436 param->block = block;
1437 param->offset = offset;
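/*
 * Hand (block, offset) to an idle compression thread, flushing that
 * thread's previous output first; block on comp_done_cond until a
 * thread becomes available.
 */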
1440 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1441 ram_addr_t offset)
1443 int idx, thread_count, bytes_xmit = -1, pages = -1;
1445 thread_count = migrate_compress_threads();
1446 qemu_mutex_lock(&comp_done_lock);
1447 while (true) {
1448 for (idx = 0; idx < thread_count; idx++) {
1449 if (comp_param[idx].done) {
1450 comp_param[idx].done = false;
1451 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1452 qemu_mutex_lock(&comp_param[idx].mutex);
1453 set_compress_params(&comp_param[idx], block, offset);
1454 qemu_cond_signal(&comp_param[idx].cond);
1455 qemu_mutex_unlock(&comp_param[idx].mutex);
1456 pages = 1;
1457 ram_counters.normal++;
1458 ram_counters.transferred += bytes_xmit;
1459 break;
1462 if (pages > 0) {
1463 break;
1464 } else {
1465 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1468 qemu_mutex_unlock(&comp_done_lock);
1470 return pages;
1474 * find_dirty_block: find the next dirty page and update any state
1475 * associated with the search process.
1477 * Returns true if a page is found
1479 * @rs: current RAM state
1480 * @pss: data about the state of the current dirty page scan
1481 * @again: set to false if the search has scanned the whole of RAM
1483 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1485 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1486 if (pss->complete_round && pss->block == rs->last_seen_block &&
1487 pss->page >= rs->last_page) {
1489 * We've been once around the RAM and haven't found anything.
1490 * Give up.
1492 *again = false;
1493 return false;
1495 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1496 /* Didn't find anything in this RAM Block */
1497 pss->page = 0;
1498 pss->block = QLIST_NEXT_RCU(pss->block, next);
1499 if (!pss->block) {
1500 /* Hit the end of the list */
1501 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1502 /* Flag that we've looped */
1503 pss->complete_round = true;
1504 rs->ram_bulk_stage = false;
1505 if (migrate_use_xbzrle()) {
1506 /* If xbzrle is on, stop using the data compression at this
1507 * point. In theory, xbzrle can do better than compression.
1509 flush_compressed_data(rs);
1512 /* Didn't find anything this time, but try again on the new block */
1513 *again = true;
1514 return false;
1515 } else {
1516 /* Can go around again, but... */
1517 *again = true;
1518 /* We've found something so probably don't need to */
1519 return true;
1524 * unqueue_page: gets a page off the queue
1526 * Helper for 'get_queued_page' - gets a page off the queue
1528 * Returns the block of the page (or NULL if none available)
1530 * @rs: current RAM state
1531 * @offset: used to return the offset within the RAMBlock
1533 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1535 RAMBlock *block = NULL;
1537 qemu_mutex_lock(&rs->src_page_req_mutex);
1538 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1539 struct RAMSrcPageRequest *entry =
1540 QSIMPLEQ_FIRST(&rs->src_page_requests);
1541 block = entry->rb;
1542 *offset = entry->offset;
1544 if (entry->len > TARGET_PAGE_SIZE) {
1545 entry->len -= TARGET_PAGE_SIZE;
1546 entry->offset += TARGET_PAGE_SIZE;
1547 } else {
1548 memory_region_unref(block->mr);
1549 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1550 g_free(entry);
1551 migration_consume_urgent_request();
1554 qemu_mutex_unlock(&rs->src_page_req_mutex);
1556 return block;
1560 * get_queued_page: unqueue a page from the postcopy requests
1562 * Skips pages that are already sent (!dirty)
1564 * Returns true if a queued page is found
1566 * @rs: current RAM state
1567 * @pss: data about the state of the current dirty page scan
1569 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1571 RAMBlock *block;
1572 ram_addr_t offset;
1573 bool dirty;
1575 do {
1576 block = unqueue_page(rs, &offset);
1578 * We're sending this page, and since it's postcopy nothing else
1579 * will dirty it, and we must make sure it doesn't get sent again
1580 * even if this queue request was received after the background
1581 * search already sent it.
1583 if (block) {
1584 unsigned long page;
1586 page = offset >> TARGET_PAGE_BITS;
1587 dirty = test_bit(page, block->bmap);
1588 if (!dirty) {
1589 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1590 page, test_bit(page, block->unsentmap));
1591 } else {
1592 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1596 } while (block && !dirty);
1598 if (block) {
1600 * As soon as we start servicing pages out of order, then we have
1601 * to kill the bulk stage, since the bulk stage assumes
1602 * (in migration_bitmap_find_dirty) that every page is
1603 * dirty, and that's no longer true.
1605 rs->ram_bulk_stage = false;
1608 * We want the background search to continue from the queued page
1609 * since the guest is likely to want other pages near to the page
1610 * it just requested.
1612 pss->block = block;
1613 pss->page = offset >> TARGET_PAGE_BITS;
1616 return !!block;
1620 * migration_page_queue_free: drop any remaining pages in the ram
1621 * request queue
1623 * It should be empty at the end anyway, but in error cases there may
1624 * be some left. In case any pages are left, we drop them.
1627 static void migration_page_queue_free(RAMState *rs)
1629 struct RAMSrcPageRequest *mspr, *next_mspr;
1630 /* This queue generally should be empty - but in the case of a failed
1631 * migration it might have some droppings in it.
1633 rcu_read_lock();
1634 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1635 memory_region_unref(mspr->rb->mr);
1636 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1637 g_free(mspr);
1639 rcu_read_unlock();
1643 * ram_save_queue_pages: queue the page for transmission
1645 * A request from postcopy destination for example.
1647 * Returns zero on success or negative on error
1649 * @rbname: Name of the RAMBlock of the request. NULL means the
1650 * same as the last one.
1651 * @start: starting address from the start of the RAMBlock
1652 * @len: length (in bytes) to send
1654 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1656 RAMBlock *ramblock;
1657 RAMState *rs = ram_state;
1659 ram_counters.postcopy_requests++;
1660 rcu_read_lock();
1661 if (!rbname) {
1662 /* Reuse last RAMBlock */
1663 ramblock = rs->last_req_rb;
1665 if (!ramblock) {
1667 * Shouldn't happen, we can't reuse the last RAMBlock if
1668 * it's the 1st request.
1670 error_report("ram_save_queue_pages no previous block");
1671 goto err;
1673 } else {
1674 ramblock = qemu_ram_block_by_name(rbname);
1676 if (!ramblock) {
1677 /* We shouldn't be asked for a non-existent RAMBlock */
1678 error_report("ram_save_queue_pages no block '%s'", rbname);
1679 goto err;
1681 rs->last_req_rb = ramblock;
1683 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1684 if (start+len > ramblock->used_length) {
1685 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1686 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1687 __func__, start, len, ramblock->used_length);
1688 goto err;
1691 struct RAMSrcPageRequest *new_entry =
1692 g_malloc0(sizeof(struct RAMSrcPageRequest));
1693 new_entry->rb = ramblock;
1694 new_entry->offset = start;
1695 new_entry->len = len;
1697 memory_region_ref(ramblock->mr);
1698 qemu_mutex_lock(&rs->src_page_req_mutex);
1699 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1700 migration_make_urgent_request();
1701 qemu_mutex_unlock(&rs->src_page_req_mutex);
1702 rcu_read_unlock();
1704 return 0;
1706 err:
1707 rcu_read_unlock();
1708 return -1;
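/*
 * Compression is used only while it is enabled, and (when XBZRLE is
 * also enabled) only during the first round of migration.
 */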
1711 static bool save_page_use_compression(RAMState *rs)
1713 if (!migrate_use_compression()) {
1714 return false;
1718 * If xbzrle is on, stop using the data compression after first
1719 * round of migration even if compression is enabled. In theory,
1720 * xbzrle can do better than compression.
1722 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1723 return true;
1726 return false;
1730 * ram_save_target_page: save one target page
1732 * Returns the number of pages written
1734 * @rs: current RAM state
1735 * @pss: data about the page we want to send
1736 * @last_stage: if we are at the completion stage
1738 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1739 bool last_stage)
1741 RAMBlock *block = pss->block;
1742 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1743 int res;
1745 if (control_save_page(rs, block, offset, &res)) {
1746 return res;
1750 * When starting the process of a new block, the first page of
1751 * the block should be sent out before other pages in the same
1752 * block, and all the pages in the last block should have been sent
1753 * out; keeping this order is important, because the 'cont' flag
1754 * is used to avoid resending the block name.
1756 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1757 flush_compressed_data(rs);
1760 res = save_zero_page(rs, block, offset);
1761 if (res > 0) {
1762 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1763 * page would be stale
1765 if (!save_page_use_compression(rs)) {
1766 XBZRLE_cache_lock();
1767 xbzrle_cache_zero_page(rs, block->offset + offset);
1768 XBZRLE_cache_unlock();
1770 ram_release_pages(block->idstr, offset, res);
1771 return res;
1775 * Make sure the first page is sent out before other pages.
1777 * We post it as a normal page, as compression would take a lot of
1778 * CPU resources.
1780 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1781 return compress_page_with_multi_thread(rs, block, offset);
1784 return ram_save_page(rs, pss, last_stage);
1788 * ram_save_host_page: save a whole host page
1790 * Starting at *offset send pages up to the end of the current host
1791 * page. It's valid for the initial offset to point into the middle of
1792 * a host page in which case the remainder of the hostpage is sent.
1793 * Only dirty target pages are sent. Note that the host page size may
1794 * be a huge page for this block.
1795 * The saving stops at the boundary of the used_length of the block
1796 * if the RAMBlock isn't a multiple of the host page size.
1798 * Returns the number of pages written or negative on error
1800 * @rs: current RAM state
1801 * @ms: current migration state
1802 * @pss: data about the page we want to send
1803 * @last_stage: if we are at the completion stage
1805 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1806 bool last_stage)
1808 int tmppages, pages = 0;
1809 size_t pagesize_bits =
1810 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1812 if (!qemu_ram_is_migratable(pss->block)) {
1813 error_report("block %s should not be migrated !", pss->block->idstr);
1814 return 0;
1817 do {
1818 /* Check if the page is dirty and if it is, send it */
1819 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1820 pss->page++;
1821 continue;
1824 tmppages = ram_save_target_page(rs, pss, last_stage);
1825 if (tmppages < 0) {
1826 return tmppages;
1829 pages += tmppages;
1830 if (pss->block->unsentmap) {
1831 clear_bit(pss->page, pss->block->unsentmap);
1834 pss->page++;
1835 } while ((pss->page & (pagesize_bits - 1)) &&
1836 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1838 /* The offset we leave with is the last one we looked at */
1839 pss->page--;
1840 return pages;
1844 * ram_find_and_save_block: finds a dirty page and sends it to f
1846 * Called within an RCU critical section.
1848 * Returns the number of pages written where zero means no dirty pages
1850 * @rs: current RAM state
1851 * @last_stage: if we are at the completion stage
1853 * On systems where host-page-size > target-page-size it will send all the
1854 * pages in a host page that are dirty.
1857 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1859 PageSearchStatus pss;
1860 int pages = 0;
1861 bool again, found;
1863 /* No dirty page as there is zero RAM */
1864 if (!ram_bytes_total()) {
1865 return pages;
1868 pss.block = rs->last_seen_block;
1869 pss.page = rs->last_page;
1870 pss.complete_round = false;
1872 if (!pss.block) {
1873 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1876 do {
1877 again = true;
1878 found = get_queued_page(rs, &pss);
1880 if (!found) {
1881 /* priority queue empty, so just search for something dirty */
1882 found = find_dirty_block(rs, &pss, &again);
1885 if (found) {
1886 pages = ram_save_host_page(rs, &pss, last_stage);
1888 } while (!pages && again);
1890 rs->last_seen_block = pss.block;
1891 rs->last_page = pss.page;
1893 return pages;
1896 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1898 uint64_t pages = size / TARGET_PAGE_SIZE;
1900 if (zero) {
1901 ram_counters.duplicate += pages;
1902 } else {
1903 ram_counters.normal += pages;
1904 ram_counters.transferred += size;
1905 qemu_update_position(f, size);
1909 uint64_t ram_bytes_total(void)
1911 RAMBlock *block;
1912 uint64_t total = 0;
1914 rcu_read_lock();
1915 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1916 total += block->used_length;
1918 rcu_read_unlock();
1919 return total;
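/* Incoming side: allocate/free the buffer used to decode XBZRLE pages */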
1922 static void xbzrle_load_setup(void)
1924 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1927 static void xbzrle_load_cleanup(void)
1929 g_free(XBZRLE.decoded_buf);
1930 XBZRLE.decoded_buf = NULL;
1933 static void ram_state_cleanup(RAMState **rsp)
1935 if (*rsp) {
1936 migration_page_queue_free(*rsp);
1937 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1938 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1939 g_free(*rsp);
1940 *rsp = NULL;
1944 static void xbzrle_cleanup(void)
1946 XBZRLE_cache_lock();
1947 if (XBZRLE.cache) {
1948 cache_fini(XBZRLE.cache);
1949 g_free(XBZRLE.encoded_buf);
1950 g_free(XBZRLE.current_buf);
1951 g_free(XBZRLE.zero_target_page);
1952 XBZRLE.cache = NULL;
1953 XBZRLE.encoded_buf = NULL;
1954 XBZRLE.current_buf = NULL;
1955 XBZRLE.zero_target_page = NULL;
1957 XBZRLE_cache_unlock();
1960 static void ram_save_cleanup(void *opaque)
1962 RAMState **rsp = opaque;
1963 RAMBlock *block;
1965 /* the caller must hold the iothread lock or be in a bh, so there is
1966 * no writing race against this migration_bitmap
1968 memory_global_dirty_log_stop();
1970 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1971 g_free(block->bmap);
1972 block->bmap = NULL;
1973 g_free(block->unsentmap);
1974 block->unsentmap = NULL;
1977 xbzrle_cleanup();
1978 compress_threads_save_cleanup();
1979 ram_state_cleanup(rsp);
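/*
 * Reset the dirty-page scan state so the next pass starts from the
 * first block, in bulk stage.
 */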
1982 static void ram_state_reset(RAMState *rs)
1984 rs->last_seen_block = NULL;
1985 rs->last_sent_block = NULL;
1986 rs->last_page = 0;
1987 rs->last_version = ram_list.version;
1988 rs->ram_bulk_stage = true;
1991 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1994 * 'expected' is the value you expect the bitmap mostly to be full
1995 * of; it won't bother printing lines that are all this value.
1996 * If 'todump' is null the migration bitmap is dumped.
1998 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1999 unsigned long pages)
2001 int64_t cur;
2002 int64_t linelen = 128;
2003 char linebuf[129];
2005 for (cur = 0; cur < pages; cur += linelen) {
2006 int64_t curb;
2007 bool found = false;
2009 * Last line; catch the case where the line length
2010 * is longer than remaining ram
2012 if (cur + linelen > pages) {
2013 linelen = pages - cur;
2015 for (curb = 0; curb < linelen; curb++) {
2016 bool thisbit = test_bit(cur + curb, todump);
2017 linebuf[curb] = thisbit ? '1' : '.';
2018 found = found || (thisbit != expected);
2020 if (found) {
2021 linebuf[curb] = '\0';
2022 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2027 /* **** functions for postcopy ***** */
2029 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2031 struct RAMBlock *block;
2033 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2034 unsigned long *bitmap = block->bmap;
2035 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2036 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2038 while (run_start < range) {
2039 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2040 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2041 (run_end - run_start) << TARGET_PAGE_BITS);
2042 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2048 * postcopy_send_discard_bm_ram: discard a RAMBlock
2050 * Returns zero on success
2052 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2053 * Note: At this point the 'unsentmap' is the processed bitmap combined
2054 * with the dirtymap; so a '1' means it's either dirty or unsent.
2056 * @ms: current migration state
2057 * @pds: state for postcopy
2058 * @start: RAMBlock starting page
2059 * @length: RAMBlock size
2061 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2062 PostcopyDiscardState *pds,
2063 RAMBlock *block)
2065 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2066 unsigned long current;
2067 unsigned long *unsentmap = block->unsentmap;
2069 for (current = 0; current < end; ) {
2070 unsigned long one = find_next_bit(unsentmap, end, current);
2072 if (one <= end) {
2073 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2074 unsigned long discard_length;
2076 if (zero >= end) {
2077 discard_length = end - one;
2078 } else {
2079 discard_length = zero - one;
2081 if (discard_length) {
2082 postcopy_discard_send_range(ms, pds, one, discard_length);
2084 current = one + discard_length;
2085 } else {
2086 current = one;
2090 return 0;
2094 * postcopy_each_ram_send_discard: discard all RAMBlocks
2096 * Returns 0 for success or negative for error
2098 * Utility for the outgoing postcopy code.
2099 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2100 * passing it bitmap indexes and name.
2101 * (qemu_ram_foreach_block ends up passing unscaled lengths
2102 * which would mean postcopy code would have to deal with target page)
2104 * @ms: current migration state
2106 static int postcopy_each_ram_send_discard(MigrationState *ms)
2108 struct RAMBlock *block;
2109 int ret;
2111 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2112 PostcopyDiscardState *pds =
2113 postcopy_discard_send_init(ms, block->idstr);
2116 * Postcopy sends chunks of bitmap over the wire, but it
2117 * just needs indexes at this point; this avoids it needing
2118 * target-page-specific code.
2120 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2121 postcopy_discard_send_finish(ms, pds);
2122 if (ret) {
2123 return ret;
2127 return 0;
2131 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
2133 * Helper for postcopy_chunk_hostpages; it's called twice to
2134 * canonicalize the two bitmaps, that are similar, but one is
2135 * inverted.
2137 * Postcopy requires that all target pages in a hostpage are dirty or
2138 * clean, not a mix. This function canonicalizes the bitmaps.
2140 * @ms: current migration state
2141 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2142 * otherwise we need to canonicalize partially dirty host pages
2143 * @block: block that contains the page we want to canonicalize
2144 * @pds: state for postcopy
2146 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2147 RAMBlock *block,
2148 PostcopyDiscardState *pds)
2150 RAMState *rs = ram_state;
2151 unsigned long *bitmap = block->bmap;
2152 unsigned long *unsentmap = block->unsentmap;
2153 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2154 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2155 unsigned long run_start;
2157 if (block->page_size == TARGET_PAGE_SIZE) {
2158 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2159 return;
2162 if (unsent_pass) {
2163 /* Find a sent page */
2164 run_start = find_next_zero_bit(unsentmap, pages, 0);
2165 } else {
2166 /* Find a dirty page */
2167 run_start = find_next_bit(bitmap, pages, 0);
2170 while (run_start < pages) {
2171 bool do_fixup = false;
2172 unsigned long fixup_start_addr;
2173 unsigned long host_offset;
2176 * If the start of this run of pages is in the middle of a host
2177 * page, then we need to fixup this host page.
2179 host_offset = run_start % host_ratio;
2180 if (host_offset) {
2181 do_fixup = true;
2182 run_start -= host_offset;
2183 fixup_start_addr = run_start;
2184 /* For the next pass */
2185 run_start = run_start + host_ratio;
2186 } else {
2187 /* Find the end of this run */
2188 unsigned long run_end;
2189 if (unsent_pass) {
2190 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2191 } else {
2192 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2195 * If the end isn't at the start of a host page, then the
2196 * run doesn't finish at the end of a host page
2197 * and we need to discard.
2199 host_offset = run_end % host_ratio;
2200 if (host_offset) {
2201 do_fixup = true;
2202 fixup_start_addr = run_end - host_offset;
2204 * This host page has gone, the next loop iteration starts
2205 * from after the fixup
2207 run_start = fixup_start_addr + host_ratio;
2208 } else {
2210 * No discards on this iteration, next loop starts from
2211 * next sent/dirty page
2213 run_start = run_end + 1;
2217 if (do_fixup) {
2218 unsigned long page;
2220 /* Tell the destination to discard this page */
2221 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2222 /* For the unsent_pass we:
2223 * discard partially sent pages
2224 * For the !unsent_pass (dirty) we:
2225 * discard partially dirty pages that were sent
2226 * (any partially sent pages were already discarded
2227 * by the previous unsent_pass)
2229 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2230 host_ratio);
2233 /* Clean up the bitmap */
2234 for (page = fixup_start_addr;
2235 page < fixup_start_addr + host_ratio; page++) {
2236 /* All pages in this host page are now not sent */
2237 set_bit(page, unsentmap);
2240 * Remark them as dirty, updating the count for any pages
2241 * that weren't previously dirty.
2243 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2247 if (unsent_pass) {
2248 /* Find the next sent page for the next iteration */
2249 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2250 } else {
2251 /* Find the next dirty page for the next iteration */
2252 run_start = find_next_bit(bitmap, pages, run_start);
2258 * postcopy_chunk_hostpages: discard any partially sent host page
2260 * Utility for the outgoing postcopy code.
2262 * Discard any partially sent host-page size chunks, and mark any partially
2263 * dirty host-page size chunks as all dirty. Here the host page is
2264 * the host page for the particular RAMBlock, i.e. it might be a huge page.
2266 * Returns zero on success
2268 * @ms: current migration state
2269 * @block: block we want to work with
2271 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2273 PostcopyDiscardState *pds =
2274 postcopy_discard_send_init(ms, block->idstr);
2276 /* First pass: Discard all partially sent host pages */
2277 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2279 * Second pass: Ensure that all partially dirty host pages are made
2280 * fully dirty.
2282 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2284 postcopy_discard_send_finish(ms, pds);
2285 return 0;
2289 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2291 * Returns zero on success
2293 * Transmit the set of pages to be discarded after precopy to the target
2294 * these are pages that:
2295 * a) Have been previously transmitted but are now dirty again
2296 * b) have never been transmitted; this ensures that
2297 * any pages on the destination that have been mapped by background
2298 * tasks get discarded (transparent huge pages are the specific concern)
2299 * Hopefully this is pretty sparse
2301 * @ms: current migration state
2303 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2305 RAMState *rs = ram_state;
2306 RAMBlock *block;
2307 int ret;
2309 rcu_read_lock();
2311 /* This should be our last sync, the src is now paused */
2312 migration_bitmap_sync(rs);
2314 /* Easiest way to make sure we don't resume in the middle of a host-page */
2315 rs->last_seen_block = NULL;
2316 rs->last_sent_block = NULL;
2317 rs->last_page = 0;
2319 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2320 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2321 unsigned long *bitmap = block->bmap;
2322 unsigned long *unsentmap = block->unsentmap;
2324 if (!unsentmap) {
2325 /* We don't have a safe way to resize the unsentmap, so
2326 * if the bitmap was resized it will be NULL at this
2327 * point.
2329 error_report("migration ram resized during precopy phase");
2330 rcu_read_unlock();
2331 return -EINVAL;
2333 /* Deal with TPS != HPS and huge pages */
2334 ret = postcopy_chunk_hostpages(ms, block);
2335 if (ret) {
2336 rcu_read_unlock();
2337 return ret;
2341 * Update the unsentmap to be unsentmap = unsentmap | dirty
2343 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2344 #ifdef DEBUG_POSTCOPY
2345 ram_debug_dump_bitmap(unsentmap, true, pages);
2346 #endif
2348 trace_ram_postcopy_send_discard_bitmap();
2350 ret = postcopy_each_ram_send_discard(ms);
2351 rcu_read_unlock();
2353 return ret;
2357 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2359 * Returns zero on success
2361 * @rbname: name of the RAMBlock of the request. NULL means the
2362 * same as the last one.
2363 * @start: byte offset within the RAMBlock
2364 * @length: length in bytes to discard
2366 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2368 int ret = -1;
2370 trace_ram_discard_range(rbname, start, length);
2372 rcu_read_lock();
2373 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2375 if (!rb) {
2376 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2377 goto err;
2380 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2381 length >> qemu_target_page_bits());
2382 ret = ram_block_discard_range(rb, start, length);
2384 err:
2385 rcu_read_unlock();
2387 return ret;
2391 * For every allocation, we try not to crash the VM if the
2392 * allocation fails.
2394 static int xbzrle_init(void)
2396 Error *local_err = NULL;
2398 if (!migrate_use_xbzrle()) {
2399 return 0;
2402 XBZRLE_cache_lock();
2404 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2405 if (!XBZRLE.zero_target_page) {
2406 error_report("%s: Error allocating zero page", __func__);
2407 goto err_out;
2410 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2411 TARGET_PAGE_SIZE, &local_err);
2412 if (!XBZRLE.cache) {
2413 error_report_err(local_err);
2414 goto free_zero_page;
2417 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2418 if (!XBZRLE.encoded_buf) {
2419 error_report("%s: Error allocating encoded_buf", __func__);
2420 goto free_cache;
2423 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2424 if (!XBZRLE.current_buf) {
2425 error_report("%s: Error allocating current_buf", __func__);
2426 goto free_encoded_buf;
2429 /* We are all good */
2430 XBZRLE_cache_unlock();
2431 return 0;
2433 free_encoded_buf:
2434 g_free(XBZRLE.encoded_buf);
2435 XBZRLE.encoded_buf = NULL;
2436 free_cache:
2437 cache_fini(XBZRLE.cache);
2438 XBZRLE.cache = NULL;
2439 free_zero_page:
2440 g_free(XBZRLE.zero_target_page);
2441 XBZRLE.zero_target_page = NULL;
2442 err_out:
2443 XBZRLE_cache_unlock();
2444 return -ENOMEM;
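/*
 * ram_state_init: allocate and initialize the global RAMState
 *
 * Returns 0 on success or -1 if the allocation fails. The dirty-page
 * count starts at the total number of RAM pages, since everything still
 * has to be sent.
 */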
2447 static int ram_state_init(RAMState **rsp)
2449 *rsp = g_try_new0(RAMState, 1);
2451 if (!*rsp) {
2452 error_report("%s: Init ramstate fail", __func__);
2453 return -1;
2456 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2457 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2458 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2461 * Count the total number of pages used by ram blocks not including any
2462 * gaps due to alignment or unplugs.
2464 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2466 ram_state_reset(*rsp);
2468 return 0;
2471 static void ram_list_init_bitmaps(void)
2473 RAMBlock *block;
2474 unsigned long pages;
2476 /* Skip setting bitmap if there is no RAM */
2477 if (ram_bytes_total()) {
2478 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2479 pages = block->max_length >> TARGET_PAGE_BITS;
2480 block->bmap = bitmap_new(pages);
2481 bitmap_set(block->bmap, 0, pages);
2482 if (migrate_postcopy_ram()) {
2483 block->unsentmap = bitmap_new(pages);
2484 bitmap_set(block->unsentmap, 0, pages);
2490 static void ram_init_bitmaps(RAMState *rs)
2492 /* For memory_global_dirty_log_start below. */
2493 qemu_mutex_lock_iothread();
2494 qemu_mutex_lock_ramlist();
2495 rcu_read_lock();
2497 ram_list_init_bitmaps();
2498 memory_global_dirty_log_start();
2499 migration_bitmap_sync(rs);
2501 rcu_read_unlock();
2502 qemu_mutex_unlock_ramlist();
2503 qemu_mutex_unlock_iothread();
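/*
 * ram_init_all: one-shot setup for an outgoing migration
 *
 * Initializes the RAMState, the XBZRLE cache (when the capability is
 * enabled) and the per-RAMBlock dirty bitmaps.
 */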
2506 static int ram_init_all(RAMState **rsp)
2508 if (ram_state_init(rsp)) {
2509 return -1;
2512 if (xbzrle_init()) {
2513 ram_state_cleanup(rsp);
2514 return -1;
2517 ram_init_bitmaps(*rsp);
2519 return 0;
2522 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2524 RAMBlock *block;
2525 uint64_t pages = 0;
2528 * Postcopy is not using xbzrle/compression, so no need for that.
2529 * Also, since the source is already halted, we don't need to care
2530 * about dirty page logging either.
2533 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2534 pages += bitmap_count_one(block->bmap,
2535 block->used_length >> TARGET_PAGE_BITS);
2538 /* This may not be aligned with current bitmaps. Recalculate. */
2539 rs->migration_dirty_pages = pages;
2541 rs->last_seen_block = NULL;
2542 rs->last_sent_block = NULL;
2543 rs->last_page = 0;
2544 rs->last_version = ram_list.version;
2546 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2547 * matter what we have sent.
2549 rs->ram_bulk_stage = false;
2551 /* Update RAMState cache of output QEMUFile */
2552 rs->f = out;
2554 trace_ram_state_resume_prepare(pages);
2558 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2559 * a long-running RCU critical section. When RCU reclaims in the code
2560 * start to become numerous it will be necessary to reduce the
2561 * granularity of these critical sections.
2565 * ram_save_setup: Setup RAM for migration
2567 * Returns zero to indicate success and negative for error
2569 * @f: QEMUFile where to send the data
2570 * @opaque: RAMState pointer
2572 static int ram_save_setup(QEMUFile *f, void *opaque)
2574 RAMState **rsp = opaque;
2575 RAMBlock *block;
2577 if (compress_threads_save_setup()) {
2578 return -1;
2581 /* migration has already set up the bitmap; reuse it. */
2582 if (!migration_in_colo_state()) {
2583 if (ram_init_all(rsp) != 0) {
2584 compress_threads_save_cleanup();
2585 return -1;
2588 (*rsp)->f = f;
2590 rcu_read_lock();
2592 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2594 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2595 qemu_put_byte(f, strlen(block->idstr));
2596 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2597 qemu_put_be64(f, block->used_length);
2598 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2599 qemu_put_be64(f, block->page_size);
2603 rcu_read_unlock();
2605 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2606 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2608 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2610 return 0;
2614 * ram_save_iterate: iterative stage for migration
2616 * Returns zero to indicate success and negative for error
2618 * @f: QEMUFile where to send the data
2619 * @opaque: RAMState pointer
2621 static int ram_save_iterate(QEMUFile *f, void *opaque)
2623 RAMState **temp = opaque;
2624 RAMState *rs = *temp;
2625 int ret;
2626 int i;
2627 int64_t t0;
2628 int done = 0;
2630 if (blk_mig_bulk_active()) {
2631 /* Avoid transferring ram during bulk phase of block migration as
2632 * the bulk phase will usually take a long time and transferring
2633 * ram updates during that time is pointless. */
2634 goto out;
2637 rcu_read_lock();
2638 if (ram_list.version != rs->last_version) {
2639 ram_state_reset(rs);
2642 /* Read version before ram_list.blocks */
2643 smp_rmb();
2645 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2647 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2648 i = 0;
2649 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2650 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2651 int pages;
2653 if (qemu_file_get_error(f)) {
2654 break;
2657 pages = ram_find_and_save_block(rs, false);
2658 /* no more pages to send */
2659 if (pages == 0) {
2660 done = 1;
2661 break;
2663 rs->iterations++;
2665 /* we want to check in the 1st loop, just in case it was the 1st time
2666 and we had to sync the dirty bitmap.
2667 qemu_clock_get_ns() is a bit expensive, so we only check once
2668 every few iterations
2670 if ((i & 63) == 0) {
2671 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2672 if (t1 > MAX_WAIT) {
2673 trace_ram_save_iterate_big_wait(t1, i);
2674 break;
2677 i++;
2679 flush_compressed_data(rs);
2680 rcu_read_unlock();
2683 * Must occur before EOS (or any QEMUFile operation)
2684 * because of RDMA protocol.
2686 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2688 out:
2689 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2690 ram_counters.transferred += 8;
2692 ret = qemu_file_get_error(f);
2693 if (ret < 0) {
2694 return ret;
2697 return done;
2701 * ram_save_complete: function called to send the remaining amount of ram
2703 * Returns zero to indicate success
2705 * Called with the iothread lock held
2707 * @f: QEMUFile where to send the data
2708 * @opaque: RAMState pointer
2710 static int ram_save_complete(QEMUFile *f, void *opaque)
2712 RAMState **temp = opaque;
2713 RAMState *rs = *temp;
2715 rcu_read_lock();
2717 if (!migration_in_postcopy()) {
2718 migration_bitmap_sync(rs);
2721 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2723 /* try transferring iterative blocks of memory */
2725 /* flush all remaining blocks regardless of rate limiting */
2726 while (true) {
2727 int pages;
2729 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2731 /* no more blocks to send */
2731 if (pages == 0) {
2732 break;
2736 flush_compressed_data(rs);
2737 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2739 rcu_read_unlock();
2741 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2743 return 0;
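/*
 * ram_save_pending: report how much dirty RAM is still to be sent
 *
 * If the estimate drops below max_size, the dirty bitmap is re-synced
 * under the iothread lock to refine it. The remainder is reported as
 * postcopiable when postcopy-ram is enabled, otherwise as precopy-only.
 */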
2746 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2747 uint64_t *res_precopy_only,
2748 uint64_t *res_compatible,
2749 uint64_t *res_postcopy_only)
2751 RAMState **temp = opaque;
2752 RAMState *rs = *temp;
2753 uint64_t remaining_size;
2755 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2757 if (!migration_in_postcopy() &&
2758 remaining_size < max_size) {
2759 qemu_mutex_lock_iothread();
2760 rcu_read_lock();
2761 migration_bitmap_sync(rs);
2762 rcu_read_unlock();
2763 qemu_mutex_unlock_iothread();
2764 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2767 if (migrate_postcopy_ram()) {
2768 /* We can do postcopy, and all the data is postcopiable */
2769 *res_compatible += remaining_size;
2770 } else {
2771 *res_precopy_only += remaining_size;
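/*
 * load_xbzrle: decode one XBZRLE-encoded page from the stream
 *
 * Returns 0 on success or -1 on error
 *
 * @f: QEMUFile to read from
 * @addr: guest address of the page
 * @host: host pointer of the page, updated in place
 */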
2775 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2777 unsigned int xh_len;
2778 int xh_flags;
2779 uint8_t *loaded_data;
2781 /* extract RLE header */
2782 xh_flags = qemu_get_byte(f);
2783 xh_len = qemu_get_be16(f);
2785 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2786 error_report("Failed to load XBZRLE page - wrong compression!");
2787 return -1;
2790 if (xh_len > TARGET_PAGE_SIZE) {
2791 error_report("Failed to load XBZRLE page - len overflow!");
2792 return -1;
2794 loaded_data = XBZRLE.decoded_buf;
2795 /* load data and decode */
2796 /* it can change loaded_data to point to an internal buffer */
2797 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2799 /* decode RLE */
2800 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2801 TARGET_PAGE_SIZE) == -1) {
2802 error_report("Failed to load XBZRLE page - decode error!");
2803 return -1;
2806 return 0;
2810 * ram_block_from_stream: read a RAMBlock id from the migration stream
2812 * Must be called from within an RCU critical section.
2814 * Returns a pointer from within the RCU-protected ram_list.
2816 * @f: QEMUFile where to read the data from
2817 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2819 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2821 static RAMBlock *block = NULL;
2822 char id[256];
2823 uint8_t len;
2825 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2826 if (!block) {
2827 error_report("Ack, bad migration stream!");
2828 return NULL;
2830 return block;
2833 len = qemu_get_byte(f);
2834 qemu_get_buffer(f, (uint8_t *)id, len);
2835 id[len] = 0;
2837 block = qemu_ram_block_by_name(id);
2838 if (!block) {
2839 error_report("Can't find block %s", id);
2840 return NULL;
2843 if (!qemu_ram_is_migratable(block)) {
2844 error_report("block %s should not be migrated !", id);
2845 return NULL;
2848 return block;
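/*
 * host_from_ram_block_offset: map an offset in a RAMBlock to a host pointer
 *
 * Returns NULL if the offset is outside the block.
 */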
2851 static inline void *host_from_ram_block_offset(RAMBlock *block,
2852 ram_addr_t offset)
2854 if (!offset_in_ramblock(block, offset)) {
2855 return NULL;
2858 return block->host + offset;
2862 * ram_handle_compressed: handle the zero page case
2864 * If a page (or a whole RDMA chunk) has been
2865 * determined to be zero, then zap it.
2867 * @host: host address for the zero page
2868 * @ch: the byte the page is filled with; only zero is supported
2869 * @size: size of the zero page
2871 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2873 if (ch != 0 || !is_zero_range(host, size)) {
2874 memset(host, ch, size);
2878 /* return the size after decompression, or a negative value on error */
2879 static int
2880 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2881 const uint8_t *source, size_t source_len)
2883 int err;
2885 err = inflateReset(stream);
2886 if (err != Z_OK) {
2887 return -1;
2890 stream->avail_in = source_len;
2891 stream->next_in = (uint8_t *)source;
2892 stream->avail_out = dest_len;
2893 stream->next_out = dest;
2895 err = inflate(stream, Z_NO_FLUSH);
2896 if (err != Z_STREAM_END) {
2897 return -1;
2900 return stream->total_out;
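/*
 * do_data_decompress: decompression worker thread
 *
 * Waits on param->cond for work, inflates the compressed buffer into the
 * destination page, then marks itself done and signals decomp_done_cond.
 * Decompression errors are propagated through qemu_file_set_error() when
 * decompress_error_check is enabled.
 */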
2903 static void *do_data_decompress(void *opaque)
2905 DecompressParam *param = opaque;
2906 unsigned long pagesize;
2907 uint8_t *des;
2908 int len, ret;
2910 qemu_mutex_lock(&param->mutex);
2911 while (!param->quit) {
2912 if (param->des) {
2913 des = param->des;
2914 len = param->len;
2915 param->des = 0;
2916 qemu_mutex_unlock(&param->mutex);
2918 pagesize = TARGET_PAGE_SIZE;
2920 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2921 param->compbuf, len);
2922 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2923 error_report("decompress data failed");
2924 qemu_file_set_error(decomp_file, ret);
2927 qemu_mutex_lock(&decomp_done_lock);
2928 param->done = true;
2929 qemu_cond_signal(&decomp_done_cond);
2930 qemu_mutex_unlock(&decomp_done_lock);
2932 qemu_mutex_lock(&param->mutex);
2933 } else {
2934 qemu_cond_wait(&param->cond, &param->mutex);
2937 qemu_mutex_unlock(&param->mutex);
2939 return NULL;
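/*
 * wait_for_decompress_done: wait for all decompression threads to finish
 * their outstanding pages, then return any error recorded on decomp_file.
 */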
2942 static int wait_for_decompress_done(void)
2944 int idx, thread_count;
2946 if (!migrate_use_compression()) {
2947 return 0;
2950 thread_count = migrate_decompress_threads();
2951 qemu_mutex_lock(&decomp_done_lock);
2952 for (idx = 0; idx < thread_count; idx++) {
2953 while (!decomp_param[idx].done) {
2954 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2957 qemu_mutex_unlock(&decomp_done_lock);
2958 return qemu_file_get_error(decomp_file);
2961 static void compress_threads_load_cleanup(void)
2963 int i, thread_count;
2965 if (!migrate_use_compression()) {
2966 return;
2968 thread_count = migrate_decompress_threads();
2969 for (i = 0; i < thread_count; i++) {
2971 * we use it as an indicator of whether the thread is
2972 * properly initialized or not
2974 if (!decomp_param[i].compbuf) {
2975 break;
2978 qemu_mutex_lock(&decomp_param[i].mutex);
2979 decomp_param[i].quit = true;
2980 qemu_cond_signal(&decomp_param[i].cond);
2981 qemu_mutex_unlock(&decomp_param[i].mutex);
2983 for (i = 0; i < thread_count; i++) {
2984 if (!decomp_param[i].compbuf) {
2985 break;
2988 qemu_thread_join(decompress_threads + i);
2989 qemu_mutex_destroy(&decomp_param[i].mutex);
2990 qemu_cond_destroy(&decomp_param[i].cond);
2991 inflateEnd(&decomp_param[i].stream);
2992 g_free(decomp_param[i].compbuf);
2993 decomp_param[i].compbuf = NULL;
2995 g_free(decompress_threads);
2996 g_free(decomp_param);
2997 decompress_threads = NULL;
2998 decomp_param = NULL;
2999 decomp_file = NULL;
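/*
 * compress_threads_load_setup: create the decompression threads
 *
 * Returns 0 on success or -1 if a zlib stream fails to initialize, in
 * which case whatever was already set up is torn down again.
 */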
3002 static int compress_threads_load_setup(QEMUFile *f)
3004 int i, thread_count;
3006 if (!migrate_use_compression()) {
3007 return 0;
3010 thread_count = migrate_decompress_threads();
3011 decompress_threads = g_new0(QemuThread, thread_count);
3012 decomp_param = g_new0(DecompressParam, thread_count);
3013 qemu_mutex_init(&decomp_done_lock);
3014 qemu_cond_init(&decomp_done_cond);
3015 decomp_file = f;
3016 for (i = 0; i < thread_count; i++) {
3017 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3018 goto exit;
3021 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3022 qemu_mutex_init(&decomp_param[i].mutex);
3023 qemu_cond_init(&decomp_param[i].cond);
3024 decomp_param[i].done = true;
3025 decomp_param[i].quit = false;
3026 qemu_thread_create(decompress_threads + i, "decompress",
3027 do_data_decompress, decomp_param + i,
3028 QEMU_THREAD_JOINABLE);
3030 return 0;
3031 exit:
3032 compress_threads_load_cleanup();
3033 return -1;
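/*
 * decompress_data_with_multi_threads: queue one compressed page
 *
 * Hands the page to an idle decompression thread, waiting on
 * decomp_done_cond if all of them are currently busy.
 */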
3036 static void decompress_data_with_multi_threads(QEMUFile *f,
3037 void *host, int len)
3039 int idx, thread_count;
3041 thread_count = migrate_decompress_threads();
3042 qemu_mutex_lock(&decomp_done_lock);
3043 while (true) {
3044 for (idx = 0; idx < thread_count; idx++) {
3045 if (decomp_param[idx].done) {
3046 decomp_param[idx].done = false;
3047 qemu_mutex_lock(&decomp_param[idx].mutex);
3048 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3049 decomp_param[idx].des = host;
3050 decomp_param[idx].len = len;
3051 qemu_cond_signal(&decomp_param[idx].cond);
3052 qemu_mutex_unlock(&decomp_param[idx].mutex);
3053 break;
3056 if (idx < thread_count) {
3057 break;
3058 } else {
3059 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3062 qemu_mutex_unlock(&decomp_done_lock);
3066 * ram_load_setup: Setup RAM for migration incoming side
3068 * Returns zero to indicate success and negative for error
3070 * @f: QEMUFile where to receive the data
3071 * @opaque: RAMState pointer
3073 static int ram_load_setup(QEMUFile *f, void *opaque)
3075 if (compress_threads_load_setup(f)) {
3076 return -1;
3079 xbzrle_load_setup();
3080 ramblock_recv_map_init();
3081 return 0;
3084 static int ram_load_cleanup(void *opaque)
3086 RAMBlock *rb;
3087 xbzrle_load_cleanup();
3088 compress_threads_load_cleanup();
3090 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3091 g_free(rb->receivedmap);
3092 rb->receivedmap = NULL;
3094 return 0;
3098 * ram_postcopy_incoming_init: allocate postcopy data structures
3100 * Returns 0 for success and negative if there was one error
3102 * @mis: current migration incoming state
3104 * Allocate data structures etc needed by incoming migration with
3105 * postcopy-ram. postcopy-ram's similarly named
3106 * postcopy_ram_incoming_init does the work.
3108 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3110 unsigned long ram_pages = last_ram_page();
3112 return postcopy_ram_incoming_init(mis, ram_pages);
3116 * ram_load_postcopy: load a page in postcopy case
3118 * Returns 0 for success or -errno in case of error
3120 * Called in postcopy mode by ram_load().
3121 * rcu_read_lock is taken prior to this being called.
3123 * @f: QEMUFile to receive the data from
3125 static int ram_load_postcopy(QEMUFile *f)
3127 int flags = 0, ret = 0;
3128 bool place_needed = false;
3129 bool matching_page_sizes = false;
3130 MigrationIncomingState *mis = migration_incoming_get_current();
3131 /* Temporary page that is later 'placed' */
3132 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3133 void *last_host = NULL;
3134 bool all_zero = false;
3136 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3137 ram_addr_t addr;
3138 void *host = NULL;
3139 void *page_buffer = NULL;
3140 void *place_source = NULL;
3141 RAMBlock *block = NULL;
3142 uint8_t ch;
3144 addr = qemu_get_be64(f);
3147 * If there is a QEMUFile error, we should stop here; "addr"
3148 * may be invalid
3150 ret = qemu_file_get_error(f);
3151 if (ret) {
3152 break;
3155 flags = addr & ~TARGET_PAGE_MASK;
3156 addr &= TARGET_PAGE_MASK;
3158 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3159 place_needed = false;
3160 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3161 block = ram_block_from_stream(f, flags);
3163 host = host_from_ram_block_offset(block, addr);
3164 if (!host) {
3165 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3166 ret = -EINVAL;
3167 break;
3169 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3171 * Postcopy requires that we place whole host pages atomically;
3172 * these may be huge pages for RAMBlocks that are backed by
3173 * hugetlbfs.
3174 * To make it atomic, the data is read into a temporary page
3175 * that's moved into place later.
3176 * The migration protocol uses, possibly smaller, target pages;
3177 * however, the source ensures it always sends all the components
3178 * of a host page in order.
3180 page_buffer = postcopy_host_page +
3181 ((uintptr_t)host & (block->page_size - 1));
3182 /* If all TP are zero then we can optimise the place */
3183 if (!((uintptr_t)host & (block->page_size - 1))) {
3184 all_zero = true;
3185 } else {
3186 /* not the 1st TP within the HP */
3187 if (host != (last_host + TARGET_PAGE_SIZE)) {
3188 error_report("Non-sequential target page %p/%p",
3189 host, last_host);
3190 ret = -EINVAL;
3191 break;
3197 * If it's the last part of a host page then we place the host
3198 * page
3200 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3201 (block->page_size - 1)) == 0;
3202 place_source = postcopy_host_page;
3204 last_host = host;
3206 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3207 case RAM_SAVE_FLAG_ZERO:
3208 ch = qemu_get_byte(f);
3209 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3210 if (ch) {
3211 all_zero = false;
3213 break;
3215 case RAM_SAVE_FLAG_PAGE:
3216 all_zero = false;
3217 if (!place_needed || !matching_page_sizes) {
3218 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3219 } else {
3220 /* Avoids the qemu_file copy during postcopy, which is
3221 * going to do a copy later; can only do it when we
3222 * do this read in one go (matching page sizes)
3224 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3225 TARGET_PAGE_SIZE);
3227 break;
3228 case RAM_SAVE_FLAG_EOS:
3229 /* normal exit */
3230 break;
3231 default:
3232 error_report("Unknown combination of migration flags: %#x"
3233 " (postcopy mode)", flags);
3234 ret = -EINVAL;
3235 break;
3238 /* Detect any possible file errors */
3239 if (!ret && qemu_file_get_error(f)) {
3240 ret = qemu_file_get_error(f);
3243 if (!ret && place_needed) {
3244 /* This gets called at the last target page in the host page */
3245 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3247 if (all_zero) {
3248 ret = postcopy_place_page_zero(mis, place_dest,
3249 block);
3250 } else {
3251 ret = postcopy_place_page(mis, place_dest,
3252 place_source, block);
3257 return ret;
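/* Helpers to query the incoming postcopy state machine */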
3260 static bool postcopy_is_advised(void)
3262 PostcopyState ps = postcopy_state_get();
3263 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3266 static bool postcopy_is_running(void)
3268 PostcopyState ps = postcopy_state_get();
3269 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
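/*
 * ram_load: load the "ram" section of the migration stream
 *
 * Returns 0 on success or a negative error code
 *
 * Dispatches to ram_load_postcopy() when the destination is already
 * running in postcopy mode; otherwise handles the precopy stream flag
 * by flag. Only stream version 4 is accepted.
 */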
3272 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3274 int flags = 0, ret = 0, invalid_flags = 0;
3275 static uint64_t seq_iter;
3276 int len = 0;
3278 * If the system is running in postcopy mode, page inserts into host memory
3279 * must be atomic
3281 bool postcopy_running = postcopy_is_running();
3282 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3283 bool postcopy_advised = postcopy_is_advised();
3285 seq_iter++;
3287 if (version_id != 4) {
3288 ret = -EINVAL;
3291 if (!migrate_use_compression()) {
3292 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3294 /* This RCU critical section can be very long running.
3295 * When RCU reclaims in the code start to become numerous,
3296 * it will be necessary to reduce the granularity of this
3297 * critical section.
3299 rcu_read_lock();
3301 if (postcopy_running) {
3302 ret = ram_load_postcopy(f);
3305 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3306 ram_addr_t addr, total_ram_bytes;
3307 void *host = NULL;
3308 uint8_t ch;
3310 addr = qemu_get_be64(f);
3311 flags = addr & ~TARGET_PAGE_MASK;
3312 addr &= TARGET_PAGE_MASK;
3314 if (flags & invalid_flags) {
3315 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3316 error_report("Received an unexpected compressed page");
3319 ret = -EINVAL;
3320 break;
3323 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3324 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3325 RAMBlock *block = ram_block_from_stream(f, flags);
3327 host = host_from_ram_block_offset(block, addr);
3328 if (!host) {
3329 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3330 ret = -EINVAL;
3331 break;
3333 ramblock_recv_bitmap_set(block, host);
3334 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3337 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3338 case RAM_SAVE_FLAG_MEM_SIZE:
3339 /* Synchronize RAM block list */
3340 total_ram_bytes = addr;
3341 while (!ret && total_ram_bytes) {
3342 RAMBlock *block;
3343 char id[256];
3344 ram_addr_t length;
3346 len = qemu_get_byte(f);
3347 qemu_get_buffer(f, (uint8_t *)id, len);
3348 id[len] = 0;
3349 length = qemu_get_be64(f);
3351 block = qemu_ram_block_by_name(id);
3352 if (block && !qemu_ram_is_migratable(block)) {
3353 error_report("block %s should not be migrated !", id);
3354 ret = -EINVAL;
3355 } else if (block) {
3356 if (length != block->used_length) {
3357 Error *local_err = NULL;
3359 ret = qemu_ram_resize(block, length,
3360 &local_err);
3361 if (local_err) {
3362 error_report_err(local_err);
3365 /* For postcopy we need to check that hugepage sizes match */
3366 if (postcopy_advised &&
3367 block->page_size != qemu_host_page_size) {
3368 uint64_t remote_page_size = qemu_get_be64(f);
3369 if (remote_page_size != block->page_size) {
3370 error_report("Mismatched RAM page size %s "
3371 "(local) %zd != %" PRId64,
3372 id, block->page_size,
3373 remote_page_size);
3374 ret = -EINVAL;
3377 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3378 block->idstr);
3379 } else {
3380 error_report("Unknown ramblock \"%s\", cannot "
3381 "accept migration", id);
3382 ret = -EINVAL;
3385 total_ram_bytes -= length;
3387 break;
3389 case RAM_SAVE_FLAG_ZERO:
3390 ch = qemu_get_byte(f);
3391 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3392 break;
3394 case RAM_SAVE_FLAG_PAGE:
3395 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3396 break;
3398 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3399 len = qemu_get_be32(f);
3400 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3401 error_report("Invalid compressed data length: %d", len);
3402 ret = -EINVAL;
3403 break;
3405 decompress_data_with_multi_threads(f, host, len);
3406 break;
3408 case RAM_SAVE_FLAG_XBZRLE:
3409 if (load_xbzrle(f, addr, host) < 0) {
3410 error_report("Failed to decompress XBZRLE page at "
3411 RAM_ADDR_FMT, addr);
3412 ret = -EINVAL;
3413 break;
3415 break;
3416 case RAM_SAVE_FLAG_EOS:
3417 /* normal exit */
3418 break;
3419 default:
3420 if (flags & RAM_SAVE_FLAG_HOOK) {
3421 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3422 } else {
3423 error_report("Unknown combination of migration flags: %#x",
3424 flags);
3425 ret = -EINVAL;
3428 if (!ret) {
3429 ret = qemu_file_get_error(f);
3433 ret |= wait_for_decompress_done();
3434 rcu_read_unlock();
3435 trace_ram_load_complete(ret, seq_iter);
3436 return ret;
3439 static bool ram_has_postcopy(void *opaque)
3441 return migrate_postcopy_ram();
3444 /* Sync all the dirty bitmaps with the destination VM. */
3445 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3447 RAMBlock *block;
3448 QEMUFile *file = s->to_dst_file;
3449 int ramblock_count = 0;
3451 trace_ram_dirty_bitmap_sync_start();
3453 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3454 qemu_savevm_send_recv_bitmap(file, block->idstr);
3455 trace_ram_dirty_bitmap_request(block->idstr);
3456 ramblock_count++;
3459 trace_ram_dirty_bitmap_sync_wait();
3461 /* Wait until all the ramblocks' dirty bitmaps are synced */
3462 while (ramblock_count--) {
3463 qemu_sem_wait(&s->rp_state.rp_sem);
3466 trace_ram_dirty_bitmap_sync_complete();
3468 return 0;
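/*
 * Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose bitmap it has requested.
 */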
3471 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3473 qemu_sem_post(&s->rp_state.rp_sem);
3477 * Read the received bitmap and invert it to form the initial dirty bitmap.
3478 * This is only used when a postcopy migration is paused and wants
3479 * to resume from a middle point.
3481 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3483 int ret = -EINVAL;
3484 QEMUFile *file = s->rp_state.from_dst_file;
3485 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3486 uint64_t local_size = nbits / 8;
3487 uint64_t size, end_mark;
3489 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3491 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3492 error_report("%s: incorrect state %s", __func__,
3493 MigrationStatus_str(s->state));
3494 return -EINVAL;
3498 * Note: see comments in ramblock_recv_bitmap_send() on why we
3499 * need the endianness conversion and the padding.
3501 local_size = ROUND_UP(local_size, 8);
3503 /* Add padding */
3504 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3506 size = qemu_get_be64(file);
3508 /* The size of the bitmap should match our ramblock */
3509 if (size != local_size) {
3510 error_report("%s: ramblock '%s' bitmap size mismatch "
3511 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3512 block->idstr, size, local_size);
3513 ret = -EINVAL;
3514 goto out;
3517 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3518 end_mark = qemu_get_be64(file);
3520 ret = qemu_file_get_error(file);
3521 if (ret || size != local_size) {
3522 error_report("%s: read bitmap failed for ramblock '%s': %d"
3523 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3524 __func__, block->idstr, ret, local_size, size);
3525 ret = -EIO;
3526 goto out;
3529 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3530 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3531 __func__, block->idstr, end_mark);
3532 ret = -EINVAL;
3533 goto out;
3537 * Endianness conversion. We are in postcopy (though paused).
3538 * The dirty bitmap won't change, so we can modify it directly.
3540 bitmap_from_le(block->bmap, le_bitmap, nbits);
3543 * What we received is the "received bitmap". Invert it to form the
3544 * initial dirty bitmap for this ramblock.
3546 bitmap_complement(block->bmap, block->bmap, nbits);
3548 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3551 * We succeeded in syncing the bitmap for the current ramblock. If this
3552 * is the last one to sync, we need to notify the main send thread.
3554 ram_dirty_bitmap_reload_notify(s);
3556 ret = 0;
3557 out:
3558 g_free(le_bitmap);
3559 return ret;
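/*
 * ram_resume_prepare: prepare to resume a paused postcopy migration
 *
 * Re-syncs the dirty bitmaps from the destination and then re-prepares
 * the RAMState before the send thread resumes.
 */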
3562 static int ram_resume_prepare(MigrationState *s, void *opaque)
3564 RAMState *rs = *(RAMState **)opaque;
3565 int ret;
3567 ret = ram_dirty_bitmap_sync_all(s, rs);
3568 if (ret) {
3569 return ret;
3572 ram_state_resume_prepare(rs, s->to_dst_file);
3574 return 0;
3577 static SaveVMHandlers savevm_ram_handlers = {
3578 .save_setup = ram_save_setup,
3579 .save_live_iterate = ram_save_iterate,
3580 .save_live_complete_postcopy = ram_save_complete,
3581 .save_live_complete_precopy = ram_save_complete,
3582 .has_postcopy = ram_has_postcopy,
3583 .save_live_pending = ram_save_pending,
3584 .load_state = ram_load,
3585 .save_cleanup = ram_save_cleanup,
3586 .load_setup = ram_load_setup,
3587 .load_cleanup = ram_load_cleanup,
3588 .resume_prepare = ram_resume_prepare,
3591 void ram_mig_init(void)
3593 qemu_mutex_init(&XBZRLE.lock);
3594 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);