migration: new message MIG_RP_MSG_RECV_BITMAP
[qemu/ar7.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "socket.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "migration/block.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
58 /***********************************************************/
59 /* ram save/restore */
61 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
62 * worked for pages that were filled with the same char. We switched
63 * it to only search for the zero value. And to avoid confusion with
64 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
67 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
68 #define RAM_SAVE_FLAG_ZERO 0x02
69 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
70 #define RAM_SAVE_FLAG_PAGE 0x08
71 #define RAM_SAVE_FLAG_EOS 0x10
72 #define RAM_SAVE_FLAG_CONTINUE 0x20
73 #define RAM_SAVE_FLAG_XBZRLE 0x40
74 /* 0x80 is reserved in migration.h; start with 0x100 next */
75 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 return buffer_is_zero(p, size);
82 XBZRLECacheStats xbzrle_counters;
84 /* struct contains XBZRLE cache and a static page
85 used by the compression */
86 static struct {
87 /* buffer used for XBZRLE encoding */
88 uint8_t *encoded_buf;
89 /* buffer for storing page content */
90 uint8_t *current_buf;
91 /* Cache for XBZRLE, Protected by lock. */
92 PageCache *cache;
93 QemuMutex lock;
94 /* it will store a page full of zeros */
95 uint8_t *zero_target_page;
96 /* buffer used for XBZRLE decoding */
97 uint8_t *decoded_buf;
98 } XBZRLE;
100 static void XBZRLE_cache_lock(void)
102 if (migrate_use_xbzrle())
103 qemu_mutex_lock(&XBZRLE.lock);
106 static void XBZRLE_cache_unlock(void)
108 if (migrate_use_xbzrle())
109 qemu_mutex_unlock(&XBZRLE.lock);
113 * xbzrle_cache_resize: resize the xbzrle cache
115 * This function is called from qmp_migrate_set_cache_size in the main
116 * thread, possibly while a migration is in progress. A running
117 * migration may be using the cache and might finish during this call,
118 * hence changes to the cache are protected by XBZRLE.lock.
120 * Returns 0 for success or -1 for error
122 * @new_size: new cache size
123 * @errp: set *errp if the check failed, with reason
125 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 PageCache *new_cache;
128 int64_t ret = 0;
130 /* Check for truncation */
131 if (new_size != (size_t)new_size) {
132 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
133 "exceeding address space");
134 return -1;
137 if (new_size == migrate_xbzrle_cache_size()) {
138 /* nothing to do */
139 return 0;
142 XBZRLE_cache_lock();
144 if (XBZRLE.cache != NULL) {
145 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
146 if (!new_cache) {
147 ret = -1;
148 goto out;
151 cache_fini(XBZRLE.cache);
152 XBZRLE.cache = new_cache;
154 out:
155 XBZRLE_cache_unlock();
156 return ret;
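/*
 * Illustrative sketch (not part of the original file): roughly how a QMP
 * handler such as qmp_migrate_set_cache_size might drive the resize:
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 *
 * cache_init() performs its own validation of the requested size and sets
 * the error accordingly, so the caller only needs to check the -1 return.
 */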
159 static void ramblock_recv_map_init(void)
161 RAMBlock *rb;
163 RAMBLOCK_FOREACH(rb) {
164 assert(!rb->receivedmap);
165 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
169 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
171 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
172 rb->receivedmap);
175 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
177 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
180 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
182 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
185 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
186 size_t nr)
188 bitmap_set_atomic(rb->receivedmap,
189 ramblock_recv_bitmap_offset(host_addr, rb),
190 nr);
193 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
196 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
198 * Returns the number of bytes sent (>0) on success, or <0 on error.
200 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
201 const char *block_name)
203 RAMBlock *block = qemu_ram_block_by_name(block_name);
204 unsigned long *le_bitmap, nbits;
205 uint64_t size;
207 if (!block) {
208 error_report("%s: invalid block name: %s", __func__, block_name);
209 return -1;
212 nbits = block->used_length >> TARGET_PAGE_BITS;
215 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
216 * machines we may need 4 more bytes for padding (see below
217 * comment). So extend it a bit beforehand.
219 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
222 * Always use little endian when sending the bitmap. This is
223 * required when source and destination VMs are not using the
224 * same endianness. (Note: big endian won't work.)
226 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
228 /* Size of the bitmap, in bytes */
229 size = nbits / 8;
232 * size is always aligned to 8 bytes for 64bit machines, but it
233 * may not be true for 32bit machines. We need this padding to
234 * make sure the migration can survive even between 32bit and
235 * 64bit machines.
237 size = ROUND_UP(size, 8);
239 qemu_put_be64(file, size);
240 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
242 * Mark as an end, in case the middle part is screwed up due to
243 * some "mysterious" reason.
245 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
246 qemu_fflush(file);
248 g_free(le_bitmap);
250 if (qemu_file_get_error(file)) {
251 return qemu_file_get_error(file);
254 return size + sizeof(size);
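/*
 * Wire layout produced above, one message per RAMBlock:
 *
 *     be64 size | size bytes of little-endian bitmap | be64 ending marker
 *
 * A rough sketch (not part of this file) of how the receiving end of this
 * message might parse it, using the qemu-file read counterparts of the
 * calls above:
 *
 *     uint64_t size = qemu_get_be64(file);
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         // stream corrupted, fail the bitmap reload
 *     }
 *     // then convert le_bitmap back to host layout and apply it
 *
 * Note that the returned byte count covers the size field and the bitmap,
 * but not the trailing ending marker.
 */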
258 * An outstanding page request, on the source, having been received
259 * and queued
261 struct RAMSrcPageRequest {
262 RAMBlock *rb;
263 hwaddr offset;
264 hwaddr len;
266 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
269 /* State of RAM for migration */
270 struct RAMState {
271 /* QEMUFile used for this migration */
272 QEMUFile *f;
273 /* Last block that we have visited searching for dirty pages */
274 RAMBlock *last_seen_block;
275 /* Last block from where we have sent data */
276 RAMBlock *last_sent_block;
277 /* Last dirty target page we have sent */
278 ram_addr_t last_page;
279 /* last ram version we have seen */
280 uint32_t last_version;
281 /* We are in the first round */
282 bool ram_bulk_stage;
283 /* How many times we have dirty too many pages */
284 int dirty_rate_high_cnt;
285 /* these variables are used for bitmap sync */
286 /* last time we did a full bitmap_sync */
287 int64_t time_last_bitmap_sync;
288 /* bytes transferred at start_time */
289 uint64_t bytes_xfer_prev;
290 /* number of dirty pages since start_time */
291 uint64_t num_dirty_pages_period;
292 /* xbzrle misses since the beginning of the period */
293 uint64_t xbzrle_cache_miss_prev;
294 /* number of iterations at the beginning of period */
295 uint64_t iterations_prev;
296 /* Iterations since start */
297 uint64_t iterations;
298 /* number of dirty bits in the bitmap */
299 uint64_t migration_dirty_pages;
300 /* protects modification of the bitmap */
301 QemuMutex bitmap_mutex;
302 /* The RAMBlock used in the last src_page_requests */
303 RAMBlock *last_req_rb;
304 /* Queue of outstanding page requests from the destination */
305 QemuMutex src_page_req_mutex;
306 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
308 typedef struct RAMState RAMState;
310 static RAMState *ram_state;
312 uint64_t ram_bytes_remaining(void)
314 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
318 MigrationStats ram_counters;
320 /* used by the search for pages to send */
321 struct PageSearchStatus {
322 /* Current block being searched */
323 RAMBlock *block;
324 /* Current page to search from */
325 unsigned long page;
326 /* Set once we wrap around */
327 bool complete_round;
329 typedef struct PageSearchStatus PageSearchStatus;
331 struct CompressParam {
332 bool done;
333 bool quit;
334 QEMUFile *file;
335 QemuMutex mutex;
336 QemuCond cond;
337 RAMBlock *block;
338 ram_addr_t offset;
340 /* internally used fields */
341 z_stream stream;
342 uint8_t *originbuf;
344 typedef struct CompressParam CompressParam;
346 struct DecompressParam {
347 bool done;
348 bool quit;
349 QemuMutex mutex;
350 QemuCond cond;
351 void *des;
352 uint8_t *compbuf;
353 int len;
354 z_stream stream;
356 typedef struct DecompressParam DecompressParam;
358 static CompressParam *comp_param;
359 static QemuThread *compress_threads;
360 /* comp_done_cond is used to wake up the migration thread when
361 * one of the compression threads has finished the compression.
362 * comp_done_lock is used together with comp_done_cond.
364 static QemuMutex comp_done_lock;
365 static QemuCond comp_done_cond;
366 /* The empty QEMUFileOps will be used by file in CompressParam */
367 static const QEMUFileOps empty_ops = { };
369 static QEMUFile *decomp_file;
370 static DecompressParam *decomp_param;
371 static QemuThread *decompress_threads;
372 static QemuMutex decomp_done_lock;
373 static QemuCond decomp_done_cond;
375 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
376 ram_addr_t offset, uint8_t *source_buf);
378 static void *do_data_compress(void *opaque)
380 CompressParam *param = opaque;
381 RAMBlock *block;
382 ram_addr_t offset;
384 qemu_mutex_lock(&param->mutex);
385 while (!param->quit) {
386 if (param->block) {
387 block = param->block;
388 offset = param->offset;
389 param->block = NULL;
390 qemu_mutex_unlock(&param->mutex);
392 do_compress_ram_page(param->file, &param->stream, block, offset,
393 param->originbuf);
395 qemu_mutex_lock(&comp_done_lock);
396 param->done = true;
397 qemu_cond_signal(&comp_done_cond);
398 qemu_mutex_unlock(&comp_done_lock);
400 qemu_mutex_lock(&param->mutex);
401 } else {
402 qemu_cond_wait(&param->cond, &param->mutex);
405 qemu_mutex_unlock(&param->mutex);
407 return NULL;
410 static inline void terminate_compression_threads(void)
412 int idx, thread_count;
414 thread_count = migrate_compress_threads();
416 for (idx = 0; idx < thread_count; idx++) {
417 qemu_mutex_lock(&comp_param[idx].mutex);
418 comp_param[idx].quit = true;
419 qemu_cond_signal(&comp_param[idx].cond);
420 qemu_mutex_unlock(&comp_param[idx].mutex);
424 static void compress_threads_save_cleanup(void)
426 int i, thread_count;
428 if (!migrate_use_compression()) {
429 return;
431 terminate_compression_threads();
432 thread_count = migrate_compress_threads();
433 for (i = 0; i < thread_count; i++) {
435 * we use it as an indicator of whether the thread has been
436 * properly initialized or not
438 if (!comp_param[i].file) {
439 break;
441 qemu_thread_join(compress_threads + i);
442 qemu_mutex_destroy(&comp_param[i].mutex);
443 qemu_cond_destroy(&comp_param[i].cond);
444 deflateEnd(&comp_param[i].stream);
445 g_free(comp_param[i].originbuf);
446 qemu_fclose(comp_param[i].file);
447 comp_param[i].file = NULL;
449 qemu_mutex_destroy(&comp_done_lock);
450 qemu_cond_destroy(&comp_done_cond);
451 g_free(compress_threads);
452 g_free(comp_param);
453 compress_threads = NULL;
454 comp_param = NULL;
457 static int compress_threads_save_setup(void)
459 int i, thread_count;
461 if (!migrate_use_compression()) {
462 return 0;
464 thread_count = migrate_compress_threads();
465 compress_threads = g_new0(QemuThread, thread_count);
466 comp_param = g_new0(CompressParam, thread_count);
467 qemu_cond_init(&comp_done_cond);
468 qemu_mutex_init(&comp_done_lock);
469 for (i = 0; i < thread_count; i++) {
470 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
471 if (!comp_param[i].originbuf) {
472 goto exit;
475 if (deflateInit(&comp_param[i].stream,
476 migrate_compress_level()) != Z_OK) {
477 g_free(comp_param[i].originbuf);
478 goto exit;
481 /* comp_param[i].file is just used as a dummy buffer to save data,
482 * set its ops to empty.
484 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
485 comp_param[i].done = true;
486 comp_param[i].quit = false;
487 qemu_mutex_init(&comp_param[i].mutex);
488 qemu_cond_init(&comp_param[i].cond);
489 qemu_thread_create(compress_threads + i, "compress",
490 do_data_compress, comp_param + i,
491 QEMU_THREAD_JOINABLE);
493 return 0;
495 exit:
496 compress_threads_save_cleanup();
497 return -1;
500 /* Multiple fd's */
502 #define MULTIFD_MAGIC 0x11223344U
503 #define MULTIFD_VERSION 1
505 typedef struct {
506 uint32_t magic;
507 uint32_t version;
508 unsigned char uuid[16]; /* QemuUUID */
509 uint8_t id;
510 } __attribute__((packed)) MultiFDInit_t;
512 typedef struct {
513 /* these fields are not changed once the thread is created */
514 /* channel number */
515 uint8_t id;
516 /* channel thread name */
517 char *name;
518 /* channel thread id */
519 QemuThread thread;
520 /* communication channel */
521 QIOChannel *c;
522 /* sem where to wait for more work */
523 QemuSemaphore sem;
524 /* this mutex protects the following parameters */
525 QemuMutex mutex;
526 /* is this channel thread running */
527 bool running;
528 /* should this thread finish */
529 bool quit;
530 } MultiFDSendParams;
532 typedef struct {
533 /* these fields are not changed once the thread is created */
534 /* channel number */
535 uint8_t id;
536 /* channel thread name */
537 char *name;
538 /* channel thread id */
539 QemuThread thread;
540 /* communication channel */
541 QIOChannel *c;
542 /* sem where to wait for more work */
543 QemuSemaphore sem;
544 /* this mutex protects the following parameters */
545 QemuMutex mutex;
546 /* is this channel thread running */
547 bool running;
548 /* should this thread finish */
549 bool quit;
550 } MultiFDRecvParams;
552 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
554 MultiFDInit_t msg;
555 int ret;
557 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
558 msg.version = cpu_to_be32(MULTIFD_VERSION);
559 msg.id = p->id;
560 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
562 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
563 if (ret != 0) {
564 return -1;
566 return 0;
569 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
571 MultiFDInit_t msg;
572 int ret;
574 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
575 if (ret != 0) {
576 return -1;
579 be32_to_cpus(&msg.magic);
580 be32_to_cpus(&msg.version);
582 if (msg.magic != MULTIFD_MAGIC) {
583 error_setg(errp, "multifd: received packet magic %x "
584 "expected %x", msg.magic, MULTIFD_MAGIC);
585 return -1;
588 if (msg.version != MULTIFD_VERSION) {
589 error_setg(errp, "multifd: received packet version %d "
590 "expected %d", msg.version, MULTIFD_VERSION);
591 return -1;
594 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
595 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
596 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
598 error_setg(errp, "multifd: received uuid '%s' and expected "
599 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
600 g_free(uuid);
601 g_free(msg_uuid);
602 return -1;
605 if (msg.id > migrate_multifd_channels()) {
606 error_setg(errp, "multifd: received channel version %d "
607 "expected %d", msg.version, MULTIFD_VERSION);
608 return -1;
611 return msg.id;
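/*
 * For reference, the initial handshake packet exchanged above is the
 * packed MultiFDInit_t, i.e. 25 bytes on the wire:
 *
 *     offset 0:  magic    (be32, MULTIFD_MAGIC 0x11223344)
 *     offset 4:  version  (be32, MULTIFD_VERSION 1)
 *     offset 8:  uuid     (16 raw bytes of QemuUUID)
 *     offset 24: id       (uint8_t channel number)
 *
 * The UUID check rejects channels that belong to a different QEMU
 * instance, and the id is used to index multifd_recv_state->params[].
 */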
614 struct {
615 MultiFDSendParams *params;
616 /* number of created threads */
617 int count;
618 } *multifd_send_state;
620 static void multifd_send_terminate_threads(Error *err)
622 int i;
624 if (err) {
625 MigrationState *s = migrate_get_current();
626 migrate_set_error(s, err);
627 if (s->state == MIGRATION_STATUS_SETUP ||
628 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
629 s->state == MIGRATION_STATUS_DEVICE ||
630 s->state == MIGRATION_STATUS_ACTIVE) {
631 migrate_set_state(&s->state, s->state,
632 MIGRATION_STATUS_FAILED);
636 for (i = 0; i < migrate_multifd_channels(); i++) {
637 MultiFDSendParams *p = &multifd_send_state->params[i];
639 qemu_mutex_lock(&p->mutex);
640 p->quit = true;
641 qemu_sem_post(&p->sem);
642 qemu_mutex_unlock(&p->mutex);
646 int multifd_save_cleanup(Error **errp)
648 int i;
649 int ret = 0;
651 if (!migrate_use_multifd()) {
652 return 0;
654 multifd_send_terminate_threads(NULL);
655 for (i = 0; i < migrate_multifd_channels(); i++) {
656 MultiFDSendParams *p = &multifd_send_state->params[i];
658 if (p->running) {
659 qemu_thread_join(&p->thread);
661 socket_send_channel_destroy(p->c);
662 p->c = NULL;
663 qemu_mutex_destroy(&p->mutex);
664 qemu_sem_destroy(&p->sem);
665 g_free(p->name);
666 p->name = NULL;
668 g_free(multifd_send_state->params);
669 multifd_send_state->params = NULL;
670 g_free(multifd_send_state);
671 multifd_send_state = NULL;
672 return ret;
675 static void *multifd_send_thread(void *opaque)
677 MultiFDSendParams *p = opaque;
678 Error *local_err = NULL;
680 if (multifd_send_initial_packet(p, &local_err) < 0) {
681 goto out;
684 while (true) {
685 qemu_mutex_lock(&p->mutex);
686 if (p->quit) {
687 qemu_mutex_unlock(&p->mutex);
688 break;
690 qemu_mutex_unlock(&p->mutex);
691 qemu_sem_wait(&p->sem);
694 out:
695 if (local_err) {
696 multifd_send_terminate_threads(local_err);
699 qemu_mutex_lock(&p->mutex);
700 p->running = false;
701 qemu_mutex_unlock(&p->mutex);
703 return NULL;
706 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
708 MultiFDSendParams *p = opaque;
709 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
710 Error *local_err = NULL;
712 if (qio_task_propagate_error(task, &local_err)) {
713 if (multifd_save_cleanup(&local_err) != 0) {
714 migrate_set_error(migrate_get_current(), local_err);
716 } else {
717 p->c = QIO_CHANNEL(sioc);
718 qio_channel_set_delay(p->c, false);
719 p->running = true;
720 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
721 QEMU_THREAD_JOINABLE);
723 atomic_inc(&multifd_send_state->count);
727 int multifd_save_setup(void)
729 int thread_count;
730 uint8_t i;
732 if (!migrate_use_multifd()) {
733 return 0;
735 thread_count = migrate_multifd_channels();
736 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
737 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
738 atomic_set(&multifd_send_state->count, 0);
739 for (i = 0; i < thread_count; i++) {
740 MultiFDSendParams *p = &multifd_send_state->params[i];
742 qemu_mutex_init(&p->mutex);
743 qemu_sem_init(&p->sem, 0);
744 p->quit = false;
745 p->id = i;
746 p->name = g_strdup_printf("multifdsend_%d", i);
747 socket_send_channel_create(multifd_new_send_channel_async, p);
749 return 0;
752 struct {
753 MultiFDRecvParams *params;
754 /* number of created threads */
755 int count;
756 } *multifd_recv_state;
758 static void multifd_recv_terminate_threads(Error *err)
760 int i;
762 if (err) {
763 MigrationState *s = migrate_get_current();
764 migrate_set_error(s, err);
765 if (s->state == MIGRATION_STATUS_SETUP ||
766 s->state == MIGRATION_STATUS_ACTIVE) {
767 migrate_set_state(&s->state, s->state,
768 MIGRATION_STATUS_FAILED);
772 for (i = 0; i < migrate_multifd_channels(); i++) {
773 MultiFDRecvParams *p = &multifd_recv_state->params[i];
775 qemu_mutex_lock(&p->mutex);
776 p->quit = true;
777 qemu_sem_post(&p->sem);
778 qemu_mutex_unlock(&p->mutex);
782 int multifd_load_cleanup(Error **errp)
784 int i;
785 int ret = 0;
787 if (!migrate_use_multifd()) {
788 return 0;
790 multifd_recv_terminate_threads(NULL);
791 for (i = 0; i < migrate_multifd_channels(); i++) {
792 MultiFDRecvParams *p = &multifd_recv_state->params[i];
794 if (p->running) {
795 qemu_thread_join(&p->thread);
797 object_unref(OBJECT(p->c));
798 p->c = NULL;
799 qemu_mutex_destroy(&p->mutex);
800 qemu_sem_destroy(&p->sem);
801 g_free(p->name);
802 p->name = NULL;
804 g_free(multifd_recv_state->params);
805 multifd_recv_state->params = NULL;
806 g_free(multifd_recv_state);
807 multifd_recv_state = NULL;
809 return ret;
812 static void *multifd_recv_thread(void *opaque)
814 MultiFDRecvParams *p = opaque;
816 while (true) {
817 qemu_mutex_lock(&p->mutex);
818 if (p->quit) {
819 qemu_mutex_unlock(&p->mutex);
820 break;
822 qemu_mutex_unlock(&p->mutex);
823 qemu_sem_wait(&p->sem);
826 qemu_mutex_lock(&p->mutex);
827 p->running = false;
828 qemu_mutex_unlock(&p->mutex);
830 return NULL;
833 int multifd_load_setup(void)
835 int thread_count;
836 uint8_t i;
838 if (!migrate_use_multifd()) {
839 return 0;
841 thread_count = migrate_multifd_channels();
842 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
843 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
844 atomic_set(&multifd_recv_state->count, 0);
845 for (i = 0; i < thread_count; i++) {
846 MultiFDRecvParams *p = &multifd_recv_state->params[i];
848 qemu_mutex_init(&p->mutex);
849 qemu_sem_init(&p->sem, 0);
850 p->quit = false;
851 p->id = i;
852 p->name = g_strdup_printf("multifdrecv_%d", i);
854 return 0;
857 bool multifd_recv_all_channels_created(void)
859 int thread_count = migrate_multifd_channels();
861 if (!migrate_use_multifd()) {
862 return true;
865 return thread_count == atomic_read(&multifd_recv_state->count);
868 void multifd_recv_new_channel(QIOChannel *ioc)
870 MultiFDRecvParams *p;
871 Error *local_err = NULL;
872 int id;
874 id = multifd_recv_initial_packet(ioc, &local_err);
875 if (id < 0) {
876 multifd_recv_terminate_threads(local_err);
877 return;
880 p = &multifd_recv_state->params[id];
881 if (p->c != NULL) {
882 error_setg(&local_err, "multifd: received id '%d' already setup'",
883 id);
884 multifd_recv_terminate_threads(local_err);
885 return;
887 p->c = ioc;
888 object_ref(OBJECT(ioc));
890 p->running = true;
891 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
892 QEMU_THREAD_JOINABLE);
893 atomic_inc(&multifd_recv_state->count);
894 if (multifd_recv_state->count == migrate_multifd_channels()) {
895 migration_incoming_process();
900 * save_page_header: write page header to wire
902 * If this is a new block (different from the last sent one), it also writes the block identification
904 * Returns the number of bytes written
906 * @f: QEMUFile where to send the data
907 * @block: block that contains the page we want to send
908 * @offset: offset inside the block for the page;
909 * the lower bits contain flags
911 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
912 ram_addr_t offset)
914 size_t size, len;
916 if (block == rs->last_sent_block) {
917 offset |= RAM_SAVE_FLAG_CONTINUE;
919 qemu_put_be64(f, offset);
920 size = 8;
922 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
923 len = strlen(block->idstr);
924 qemu_put_byte(f, len);
925 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
926 size += 1 + len;
927 rs->last_sent_block = block;
929 return size;
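/*
 * The resulting stream format for a page header is therefore:
 *
 *     be64  offset | flags      (always, 8 bytes)
 *     u8    strlen(idstr)       (only without RAM_SAVE_FLAG_CONTINUE)
 *     bytes idstr               (only without RAM_SAVE_FLAG_CONTINUE)
 *
 * For example (assuming the usual "pc.ram" block name), the first page of
 * a block costs 8 + 1 + 6 = 15 header bytes, while subsequent pages of the
 * same block carry RAM_SAVE_FLAG_CONTINUE and cost only 8 bytes.
 */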
933 * mig_throttle_guest_down: throttle down the guest
935 * Reduce amount of guest cpu execution to hopefully slow down memory
936 * writes. If guest dirty memory rate is reduced below the rate at
937 * which we can transfer pages to the destination then we should be
938 * able to complete migration. Some workloads dirty memory way too
939 * fast and will not effectively converge, even with auto-converge.
941 static void mig_throttle_guest_down(void)
943 MigrationState *s = migrate_get_current();
944 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
945 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
947 /* We have not started throttling yet. Let's start it. */
948 if (!cpu_throttle_active()) {
949 cpu_throttle_set(pct_initial);
950 } else {
951 /* Throttling already on, just increase the rate */
952 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
957 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
959 * @rs: current RAM state
960 * @current_addr: address for the zero page
962 * Update the xbzrle cache to reflect a page that's been sent as all 0.
963 * The important thing is that a stale (not-yet-0'd) page be replaced
964 * by the new data.
965 * As a bonus, if the page wasn't in the cache it gets added so that
966 * when a small write is made into the 0'd page it gets XBZRLE sent.
968 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
970 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
971 return;
974 /* We don't care if this fails to allocate a new cache page
975 * as long as it updated an old one */
976 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
977 ram_counters.dirty_sync_count);
980 #define ENCODING_FLAG_XBZRLE 0x1
983 * save_xbzrle_page: compress and send current page
985 * Returns: 1 means that we wrote the page
986 * 0 means that page is identical to the one already sent
987 * -1 means that xbzrle would be longer than normal
989 * @rs: current RAM state
990 * @current_data: pointer to the address of the page contents
991 * @current_addr: addr of the page
992 * @block: block that contains the page we want to send
993 * @offset: offset inside the block for the page
994 * @last_stage: if we are at the completion stage
996 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
997 ram_addr_t current_addr, RAMBlock *block,
998 ram_addr_t offset, bool last_stage)
1000 int encoded_len = 0, bytes_xbzrle;
1001 uint8_t *prev_cached_page;
1003 if (!cache_is_cached(XBZRLE.cache, current_addr,
1004 ram_counters.dirty_sync_count)) {
1005 xbzrle_counters.cache_miss++;
1006 if (!last_stage) {
1007 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1008 ram_counters.dirty_sync_count) == -1) {
1009 return -1;
1010 } else {
1011 /* update *current_data when the page has been
1012 inserted into cache */
1013 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1016 return -1;
1019 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1021 /* save current buffer into memory */
1022 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1024 /* XBZRLE encoding (if there is no overflow) */
1025 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1026 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1027 TARGET_PAGE_SIZE);
1028 if (encoded_len == 0) {
1029 trace_save_xbzrle_page_skipping();
1030 return 0;
1031 } else if (encoded_len == -1) {
1032 trace_save_xbzrle_page_overflow();
1033 xbzrle_counters.overflow++;
1034 /* update data in the cache */
1035 if (!last_stage) {
1036 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1037 *current_data = prev_cached_page;
1039 return -1;
1042 /* we need to update the data in the cache, in order to get the same data */
1043 if (!last_stage) {
1044 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1047 /* Send XBZRLE based compressed page */
1048 bytes_xbzrle = save_page_header(rs, rs->f, block,
1049 offset | RAM_SAVE_FLAG_XBZRLE);
1050 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1051 qemu_put_be16(rs->f, encoded_len);
1052 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1053 bytes_xbzrle += encoded_len + 1 + 2;
1054 xbzrle_counters.pages++;
1055 xbzrle_counters.bytes += bytes_xbzrle;
1056 ram_counters.transferred += bytes_xbzrle;
1058 return 1;
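/*
 * Byte accounting example for the XBZRLE path above (a sketch, assuming a
 * 4 KiB target page and a block already announced on the wire): a page
 * whose delta encodes to 80 bytes is sent as
 *
 *     8 (page header, CONTINUE) + 1 (ENCODING_FLAG_XBZRLE)
 *       + 2 (be16 encoded_len) + 80 (encoded data) = 91 bytes
 *
 * instead of the 4096 bytes a RAM_SAVE_FLAG_PAGE transfer would need,
 * which is what xbzrle_counters.bytes accumulates.
 */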
1062 * migration_bitmap_find_dirty: find the next dirty page from start
1064 * Called with rcu_read_lock() to protect migration_bitmap
1066 * Returns the byte offset within memory region of the start of a dirty page
1068 * @rs: current RAM state
1069 * @rb: RAMBlock where to search for dirty pages
1070 * @start: page where we start the search
1072 static inline
1073 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1074 unsigned long start)
1076 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1077 unsigned long *bitmap = rb->bmap;
1078 unsigned long next;
1080 if (rs->ram_bulk_stage && start > 0) {
1081 next = start + 1;
1082 } else {
1083 next = find_next_bit(bitmap, size, start);
1086 return next;
1089 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1090 RAMBlock *rb,
1091 unsigned long page)
1093 bool ret;
1095 ret = test_and_clear_bit(page, rb->bmap);
1097 if (ret) {
1098 rs->migration_dirty_pages--;
1100 return ret;
1103 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1104 ram_addr_t start, ram_addr_t length)
1106 rs->migration_dirty_pages +=
1107 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1108 &rs->num_dirty_pages_period);
1112 * ram_pagesize_summary: calculate all the pagesizes of a VM
1114 * Returns a summary bitmap of the page sizes of all RAMBlocks
1116 * For VMs with just normal pages this is equivalent to the host page
1117 * size. If it's got some huge pages then it's the OR of all the
1118 * different page sizes.
1120 uint64_t ram_pagesize_summary(void)
1122 RAMBlock *block;
1123 uint64_t summary = 0;
1125 RAMBLOCK_FOREACH(block) {
1126 summary |= block->page_size;
1129 return summary;
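/*
 * Worked example: a guest whose RAM blocks are backed by normal 4 KiB
 * pages plus one 2 MiB hugetlbfs-backed block yields
 *
 *     summary = 0x1000 | 0x200000 = 0x201000
 *
 * so the destination can tell that both page sizes are in use.
 */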
1132 static void migration_bitmap_sync(RAMState *rs)
1134 RAMBlock *block;
1135 int64_t end_time;
1136 uint64_t bytes_xfer_now;
1138 ram_counters.dirty_sync_count++;
1140 if (!rs->time_last_bitmap_sync) {
1141 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1144 trace_migration_bitmap_sync_start();
1145 memory_global_dirty_log_sync();
1147 qemu_mutex_lock(&rs->bitmap_mutex);
1148 rcu_read_lock();
1149 RAMBLOCK_FOREACH(block) {
1150 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1152 rcu_read_unlock();
1153 qemu_mutex_unlock(&rs->bitmap_mutex);
1155 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1157 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1159 /* more than 1 second = 1000 milliseconds */
1160 if (end_time > rs->time_last_bitmap_sync + 1000) {
1161 /* calculate period counters */
1162 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1163 / (end_time - rs->time_last_bitmap_sync);
1164 bytes_xfer_now = ram_counters.transferred;
1166 /* During block migration the auto-converge logic incorrectly detects
1167 * that ram migration makes no progress. Avoid this by disabling the
1168 * throttling logic during the bulk phase of block migration. */
1169 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1170 /* The following detection logic can be refined later. For now:
1171 Check to see if the bytes dirtied in this period exceed half of the
1172 approx. amount of bytes that just got transferred since the last
1173 time we were in this routine. If that happens twice, start or
1174 increase throttling */
1176 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1177 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1178 (++rs->dirty_rate_high_cnt >= 2)) {
1179 trace_migration_throttle();
1180 rs->dirty_rate_high_cnt = 0;
1181 mig_throttle_guest_down();
1185 if (migrate_use_xbzrle()) {
1186 if (rs->iterations_prev != rs->iterations) {
1187 xbzrle_counters.cache_miss_rate =
1188 (double)(xbzrle_counters.cache_miss -
1189 rs->xbzrle_cache_miss_prev) /
1190 (rs->iterations - rs->iterations_prev);
1192 rs->iterations_prev = rs->iterations;
1193 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1196 /* reset period counters */
1197 rs->time_last_bitmap_sync = end_time;
1198 rs->num_dirty_pages_period = 0;
1199 rs->bytes_xfer_prev = bytes_xfer_now;
1201 if (migrate_use_events()) {
1202 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
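/*
 * Throttling example for the detection logic above (a sketch with made-up
 * numbers): if a ~1 s sync period transferred 60 MB but the guest dirtied
 * 40 MB in the same period, then 40 MB > 60 MB / 2, so dirty_rate_high_cnt
 * is bumped; once that has happened twice, mig_throttle_guest_down() is
 * called, which either starts CPU throttling at cpu_throttle_initial or
 * raises it by cpu_throttle_increment.
 */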
1207 * save_zero_page: send the zero page to the stream
1209 * Returns the number of pages written.
1211 * @rs: current RAM state
1212 * @block: block that contains the page we want to send
1213 * @offset: offset inside the block for the page
1215 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1217 uint8_t *p = block->host + offset;
1218 int pages = -1;
1220 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1221 ram_counters.duplicate++;
1222 ram_counters.transferred +=
1223 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1224 qemu_put_byte(rs->f, 0);
1225 ram_counters.transferred += 1;
1226 pages = 1;
1229 return pages;
1232 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1234 if (!migrate_release_ram() || !migration_in_postcopy()) {
1235 return;
1238 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1242 * @pages: the number of pages written by the control path,
1243 * < 0 - error
1244 * > 0 - number of pages written
1246 * Returns true if the page has been saved, otherwise false.
1248 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1249 int *pages)
1251 uint64_t bytes_xmit = 0;
1252 int ret;
1254 *pages = -1;
1255 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1256 &bytes_xmit);
1257 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1258 return false;
1261 if (bytes_xmit) {
1262 ram_counters.transferred += bytes_xmit;
1263 *pages = 1;
1266 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1267 return true;
1270 if (bytes_xmit > 0) {
1271 ram_counters.normal++;
1272 } else if (bytes_xmit == 0) {
1273 ram_counters.duplicate++;
1276 return true;
1280 * directly send the page to the stream
1282 * Returns the number of pages written.
1284 * @rs: current RAM state
1285 * @block: block that contains the page we want to send
1286 * @offset: offset inside the block for the page
1287 * @buf: the page to be sent
1288 * @async: send the page asynchronously
1290 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1291 uint8_t *buf, bool async)
1293 ram_counters.transferred += save_page_header(rs, rs->f, block,
1294 offset | RAM_SAVE_FLAG_PAGE);
1295 if (async) {
1296 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1297 migrate_release_ram() &&
1298 migration_in_postcopy());
1299 } else {
1300 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1302 ram_counters.transferred += TARGET_PAGE_SIZE;
1303 ram_counters.normal++;
1304 return 1;
1308 * ram_save_page: send the given page to the stream
1310 * Returns the number of pages written.
1311 * < 0 - error
1312 * >=0 - Number of pages written - this might legally be 0
1313 * if xbzrle noticed the page was the same.
1315 * @rs: current RAM state
1316 * @block: block that contains the page we want to send
1317 * @offset: offset inside the block for the page
1318 * @last_stage: if we are at the completion stage
1320 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1322 int pages = -1;
1323 uint8_t *p;
1324 bool send_async = true;
1325 RAMBlock *block = pss->block;
1326 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1327 ram_addr_t current_addr = block->offset + offset;
1329 p = block->host + offset;
1330 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1332 XBZRLE_cache_lock();
1333 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1334 migrate_use_xbzrle()) {
1335 pages = save_xbzrle_page(rs, &p, current_addr, block,
1336 offset, last_stage);
1337 if (!last_stage) {
1338 /* Can't send this cached data async, since the cache page
1339 * might get updated before it gets to the wire
1341 send_async = false;
1345 /* XBZRLE overflow or normal page */
1346 if (pages == -1) {
1347 pages = save_normal_page(rs, block, offset, p, send_async);
1350 XBZRLE_cache_unlock();
1352 return pages;
1355 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1356 ram_addr_t offset, uint8_t *source_buf)
1358 RAMState *rs = ram_state;
1359 int bytes_sent, blen;
1360 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1362 bytes_sent = save_page_header(rs, f, block, offset |
1363 RAM_SAVE_FLAG_COMPRESS_PAGE);
1366 * copy it to an internal buffer to avoid it being modified by the VM
1367 * so that we can catch errors during compression and
1368 * decompression
1370 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1371 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1372 if (blen < 0) {
1373 bytes_sent = 0;
1374 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1375 error_report("compressed data failed!");
1376 } else {
1377 bytes_sent += blen;
1378 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1381 return bytes_sent;
1384 static void flush_compressed_data(RAMState *rs)
1386 int idx, len, thread_count;
1388 if (!migrate_use_compression()) {
1389 return;
1391 thread_count = migrate_compress_threads();
1393 qemu_mutex_lock(&comp_done_lock);
1394 for (idx = 0; idx < thread_count; idx++) {
1395 while (!comp_param[idx].done) {
1396 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1399 qemu_mutex_unlock(&comp_done_lock);
1401 for (idx = 0; idx < thread_count; idx++) {
1402 qemu_mutex_lock(&comp_param[idx].mutex);
1403 if (!comp_param[idx].quit) {
1404 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1405 ram_counters.transferred += len;
1407 qemu_mutex_unlock(&comp_param[idx].mutex);
1411 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1412 ram_addr_t offset)
1414 param->block = block;
1415 param->offset = offset;
1418 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1419 ram_addr_t offset)
1421 int idx, thread_count, bytes_xmit = -1, pages = -1;
1423 thread_count = migrate_compress_threads();
1424 qemu_mutex_lock(&comp_done_lock);
1425 while (true) {
1426 for (idx = 0; idx < thread_count; idx++) {
1427 if (comp_param[idx].done) {
1428 comp_param[idx].done = false;
1429 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1430 qemu_mutex_lock(&comp_param[idx].mutex);
1431 set_compress_params(&comp_param[idx], block, offset);
1432 qemu_cond_signal(&comp_param[idx].cond);
1433 qemu_mutex_unlock(&comp_param[idx].mutex);
1434 pages = 1;
1435 ram_counters.normal++;
1436 ram_counters.transferred += bytes_xmit;
1437 break;
1440 if (pages > 0) {
1441 break;
1442 } else {
1443 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1446 qemu_mutex_unlock(&comp_done_lock);
1448 return pages;
1452 * find_dirty_block: find the next dirty page and update any state
1453 * associated with the search process.
1455 * Returns true if a page is found
1457 * @rs: current RAM state
1458 * @pss: data about the state of the current dirty page scan
1459 * @again: set to false if the search has scanned the whole of RAM
1461 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1463 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1464 if (pss->complete_round && pss->block == rs->last_seen_block &&
1465 pss->page >= rs->last_page) {
1467 * We've been once around the RAM and haven't found anything.
1468 * Give up.
1470 *again = false;
1471 return false;
1473 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1474 /* Didn't find anything in this RAM Block */
1475 pss->page = 0;
1476 pss->block = QLIST_NEXT_RCU(pss->block, next);
1477 if (!pss->block) {
1478 /* Hit the end of the list */
1479 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1480 /* Flag that we've looped */
1481 pss->complete_round = true;
1482 rs->ram_bulk_stage = false;
1483 if (migrate_use_xbzrle()) {
1484 /* If xbzrle is on, stop using the data compression at this
1485 * point. In theory, xbzrle can do better than compression.
1487 flush_compressed_data(rs);
1490 /* Didn't find anything this time, but try again on the new block */
1491 *again = true;
1492 return false;
1493 } else {
1494 /* Can go around again, but... */
1495 *again = true;
1496 /* We've found something so probably don't need to */
1497 return true;
1502 * unqueue_page: gets a page off the queue
1504 * Helper for 'get_queued_page' - gets a page off the queue
1506 * Returns the block of the page (or NULL if none available)
1508 * @rs: current RAM state
1509 * @offset: used to return the offset within the RAMBlock
1511 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1513 RAMBlock *block = NULL;
1515 qemu_mutex_lock(&rs->src_page_req_mutex);
1516 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1517 struct RAMSrcPageRequest *entry =
1518 QSIMPLEQ_FIRST(&rs->src_page_requests);
1519 block = entry->rb;
1520 *offset = entry->offset;
1522 if (entry->len > TARGET_PAGE_SIZE) {
1523 entry->len -= TARGET_PAGE_SIZE;
1524 entry->offset += TARGET_PAGE_SIZE;
1525 } else {
1526 memory_region_unref(block->mr);
1527 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1528 g_free(entry);
1531 qemu_mutex_unlock(&rs->src_page_req_mutex);
1533 return block;
1537 * get_queued_page: unqueue a page from the postcopy requests
1539 * Skips pages that are already sent (!dirty)
1541 * Returns true if a queued page is found
1543 * @rs: current RAM state
1544 * @pss: data about the state of the current dirty page scan
1546 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1548 RAMBlock *block;
1549 ram_addr_t offset;
1550 bool dirty;
1552 do {
1553 block = unqueue_page(rs, &offset);
1555 * We're sending this page, and since it's postcopy nothing else
1556 * will dirty it, and we must make sure it doesn't get sent again
1557 * even if this queue request was received after the background
1558 * search already sent it.
1560 if (block) {
1561 unsigned long page;
1563 page = offset >> TARGET_PAGE_BITS;
1564 dirty = test_bit(page, block->bmap);
1565 if (!dirty) {
1566 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1567 page, test_bit(page, block->unsentmap));
1568 } else {
1569 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1573 } while (block && !dirty);
1575 if (block) {
1577 * As soon as we start servicing pages out of order, then we have
1578 * to kill the bulk stage, since the bulk stage assumes
1579 * in (migration_bitmap_find_and_reset_dirty) that every page is
1580 * dirty, that's no longer true.
1582 rs->ram_bulk_stage = false;
1585 * We want the background search to continue from the queued page
1586 * since the guest is likely to want other pages near to the page
1587 * it just requested.
1589 pss->block = block;
1590 pss->page = offset >> TARGET_PAGE_BITS;
1593 return !!block;
1597 * migration_page_queue_free: drop any remaining pages in the ram
1598 * request queue
1600 * It should be empty at the end anyway, but in error cases there may
1601 * be some left. In case any page is left, we drop it.
1604 static void migration_page_queue_free(RAMState *rs)
1606 struct RAMSrcPageRequest *mspr, *next_mspr;
1607 /* This queue generally should be empty - but in the case of a failed
1608 * migration it might have some leftover entries.
1610 rcu_read_lock();
1611 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1612 memory_region_unref(mspr->rb->mr);
1613 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1614 g_free(mspr);
1616 rcu_read_unlock();
1620 * ram_save_queue_pages: queue the page for transmission
1622 * A request from postcopy destination for example.
1624 * Returns zero on success or negative on error
1626 * @rbname: Name of the RAMBlock of the request. NULL means the
1627 * same as the last one.
1628 * @start: starting address from the start of the RAMBlock
1629 * @len: length (in bytes) to send
1631 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1633 RAMBlock *ramblock;
1634 RAMState *rs = ram_state;
1636 ram_counters.postcopy_requests++;
1637 rcu_read_lock();
1638 if (!rbname) {
1639 /* Reuse last RAMBlock */
1640 ramblock = rs->last_req_rb;
1642 if (!ramblock) {
1644 * Shouldn't happen, we can't reuse the last RAMBlock if
1645 * it's the 1st request.
1647 error_report("ram_save_queue_pages no previous block");
1648 goto err;
1650 } else {
1651 ramblock = qemu_ram_block_by_name(rbname);
1653 if (!ramblock) {
1654 /* We shouldn't be asked for a non-existent RAMBlock */
1655 error_report("ram_save_queue_pages no block '%s'", rbname);
1656 goto err;
1658 rs->last_req_rb = ramblock;
1660 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1661 if (start + len > ramblock->used_length) {
1662 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1663 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1664 __func__, start, len, ramblock->used_length);
1665 goto err;
1668 struct RAMSrcPageRequest *new_entry =
1669 g_malloc0(sizeof(struct RAMSrcPageRequest));
1670 new_entry->rb = ramblock;
1671 new_entry->offset = start;
1672 new_entry->len = len;
1674 memory_region_ref(ramblock->mr);
1675 qemu_mutex_lock(&rs->src_page_req_mutex);
1676 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1677 qemu_mutex_unlock(&rs->src_page_req_mutex);
1678 rcu_read_unlock();
1680 return 0;
1682 err:
1683 rcu_read_unlock();
1684 return -1;
1687 static bool save_page_use_compression(RAMState *rs)
1689 if (!migrate_use_compression()) {
1690 return false;
1694 * If xbzrle is on, stop using the data compression after first
1695 * round of migration even if compression is enabled. In theory,
1696 * xbzrle can do better than compression.
1698 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1699 return true;
1702 return false;
1706 * ram_save_target_page: save one target page
1708 * Returns the number of pages written
1710 * @rs: current RAM state
1711 * @pss: data about the page we want to send
1712 * @last_stage: if we are at the completion stage
1714 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1715 bool last_stage)
1717 RAMBlock *block = pss->block;
1718 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1719 int res;
1721 if (control_save_page(rs, block, offset, &res)) {
1722 return res;
1726 * When starting the process of a new block, the first page of
1727 * the block should be sent out before other pages in the same
1728 * block, and all the pages in the last block should have been sent
1729 * out. Keeping this order is important, because the 'cont' flag
1730 * is used to avoid resending the block name.
1732 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1733 flush_compressed_data(rs);
1736 res = save_zero_page(rs, block, offset);
1737 if (res > 0) {
1738 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1739 * page would be stale
1741 if (!save_page_use_compression(rs)) {
1742 XBZRLE_cache_lock();
1743 xbzrle_cache_zero_page(rs, block->offset + offset);
1744 XBZRLE_cache_unlock();
1746 ram_release_pages(block->idstr, offset, res);
1747 return res;
1751 * Make sure the first page is sent out before other pages.
1753 * We post it as a normal page, as compression would take much
1754 * CPU resource.
1756 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1757 return compress_page_with_multi_thread(rs, block, offset);
1760 return ram_save_page(rs, pss, last_stage);
1764 * ram_save_host_page: save a whole host page
1766 * Starting at *offset send pages up to the end of the current host
1767 * page. It's valid for the initial offset to point into the middle of
1768 * a host page in which case the remainder of the hostpage is sent.
1769 * Only dirty target pages are sent. Note that the host page size may
1770 * be a huge page for this block.
1771 * The saving stops at the boundary of the used_length of the block
1772 * if the RAMBlock isn't a multiple of the host page size.
1774 * Returns the number of pages written or negative on error
1776 * @rs: current RAM state
1777 * @ms: current migration state
1778 * @pss: data about the page we want to send
1779 * @last_stage: if we are at the completion stage
1781 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1782 bool last_stage)
1784 int tmppages, pages = 0;
1785 size_t pagesize_bits =
1786 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1788 do {
1789 /* Check if the page is dirty and if it is, send it */
1790 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1791 pss->page++;
1792 continue;
1795 tmppages = ram_save_target_page(rs, pss, last_stage);
1796 if (tmppages < 0) {
1797 return tmppages;
1800 pages += tmppages;
1801 if (pss->block->unsentmap) {
1802 clear_bit(pss->page, pss->block->unsentmap);
1805 pss->page++;
1806 } while ((pss->page & (pagesize_bits - 1)) &&
1807 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1809 /* The offset we leave with is the last one we looked at */
1810 pss->page--;
1811 return pages;
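/*
 * Example for the host-page loop above: with 4 KiB target pages on a
 * block backed by 2 MiB huge pages, pagesize_bits is 512, so one call
 * walks up to 512 target pages, skipping the ones whose dirty bit is
 * already clear; for a normal 4 KiB-backed block the loop body runs
 * exactly once.
 */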
1815 * ram_find_and_save_block: finds a dirty page and sends it to f
1817 * Called within an RCU critical section.
1819 * Returns the number of pages written where zero means no dirty pages
1821 * @rs: current RAM state
1822 * @last_stage: if we are at the completion stage
1824 * On systems where host-page-size > target-page-size it will send all the
1825 * pages in a host page that are dirty.
1828 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1830 PageSearchStatus pss;
1831 int pages = 0;
1832 bool again, found;
1834 /* No dirty page as there is zero RAM */
1835 if (!ram_bytes_total()) {
1836 return pages;
1839 pss.block = rs->last_seen_block;
1840 pss.page = rs->last_page;
1841 pss.complete_round = false;
1843 if (!pss.block) {
1844 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1847 do {
1848 again = true;
1849 found = get_queued_page(rs, &pss);
1851 if (!found) {
1852 /* priority queue empty, so just search for something dirty */
1853 found = find_dirty_block(rs, &pss, &again);
1856 if (found) {
1857 pages = ram_save_host_page(rs, &pss, last_stage);
1859 } while (!pages && again);
1861 rs->last_seen_block = pss.block;
1862 rs->last_page = pss.page;
1864 return pages;
1867 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1869 uint64_t pages = size / TARGET_PAGE_SIZE;
1871 if (zero) {
1872 ram_counters.duplicate += pages;
1873 } else {
1874 ram_counters.normal += pages;
1875 ram_counters.transferred += size;
1876 qemu_update_position(f, size);
1880 uint64_t ram_bytes_total(void)
1882 RAMBlock *block;
1883 uint64_t total = 0;
1885 rcu_read_lock();
1886 RAMBLOCK_FOREACH(block) {
1887 total += block->used_length;
1889 rcu_read_unlock();
1890 return total;
1893 static void xbzrle_load_setup(void)
1895 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1898 static void xbzrle_load_cleanup(void)
1900 g_free(XBZRLE.decoded_buf);
1901 XBZRLE.decoded_buf = NULL;
1904 static void ram_state_cleanup(RAMState **rsp)
1906 if (*rsp) {
1907 migration_page_queue_free(*rsp);
1908 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1909 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1910 g_free(*rsp);
1911 *rsp = NULL;
1915 static void xbzrle_cleanup(void)
1917 XBZRLE_cache_lock();
1918 if (XBZRLE.cache) {
1919 cache_fini(XBZRLE.cache);
1920 g_free(XBZRLE.encoded_buf);
1921 g_free(XBZRLE.current_buf);
1922 g_free(XBZRLE.zero_target_page);
1923 XBZRLE.cache = NULL;
1924 XBZRLE.encoded_buf = NULL;
1925 XBZRLE.current_buf = NULL;
1926 XBZRLE.zero_target_page = NULL;
1928 XBZRLE_cache_unlock();
1931 static void ram_save_cleanup(void *opaque)
1933 RAMState **rsp = opaque;
1934 RAMBlock *block;
1936 /* the caller must hold the iothread lock or be in a bh, so there is
1937 * no writing race against this migration_bitmap
1939 memory_global_dirty_log_stop();
1941 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1942 g_free(block->bmap);
1943 block->bmap = NULL;
1944 g_free(block->unsentmap);
1945 block->unsentmap = NULL;
1948 xbzrle_cleanup();
1949 compress_threads_save_cleanup();
1950 ram_state_cleanup(rsp);
1953 static void ram_state_reset(RAMState *rs)
1955 rs->last_seen_block = NULL;
1956 rs->last_sent_block = NULL;
1957 rs->last_page = 0;
1958 rs->last_version = ram_list.version;
1959 rs->ram_bulk_stage = true;
1962 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1965 * 'expected' is the value you expect the bitmap mostly to be full
1966 * of; it won't bother printing lines that are all this value.
1967 * If 'todump' is null the migration bitmap is dumped.
1969 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1970 unsigned long pages)
1972 int64_t cur;
1973 int64_t linelen = 128;
1974 char linebuf[129];
1976 for (cur = 0; cur < pages; cur += linelen) {
1977 int64_t curb;
1978 bool found = false;
1980 * Last line; catch the case where the line length
1981 * is longer than the remaining RAM
1983 if (cur + linelen > pages) {
1984 linelen = pages - cur;
1986 for (curb = 0; curb < linelen; curb++) {
1987 bool thisbit = test_bit(cur + curb, todump);
1988 linebuf[curb] = thisbit ? '1' : '.';
1989 found = found || (thisbit != expected);
1991 if (found) {
1992 linebuf[curb] = '\0';
1993 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1998 /* **** functions for postcopy ***** */
2000 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2002 struct RAMBlock *block;
2004 RAMBLOCK_FOREACH(block) {
2005 unsigned long *bitmap = block->bmap;
2006 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2007 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2009 while (run_start < range) {
2010 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2011 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2012 (run_end - run_start) << TARGET_PAGE_BITS);
2013 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2019 * postcopy_send_discard_bm_ram: discard a RAMBlock
2021 * Returns zero on success
2023 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2024 * Note: At this point the 'unsentmap' is the processed bitmap combined
2025 * with the dirtymap; so a '1' means it's either dirty or unsent.
2027 * @ms: current migration state
2028 * @pds: state for postcopy
2029 * @block: RAMBlock to discard
2032 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2033 PostcopyDiscardState *pds,
2034 RAMBlock *block)
2036 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2037 unsigned long current;
2038 unsigned long *unsentmap = block->unsentmap;
2040 for (current = 0; current < end; ) {
2041 unsigned long one = find_next_bit(unsentmap, end, current);
2043 if (one <= end) {
2044 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2045 unsigned long discard_length;
2047 if (zero >= end) {
2048 discard_length = end - one;
2049 } else {
2050 discard_length = zero - one;
2052 if (discard_length) {
2053 postcopy_discard_send_range(ms, pds, one, discard_length);
2055 current = one + discard_length;
2056 } else {
2057 current = one;
2061 return 0;
2065 * postcopy_each_ram_send_discard: discard all RAMBlocks
2067 * Returns 0 for success or negative for error
2069 * Utility for the outgoing postcopy code.
2070 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2071 * passing it bitmap indexes and name.
2072 * (qemu_ram_foreach_block ends up passing unscaled lengths
2073 * which would mean postcopy code would have to deal with target page)
2075 * @ms: current migration state
2077 static int postcopy_each_ram_send_discard(MigrationState *ms)
2079 struct RAMBlock *block;
2080 int ret;
2082 RAMBLOCK_FOREACH(block) {
2083 PostcopyDiscardState *pds =
2084 postcopy_discard_send_init(ms, block->idstr);
2087 * Postcopy sends chunks of bitmap over the wire, but it
2088 * just needs indexes at this point, avoids it having
2089 * target page specific code.
2091 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2092 postcopy_discard_send_finish(ms, pds);
2093 if (ret) {
2094 return ret;
2098 return 0;
2102 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2104 * Helper for postcopy_chunk_hostpages; it's called twice to
2105 * canonicalize the two bitmaps, that are similar, but one is
2106 * inverted.
2108 * Postcopy requires that all target pages in a hostpage are dirty or
2109 * clean, not a mix. This function canonicalizes the bitmaps.
2111 * @ms: current migration state
2112 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2113 * otherwise we need to canonicalize partially dirty host pages
2114 * @block: block that contains the page we want to canonicalize
2115 * @pds: state for postcopy
2117 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2118 RAMBlock *block,
2119 PostcopyDiscardState *pds)
2121 RAMState *rs = ram_state;
2122 unsigned long *bitmap = block->bmap;
2123 unsigned long *unsentmap = block->unsentmap;
2124 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2125 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2126 unsigned long run_start;
2128 if (block->page_size == TARGET_PAGE_SIZE) {
2129 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2130 return;
2133 if (unsent_pass) {
2134 /* Find a sent page */
2135 run_start = find_next_zero_bit(unsentmap, pages, 0);
2136 } else {
2137 /* Find a dirty page */
2138 run_start = find_next_bit(bitmap, pages, 0);
2141 while (run_start < pages) {
2142 bool do_fixup = false;
2143 unsigned long fixup_start_addr;
2144 unsigned long host_offset;
2147 * If the start of this run of pages is in the middle of a host
2148 * page, then we need to fixup this host page.
2150 host_offset = run_start % host_ratio;
2151 if (host_offset) {
2152 do_fixup = true;
2153 run_start -= host_offset;
2154 fixup_start_addr = run_start;
2155 /* For the next pass */
2156 run_start = run_start + host_ratio;
2157 } else {
2158 /* Find the end of this run */
2159 unsigned long run_end;
2160 if (unsent_pass) {
2161 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2162 } else {
2163 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2166 * If the end isn't at the start of a host page, then the
2167 * run doesn't finish at the end of a host page
2168 * and we need to discard.
2170 host_offset = run_end % host_ratio;
2171 if (host_offset) {
2172 do_fixup = true;
2173 fixup_start_addr = run_end - host_offset;
2175 * This host page has gone, the next loop iteration starts
2176 * from after the fixup
2178 run_start = fixup_start_addr + host_ratio;
2179 } else {
2181 * No discards on this iteration, next loop starts from
2182 * next sent/dirty page
2184 run_start = run_end + 1;
2188 if (do_fixup) {
2189 unsigned long page;
2191 /* Tell the destination to discard this page */
2192 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2193 /* For the unsent_pass we:
2194 * discard partially sent pages
2195 * For the !unsent_pass (dirty) we:
2196 * discard partially dirty pages that were sent
2197 * (any partially sent pages were already discarded
2198 * by the previous unsent_pass)
2200 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2201 host_ratio);
2204 /* Clean up the bitmap */
2205 for (page = fixup_start_addr;
2206 page < fixup_start_addr + host_ratio; page++) {
2207 /* All pages in this host page are now not sent */
2208 set_bit(page, unsentmap);
2211 * Remark them as dirty, updating the count for any pages
2212 * that weren't previously dirty.
2214 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2218 if (unsent_pass) {
2219 /* Find the next sent page for the next iteration */
2220 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2221 } else {
2222 /* Find the next dirty page for the next iteration */
2223 run_start = find_next_bit(bitmap, pages, run_start);
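/*
 * Worked example (2MB hugetlbfs pages and 4KB target pages assumed, so
 * host_ratio == 512): if a dirty run starts at target page 1000, then
 * host_offset == 1000 % 512 == 488, so the fixup rounds down to page 512.
 * If that host page had already been (partially) sent, a discard for pages
 * 512..1023 is queued, and all 512 target pages are re-marked unsent and
 * dirty so that postcopy later re-sends the host page as a single atomic
 * unit.
 */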
2229 * postcopy_chunk_hostpages: discard any partially sent host page
2231 * Utility for the outgoing postcopy code.
2233 * Discard any partially sent host-page size chunks, mark any partially
2234 * dirty host-page size chunks as all dirty. In this case the host-page
2235 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2237 * Returns zero on success
2239 * @ms: current migration state
2240 * @block: block we want to work with
2242 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2244 PostcopyDiscardState *pds =
2245 postcopy_discard_send_init(ms, block->idstr);
2247 /* First pass: Discard all partially sent host pages */
2248 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2250 * Second pass: Ensure that all partially dirty host pages are made
2251 * fully dirty.
2253 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2255 postcopy_discard_send_finish(ms, pds);
2256 return 0;
2260 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2262 * Returns zero on success
2264 * Transmit the set of pages to be discarded after precopy to the target;
2265 * these are pages that:
2266 * a) Have been previously transmitted but are now dirty again
2267 * b) Pages that have never been transmitted; this ensures that
2268 * any pages on the destination that have been mapped by background
2269 * tasks get discarded (transparent huge pages is the specific concern)
2270 * Hopefully this is pretty sparse
2272 * @ms: current migration state
2274 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2276 RAMState *rs = ram_state;
2277 RAMBlock *block;
2278 int ret;
2280 rcu_read_lock();
2282 /* This should be our last sync, the src is now paused */
2283 migration_bitmap_sync(rs);
2285 /* Easiest way to make sure we don't resume in the middle of a host-page */
2286 rs->last_seen_block = NULL;
2287 rs->last_sent_block = NULL;
2288 rs->last_page = 0;
2290 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2291 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2292 unsigned long *bitmap = block->bmap;
2293 unsigned long *unsentmap = block->unsentmap;
2295 if (!unsentmap) {
2296 /* We don't have a safe way to resize the sentmap, so
2297 * if the bitmap was resized it will be NULL at this
2298 * point.
2300 error_report("migration ram resized during precopy phase");
2301 rcu_read_unlock();
2302 return -EINVAL;
2304 /* Deal with TPS != HPS and huge pages */
2305 ret = postcopy_chunk_hostpages(ms, block);
2306 if (ret) {
2307 rcu_read_unlock();
2308 return ret;
2312 * Update the unsentmap to be unsentmap = unsentmap | dirty
2314 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2315 #ifdef DEBUG_POSTCOPY
2316 ram_debug_dump_bitmap(unsentmap, true, pages);
2317 #endif
2319 trace_ram_postcopy_send_discard_bitmap();
2321 ret = postcopy_each_ram_send_discard(ms);
2322 rcu_read_unlock();
2324 return ret;
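/*
 * Example of the unsentmap |= dirty merge performed above (illustrative):
 * a page that was sent during precopy but dirtied afterwards has
 * unsent == 0 and dirty == 1, so after the OR its unsent bit is 1, the
 * destination is told to discard its stale copy and postcopy fetches the
 * fresh one.  A page that was never sent (unsent == 1) stays 1 regardless
 * of the dirty bit, which also flushes anything the destination mapped in
 * the background (the transparent huge page concern mentioned above).
 */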
2328 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2330 * Returns zero on success
2332 * @rbname: name of the RAMBlock of the request. NULL means the
2333 * same as the last one.
2334 * @start: byte offset into the RAMBlock at which to start discarding
2335 * @length: number of bytes to discard
2337 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2339 int ret = -1;
2341 trace_ram_discard_range(rbname, start, length);
2343 rcu_read_lock();
2344 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2346 if (!rb) {
2347 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2348 goto err;
2351 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2352 length >> qemu_target_page_bits());
2353 ret = ram_block_discard_range(rb, start, length);
2355 err:
2356 rcu_read_unlock();
2358 return ret;
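/*
 * Worked example for the receivedmap update above (4KB target pages
 * assumed, i.e. qemu_target_page_bits() == 12): a request with
 * start == 0x200000 and length == 0x200000 clears 512 bits starting at bit
 * 512, i.e. bits 512..1023, before ram_block_discard_range() drops the
 * backing pages themselves.
 */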
2362 * For every allocation, we will try not to crash the VM if the
2363 * allocation fails.
2365 static int xbzrle_init(void)
2367 Error *local_err = NULL;
2369 if (!migrate_use_xbzrle()) {
2370 return 0;
2373 XBZRLE_cache_lock();
2375 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2376 if (!XBZRLE.zero_target_page) {
2377 error_report("%s: Error allocating zero page", __func__);
2378 goto err_out;
2381 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2382 TARGET_PAGE_SIZE, &local_err);
2383 if (!XBZRLE.cache) {
2384 error_report_err(local_err);
2385 goto free_zero_page;
2388 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2389 if (!XBZRLE.encoded_buf) {
2390 error_report("%s: Error allocating encoded_buf", __func__);
2391 goto free_cache;
2394 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2395 if (!XBZRLE.current_buf) {
2396 error_report("%s: Error allocating current_buf", __func__);
2397 goto free_encoded_buf;
2400 /* We are all good */
2401 XBZRLE_cache_unlock();
2402 return 0;
2404 free_encoded_buf:
2405 g_free(XBZRLE.encoded_buf);
2406 XBZRLE.encoded_buf = NULL;
2407 free_cache:
2408 cache_fini(XBZRLE.cache);
2409 XBZRLE.cache = NULL;
2410 free_zero_page:
2411 g_free(XBZRLE.zero_target_page);
2412 XBZRLE.zero_target_page = NULL;
2413 err_out:
2414 XBZRLE_cache_unlock();
2415 return -ENOMEM;
2418 static int ram_state_init(RAMState **rsp)
2420 *rsp = g_try_new0(RAMState, 1);
2422 if (!*rsp) {
2423 error_report("%s: Init ramstate fail", __func__);
2424 return -1;
2427 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2428 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2429 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2432 * Count the total number of pages used by ram blocks not including any
2433 * gaps due to alignment or unplugs.
2435 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2437 ram_state_reset(*rsp);
2439 return 0;
2442 static void ram_list_init_bitmaps(void)
2444 RAMBlock *block;
2445 unsigned long pages;
2447 /* Skip setting bitmap if there is no RAM */
2448 if (ram_bytes_total()) {
2449 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2450 pages = block->max_length >> TARGET_PAGE_BITS;
2451 block->bmap = bitmap_new(pages);
2452 bitmap_set(block->bmap, 0, pages);
2453 if (migrate_postcopy_ram()) {
2454 block->unsentmap = bitmap_new(pages);
2455 bitmap_set(block->unsentmap, 0, pages);
2461 static void ram_init_bitmaps(RAMState *rs)
2463 /* For memory_global_dirty_log_start below. */
2464 qemu_mutex_lock_iothread();
2465 qemu_mutex_lock_ramlist();
2466 rcu_read_lock();
2468 ram_list_init_bitmaps();
2469 memory_global_dirty_log_start();
2470 migration_bitmap_sync(rs);
2472 rcu_read_unlock();
2473 qemu_mutex_unlock_ramlist();
2474 qemu_mutex_unlock_iothread();
2477 static int ram_init_all(RAMState **rsp)
2479 if (ram_state_init(rsp)) {
2480 return -1;
2483 if (xbzrle_init()) {
2484 ram_state_cleanup(rsp);
2485 return -1;
2488 ram_init_bitmaps(*rsp);
2490 return 0;
2494 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2495 * long-running RCU critical section. When rcu-reclaims in the code
2496 * start to become numerous it will be necessary to reduce the
2497 * granularity of these critical sections.
2501 * ram_save_setup: Setup RAM for migration
2503 * Returns zero to indicate success and negative for error
2505 * @f: QEMUFile where to send the data
2506 * @opaque: RAMState pointer
2508 static int ram_save_setup(QEMUFile *f, void *opaque)
2510 RAMState **rsp = opaque;
2511 RAMBlock *block;
2513 if (compress_threads_save_setup()) {
2514 return -1;
2517 /* migration has already setup the bitmap, reuse it. */
2518 if (!migration_in_colo_state()) {
2519 if (ram_init_all(rsp) != 0) {
2520 compress_threads_save_cleanup();
2521 return -1;
2524 (*rsp)->f = f;
2526 rcu_read_lock();
2528 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2530 RAMBLOCK_FOREACH(block) {
2531 qemu_put_byte(f, strlen(block->idstr));
2532 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2533 qemu_put_be64(f, block->used_length);
2534 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2535 qemu_put_be64(f, block->page_size);
2539 rcu_read_unlock();
2541 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2542 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2544 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2546 return 0;
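/*
 * Stream layout produced by the setup stage above, as emitted by the
 * qemu_put_* calls (summarized here for illustration):
 *
 *   be64   ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each RAMBlock:
 *     byte   strlen(idstr)
 *     bytes  idstr (not NUL terminated)
 *     be64   used_length
 *     be64   page_size   (only when postcopy is enabled and the block's
 *                         page size differs from qemu_host_page_size)
 *   be64   RAM_SAVE_FLAG_EOS
 */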
2550 * ram_save_iterate: iterative stage for migration
2552 * Returns zero to indicate success and negative for error
2554 * @f: QEMUFile where to send the data
2555 * @opaque: RAMState pointer
2557 static int ram_save_iterate(QEMUFile *f, void *opaque)
2559 RAMState **temp = opaque;
2560 RAMState *rs = *temp;
2561 int ret;
2562 int i;
2563 int64_t t0;
2564 int done = 0;
2566 if (blk_mig_bulk_active()) {
2567 /* Avoid transferring ram during bulk phase of block migration as
2568 * the bulk phase will usually take a long time and transferring
2569 * ram updates during that time is pointless. */
2570 goto out;
2573 rcu_read_lock();
2574 if (ram_list.version != rs->last_version) {
2575 ram_state_reset(rs);
2578 /* Read version before ram_list.blocks */
2579 smp_rmb();
2581 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2583 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2584 i = 0;
2585 while ((ret = qemu_file_rate_limit(f)) == 0) {
2586 int pages;
2588 pages = ram_find_and_save_block(rs, false);
2589 /* no more pages to send */
2590 if (pages == 0) {
2591 done = 1;
2592 break;
2594 rs->iterations++;
2596 /* we want to check in the 1st loop, just in case it was the 1st time
2597 and we had to sync the dirty bitmap.
2598 qemu_clock_get_ns() is a bit expensive, so we only check every few
2599 iterations
2601 if ((i & 63) == 0) {
2602 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2603 if (t1 > MAX_WAIT) {
2604 trace_ram_save_iterate_big_wait(t1, i);
2605 break;
2608 i++;
2610 flush_compressed_data(rs);
2611 rcu_read_unlock();
2614 * Must occur before EOS (or any QEMUFile operation)
2615 * because of RDMA protocol.
2617 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2619 out:
2620 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2621 ram_counters.transferred += 8;
2623 ret = qemu_file_get_error(f);
2624 if (ret < 0) {
2625 return ret;
2628 return done;
2632 * ram_save_complete: function called to send the remaining amount of ram
2634 * Returns zero to indicate success
2636 * Called with iothread lock
2638 * @f: QEMUFile where to send the data
2639 * @opaque: RAMState pointer
2641 static int ram_save_complete(QEMUFile *f, void *opaque)
2643 RAMState **temp = opaque;
2644 RAMState *rs = *temp;
2646 rcu_read_lock();
2648 if (!migration_in_postcopy()) {
2649 migration_bitmap_sync(rs);
2652 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2654 /* try transferring iterative blocks of memory */
2656 /* flush all remaining blocks regardless of rate limiting */
2657 while (true) {
2658 int pages;
2660 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2661 /* no more blocks to send */
2662 if (pages == 0) {
2663 break;
2667 flush_compressed_data(rs);
2668 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2670 rcu_read_unlock();
2672 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2674 return 0;
2677 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2678 uint64_t *res_precopy_only,
2679 uint64_t *res_compatible,
2680 uint64_t *res_postcopy_only)
2682 RAMState **temp = opaque;
2683 RAMState *rs = *temp;
2684 uint64_t remaining_size;
2686 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2688 if (!migration_in_postcopy() &&
2689 remaining_size < max_size) {
2690 qemu_mutex_lock_iothread();
2691 rcu_read_lock();
2692 migration_bitmap_sync(rs);
2693 rcu_read_unlock();
2694 qemu_mutex_unlock_iothread();
2695 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2698 if (migrate_postcopy_ram()) {
2699 /* We can do postcopy, and all the data is postcopiable */
2700 *res_compatible += remaining_size;
2701 } else {
2702 *res_precopy_only += remaining_size;
2706 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2708 unsigned int xh_len;
2709 int xh_flags;
2710 uint8_t *loaded_data;
2712 /* extract RLE header */
2713 xh_flags = qemu_get_byte(f);
2714 xh_len = qemu_get_be16(f);
2716 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2717 error_report("Failed to load XBZRLE page - wrong compression!");
2718 return -1;
2721 if (xh_len > TARGET_PAGE_SIZE) {
2722 error_report("Failed to load XBZRLE page - len overflow!");
2723 return -1;
2725 loaded_data = XBZRLE.decoded_buf;
2726 /* load data and decode */
2727 /* it can change loaded_data to point to an internal buffer */
2728 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2730 /* decode RLE */
2731 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2732 TARGET_PAGE_SIZE) == -1) {
2733 error_report("Failed to load XBZRLE page - decode error!");
2734 return -1;
2737 return 0;
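/*
 * Wire format consumed above (for illustration): a one-byte header that
 * must equal ENCODING_FLAG_XBZRLE, a big-endian 16-bit encoded length of at
 * most TARGET_PAGE_SIZE, then xh_len bytes of XBZRLE data that are applied
 * as a delta on top of the page content already present at @host.
 */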
2741 * ram_block_from_stream: read a RAMBlock id from the migration stream
2743 * Must be called from within a rcu critical section.
2745 * Returns a pointer from within the RCU-protected ram_list.
2747 * @f: QEMUFile where to read the data from
2748 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2750 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2752 static RAMBlock *block = NULL;
2753 char id[256];
2754 uint8_t len;
2756 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2757 if (!block) {
2758 error_report("Ack, bad migration stream!");
2759 return NULL;
2761 return block;
2764 len = qemu_get_byte(f);
2765 qemu_get_buffer(f, (uint8_t *)id, len);
2766 id[len] = 0;
2768 block = qemu_ram_block_by_name(id);
2769 if (!block) {
2770 error_report("Can't find block %s", id);
2771 return NULL;
2774 return block;
2777 static inline void *host_from_ram_block_offset(RAMBlock *block,
2778 ram_addr_t offset)
2780 if (!offset_in_ramblock(block, offset)) {
2781 return NULL;
2784 return block->host + offset;
2788 * ram_handle_compressed: handle the zero page case
2790 * If a page (or a whole RDMA chunk) has been
2791 * determined to be zero, then zap it.
2793 * @host: host address for the zero page
2794 * @ch: what the page is filled from. We only support zero
2795 * @size: size of the zero page
2797 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2799 if (ch != 0 || !is_zero_range(host, size)) {
2800 memset(host, ch, size);
2804 /* return the size after decompression, or negative value on error */
2805 static int
2806 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2807 const uint8_t *source, size_t source_len)
2809 int err;
2811 err = inflateReset(stream);
2812 if (err != Z_OK) {
2813 return -1;
2816 stream->avail_in = source_len;
2817 stream->next_in = (uint8_t *)source;
2818 stream->avail_out = dest_len;
2819 stream->next_out = dest;
2821 err = inflate(stream, Z_NO_FLUSH);
2822 if (err != Z_STREAM_END) {
2823 return -1;
2826 return stream->total_out;
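/*
 * Note on the zlib usage above (illustrative sketch of the sequence): the
 * z_stream is set up once per decompress thread with inflateInit() in
 * compress_threads_load_setup() below, so each page only pays for an
 * inflateReset() plus a single inflate() call:
 *
 *   inflateReset(&stream);
 *   stream.next_in  = compressed;  stream.avail_in  = compressed_len;
 *   stream.next_out = page;        stream.avail_out = TARGET_PAGE_SIZE;
 *   inflate(&stream, Z_NO_FLUSH) must return Z_STREAM_END, i.e. the whole
 *   page has to come out of that one call.
 */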
2829 static void *do_data_decompress(void *opaque)
2831 DecompressParam *param = opaque;
2832 unsigned long pagesize;
2833 uint8_t *des;
2834 int len, ret;
2836 qemu_mutex_lock(&param->mutex);
2837 while (!param->quit) {
2838 if (param->des) {
2839 des = param->des;
2840 len = param->len;
2841 param->des = 0;
2842 qemu_mutex_unlock(&param->mutex);
2844 pagesize = TARGET_PAGE_SIZE;
2846 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2847 param->compbuf, len);
2848 if (ret < 0) {
2849 error_report("decompress data failed");
2850 qemu_file_set_error(decomp_file, ret);
2853 qemu_mutex_lock(&decomp_done_lock);
2854 param->done = true;
2855 qemu_cond_signal(&decomp_done_cond);
2856 qemu_mutex_unlock(&decomp_done_lock);
2858 qemu_mutex_lock(&param->mutex);
2859 } else {
2860 qemu_cond_wait(&param->cond, &param->mutex);
2863 qemu_mutex_unlock(&param->mutex);
2865 return NULL;
2868 static int wait_for_decompress_done(void)
2870 int idx, thread_count;
2872 if (!migrate_use_compression()) {
2873 return 0;
2876 thread_count = migrate_decompress_threads();
2877 qemu_mutex_lock(&decomp_done_lock);
2878 for (idx = 0; idx < thread_count; idx++) {
2879 while (!decomp_param[idx].done) {
2880 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2883 qemu_mutex_unlock(&decomp_done_lock);
2884 return qemu_file_get_error(decomp_file);
2887 static void compress_threads_load_cleanup(void)
2889 int i, thread_count;
2891 if (!migrate_use_compression()) {
2892 return;
2894 thread_count = migrate_decompress_threads();
2895 for (i = 0; i < thread_count; i++) {
2897 * we use it as an indicator of whether the thread is
2898 * properly initialized or not
2900 if (!decomp_param[i].compbuf) {
2901 break;
2904 qemu_mutex_lock(&decomp_param[i].mutex);
2905 decomp_param[i].quit = true;
2906 qemu_cond_signal(&decomp_param[i].cond);
2907 qemu_mutex_unlock(&decomp_param[i].mutex);
2909 for (i = 0; i < thread_count; i++) {
2910 if (!decomp_param[i].compbuf) {
2911 break;
2914 qemu_thread_join(decompress_threads + i);
2915 qemu_mutex_destroy(&decomp_param[i].mutex);
2916 qemu_cond_destroy(&decomp_param[i].cond);
2917 inflateEnd(&decomp_param[i].stream);
2918 g_free(decomp_param[i].compbuf);
2919 decomp_param[i].compbuf = NULL;
2921 g_free(decompress_threads);
2922 g_free(decomp_param);
2923 decompress_threads = NULL;
2924 decomp_param = NULL;
2925 decomp_file = NULL;
2928 static int compress_threads_load_setup(QEMUFile *f)
2930 int i, thread_count;
2932 if (!migrate_use_compression()) {
2933 return 0;
2936 thread_count = migrate_decompress_threads();
2937 decompress_threads = g_new0(QemuThread, thread_count);
2938 decomp_param = g_new0(DecompressParam, thread_count);
2939 qemu_mutex_init(&decomp_done_lock);
2940 qemu_cond_init(&decomp_done_cond);
2941 decomp_file = f;
2942 for (i = 0; i < thread_count; i++) {
2943 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2944 goto exit;
2947 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2948 qemu_mutex_init(&decomp_param[i].mutex);
2949 qemu_cond_init(&decomp_param[i].cond);
2950 decomp_param[i].done = true;
2951 decomp_param[i].quit = false;
2952 qemu_thread_create(decompress_threads + i, "decompress",
2953 do_data_decompress, decomp_param + i,
2954 QEMU_THREAD_JOINABLE);
2956 return 0;
2957 exit:
2958 compress_threads_load_cleanup();
2959 return -1;
2962 static void decompress_data_with_multi_threads(QEMUFile *f,
2963 void *host, int len)
2965 int idx, thread_count;
2967 thread_count = migrate_decompress_threads();
2968 qemu_mutex_lock(&decomp_done_lock);
2969 while (true) {
2970 for (idx = 0; idx < thread_count; idx++) {
2971 if (decomp_param[idx].done) {
2972 decomp_param[idx].done = false;
2973 qemu_mutex_lock(&decomp_param[idx].mutex);
2974 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2975 decomp_param[idx].des = host;
2976 decomp_param[idx].len = len;
2977 qemu_cond_signal(&decomp_param[idx].cond);
2978 qemu_mutex_unlock(&decomp_param[idx].mutex);
2979 break;
2982 if (idx < thread_count) {
2983 break;
2984 } else {
2985 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2988 qemu_mutex_unlock(&decomp_done_lock);
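/*
 * Hand-off protocol between this function and do_data_decompress() above
 * (described for illustration): the feeder marks an idle thread busy
 * (done = false), copies the compressed bytes into that thread's compbuf
 * and sets des/len under param->mutex, then signals param->cond; the
 * worker inflates into 'des' and, once finished, sets done = true under
 * decomp_done_lock and signals decomp_done_cond, which is what this
 * function (when no thread is idle) and wait_for_decompress_done() block
 * on.
 */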
2992 * ram_load_setup: Setup RAM for migration incoming side
2994 * Returns zero to indicate success and negative for error
2996 * @f: QEMUFile where to receive the data
2997 * @opaque: RAMState pointer
2999 static int ram_load_setup(QEMUFile *f, void *opaque)
3001 if (compress_threads_load_setup(f)) {
3002 return -1;
3005 xbzrle_load_setup();
3006 ramblock_recv_map_init();
3007 return 0;
3010 static int ram_load_cleanup(void *opaque)
3012 RAMBlock *rb;
3013 xbzrle_load_cleanup();
3014 compress_threads_load_cleanup();
3016 RAMBLOCK_FOREACH(rb) {
3017 g_free(rb->receivedmap);
3018 rb->receivedmap = NULL;
3020 return 0;
3024 * ram_postcopy_incoming_init: allocate postcopy data structures
3026 * Returns 0 for success and negative if there was one error
3028 * @mis: current migration incoming state
3030 * Allocate data structures etc needed by incoming migration with
3031 * postcopy-ram. postcopy-ram's similarly named
3032 * postcopy_ram_incoming_init does the work.
3034 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3036 unsigned long ram_pages = last_ram_page();
3038 return postcopy_ram_incoming_init(mis, ram_pages);
3042 * ram_load_postcopy: load a page in postcopy case
3044 * Returns 0 for success or -errno in case of error
3046 * Called in postcopy mode by ram_load().
3047 * rcu_read_lock is taken prior to this being called.
3049 * @f: QEMUFile where to send the data
3051 static int ram_load_postcopy(QEMUFile *f)
3053 int flags = 0, ret = 0;
3054 bool place_needed = false;
3055 bool matching_page_sizes = false;
3056 MigrationIncomingState *mis = migration_incoming_get_current();
3057 /* Temporary page that is later 'placed' */
3058 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3059 void *last_host = NULL;
3060 bool all_zero = false;
3062 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3063 ram_addr_t addr;
3064 void *host = NULL;
3065 void *page_buffer = NULL;
3066 void *place_source = NULL;
3067 RAMBlock *block = NULL;
3068 uint8_t ch;
3070 addr = qemu_get_be64(f);
3073 * If there is a QEMUFile error, we should stop here; "addr"
3074 * may be invalid
3076 ret = qemu_file_get_error(f);
3077 if (ret) {
3078 break;
3081 flags = addr & ~TARGET_PAGE_MASK;
3082 addr &= TARGET_PAGE_MASK;
3084 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3085 place_needed = false;
3086 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3087 block = ram_block_from_stream(f, flags);
3089 host = host_from_ram_block_offset(block, addr);
3090 if (!host) {
3091 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3092 ret = -EINVAL;
3093 break;
3095 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3097 * Postcopy requires that we place whole host pages atomically;
3098 * these may be huge pages for RAMBlocks that are backed by
3099 * hugetlbfs.
3100 * To make it atomic, the data is read into a temporary page
3101 * that's moved into place later.
3102 * The migration protocol uses (possibly smaller) target pages;
3103 * however, the source ensures it always sends all the components
3104 * of a host page in order.
3106 page_buffer = postcopy_host_page +
3107 ((uintptr_t)host & (block->page_size - 1));
3108 /* If all TP are zero then we can optimise the place */
3109 if (!((uintptr_t)host & (block->page_size - 1))) {
3110 all_zero = true;
3111 } else {
3112 /* not the 1st TP within the HP */
3113 if (host != (last_host + TARGET_PAGE_SIZE)) {
3114 error_report("Non-sequential target page %p/%p",
3115 host, last_host);
3116 ret = -EINVAL;
3117 break;
3123 * If it's the last part of a host page then we place the host
3124 * page
3126 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3127 (block->page_size - 1)) == 0;
3128 place_source = postcopy_host_page;
3130 last_host = host;
3132 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3133 case RAM_SAVE_FLAG_ZERO:
3134 ch = qemu_get_byte(f);
3135 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3136 if (ch) {
3137 all_zero = false;
3139 break;
3141 case RAM_SAVE_FLAG_PAGE:
3142 all_zero = false;
3143 if (!place_needed || !matching_page_sizes) {
3144 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3145 } else {
3146 /* Avoids the qemu_file copy during postcopy, which is
3147 * going to do a copy later; can only do it when we
3148 * do this read in one go (matching page sizes)
3150 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3151 TARGET_PAGE_SIZE);
3153 break;
3154 case RAM_SAVE_FLAG_EOS:
3155 /* normal exit */
3156 break;
3157 default:
3158 error_report("Unknown combination of migration flags: %#x"
3159 " (postcopy mode)", flags);
3160 ret = -EINVAL;
3161 break;
3164 /* Detect for any possible file errors */
3165 if (!ret && qemu_file_get_error(f)) {
3166 ret = qemu_file_get_error(f);
3169 if (!ret && place_needed) {
3170 /* This gets called at the last target page in the host page */
3171 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3173 if (all_zero) {
3174 ret = postcopy_place_page_zero(mis, place_dest,
3175 block);
3176 } else {
3177 ret = postcopy_place_page(mis, place_dest,
3178 place_source, block);
3183 return ret;
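/*
 * Example of the host-page assembly above (2MB hugetlbfs pages and 4KB
 * target pages assumed): 512 consecutive target pages are accumulated in
 * postcopy_host_page at their natural offsets; only when the last one of
 * the host page arrives does place_needed become true, and the whole 2MB
 * page is handed to postcopy_place_page() (or postcopy_place_page_zero()
 * if every target page in it was zero) in one atomic operation.
 */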
3186 static bool postcopy_is_advised(void)
3188 PostcopyState ps = postcopy_state_get();
3189 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3192 static bool postcopy_is_running(void)
3194 PostcopyState ps = postcopy_state_get();
3195 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3198 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3200 int flags = 0, ret = 0, invalid_flags = 0;
3201 static uint64_t seq_iter;
3202 int len = 0;
3204 * If the system is running in postcopy mode, page inserts into host memory must
3205 * be atomic
3207 bool postcopy_running = postcopy_is_running();
3209 /* ADVISE is earlier; it shows the source has the postcopy capability on */
3209 bool postcopy_advised = postcopy_is_advised();
3211 seq_iter++;
3213 if (version_id != 4) {
3214 ret = -EINVAL;
3217 if (!migrate_use_compression()) {
3218 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3220 /* This RCU critical section can be very long running.
3221 * When RCU reclaims in the code start to become numerous,
3222 * it will be necessary to reduce the granularity of this
3223 * critical section.
3225 rcu_read_lock();
3227 if (postcopy_running) {
3228 ret = ram_load_postcopy(f);
3231 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3232 ram_addr_t addr, total_ram_bytes;
3233 void *host = NULL;
3234 uint8_t ch;
3236 addr = qemu_get_be64(f);
3237 flags = addr & ~TARGET_PAGE_MASK;
3238 addr &= TARGET_PAGE_MASK;
3240 if (flags & invalid_flags) {
3241 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3242 error_report("Received an unexpected compressed page");
3245 ret = -EINVAL;
3246 break;
3249 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3250 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3251 RAMBlock *block = ram_block_from_stream(f, flags);
3253 host = host_from_ram_block_offset(block, addr);
3254 if (!host) {
3255 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3256 ret = -EINVAL;
3257 break;
3259 ramblock_recv_bitmap_set(block, host);
3260 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3263 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3264 case RAM_SAVE_FLAG_MEM_SIZE:
3265 /* Synchronize RAM block list */
3266 total_ram_bytes = addr;
3267 while (!ret && total_ram_bytes) {
3268 RAMBlock *block;
3269 char id[256];
3270 ram_addr_t length;
3272 len = qemu_get_byte(f);
3273 qemu_get_buffer(f, (uint8_t *)id, len);
3274 id[len] = 0;
3275 length = qemu_get_be64(f);
3277 block = qemu_ram_block_by_name(id);
3278 if (block) {
3279 if (length != block->used_length) {
3280 Error *local_err = NULL;
3282 ret = qemu_ram_resize(block, length,
3283 &local_err);
3284 if (local_err) {
3285 error_report_err(local_err);
3288 /* For postcopy we need to check hugepage sizes match */
3289 if (postcopy_advised &&
3290 block->page_size != qemu_host_page_size) {
3291 uint64_t remote_page_size = qemu_get_be64(f);
3292 if (remote_page_size != block->page_size) {
3293 error_report("Mismatched RAM page size %s "
3294 "(local) %zd != %" PRId64,
3295 id, block->page_size,
3296 remote_page_size);
3297 ret = -EINVAL;
3300 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3301 block->idstr);
3302 } else {
3303 error_report("Unknown ramblock \"%s\", cannot "
3304 "accept migration", id);
3305 ret = -EINVAL;
3308 total_ram_bytes -= length;
3310 break;
3312 case RAM_SAVE_FLAG_ZERO:
3313 ch = qemu_get_byte(f);
3314 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3315 break;
3317 case RAM_SAVE_FLAG_PAGE:
3318 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3319 break;
3321 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3322 len = qemu_get_be32(f);
3323 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3324 error_report("Invalid compressed data length: %d", len);
3325 ret = -EINVAL;
3326 break;
3328 decompress_data_with_multi_threads(f, host, len);
3329 break;
3331 case RAM_SAVE_FLAG_XBZRLE:
3332 if (load_xbzrle(f, addr, host) < 0) {
3333 error_report("Failed to decompress XBZRLE page at "
3334 RAM_ADDR_FMT, addr);
3335 ret = -EINVAL;
3336 break;
3338 break;
3339 case RAM_SAVE_FLAG_EOS:
3340 /* normal exit */
3341 break;
3342 default:
3343 if (flags & RAM_SAVE_FLAG_HOOK) {
3344 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3345 } else {
3346 error_report("Unknown combination of migration flags: %#x",
3347 flags);
3348 ret = -EINVAL;
3351 if (!ret) {
3352 ret = qemu_file_get_error(f);
3356 ret |= wait_for_decompress_done();
3357 rcu_read_unlock();
3358 trace_ram_load_complete(ret, seq_iter);
3359 return ret;
3362 static bool ram_has_postcopy(void *opaque)
3364 return migrate_postcopy_ram();
3368 * Read the received bitmap, invert it as the initial dirty bitmap.
3369 * This is only used when the postcopy migration is paused but wants
3370 * to resume from a middle point.
3372 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3374 int ret = -EINVAL;
3375 QEMUFile *file = s->rp_state.from_dst_file;
3376 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3377 uint64_t local_size = nbits / 8;
3378 uint64_t size, end_mark;
3380 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3382 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3383 error_report("%s: incorrect state %s", __func__,
3384 MigrationStatus_str(s->state));
3385 return -EINVAL;
3389 * Note: see comments in ramblock_recv_bitmap_send() on why we
3390 * need the endianness conversion, and the padding.
3392 local_size = ROUND_UP(local_size, 8);
3394 /* Add padding */
3395 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3397 size = qemu_get_be64(file);
3399 /* The size of the bitmap should match with our ramblock */
3400 if (size != local_size) {
3401 error_report("%s: ramblock '%s' bitmap size mismatch "
3402 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3403 block->idstr, size, local_size);
3404 ret = -EINVAL;
3405 goto out;
3408 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3409 end_mark = qemu_get_be64(file);
3411 ret = qemu_file_get_error(file);
3412 if (ret || size != local_size) {
3413 error_report("%s: read bitmap failed for ramblock '%s': %d"
3414 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3415 __func__, block->idstr, ret, local_size, size);
3416 ret = -EIO;
3417 goto out;
3420 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3421 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3422 __func__, block->idstr, end_mark);
3423 ret = -EINVAL;
3424 goto out;
3428 * Endianness conversion. We are in postcopy (though paused).
3429 * The dirty bitmap won't change. We can directly modify it.
3431 bitmap_from_le(block->bmap, le_bitmap, nbits);
3434 * What we received is "received bitmap". Invert it as the initial
3435 * dirty bitmap for this ramblock.
3437 bitmap_complement(block->bmap, block->bmap, nbits);
3439 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3441 ret = 0;
3442 out:
3443 g_free(le_bitmap);
3444 return ret;
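/*
 * Size bookkeeping above, worked through for illustration (1GB block and
 * 4KB target pages assumed): nbits == 262144, so local_size == 32768 bytes
 * (already a multiple of 8 here; otherwise ROUND_UP pads it).  The stream
 * carries a be64 size that must equal local_size, then the little-endian
 * bitmap bytes, then a be64 RAMBLOCK_RECV_BITMAP_ENDING marker; le_bitmap
 * is allocated with BITS_PER_LONG spare bits so the padded read cannot
 * overrun it.
 */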
3447 static SaveVMHandlers savevm_ram_handlers = {
3448 .save_setup = ram_save_setup,
3449 .save_live_iterate = ram_save_iterate,
3450 .save_live_complete_postcopy = ram_save_complete,
3451 .save_live_complete_precopy = ram_save_complete,
3452 .has_postcopy = ram_has_postcopy,
3453 .save_live_pending = ram_save_pending,
3454 .load_state = ram_load,
3455 .save_cleanup = ram_save_cleanup,
3456 .load_setup = ram_load_setup,
3457 .load_cleanup = ram_load_cleanup,
3460 void ram_mig_init(void)
3462 qemu_mutex_init(&XBZRLE.lock);
3463 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);