[qemu.git] / migration / ram.c
blob 912810c18e0fdc17f20a52ff8edcd47f4a96bbf4
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "migration/block.h"
55 /***********************************************************/
56 /* ram save/restore */
58 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
59 * worked for pages that were filled with the same char. We switched
60 * it to only search for the zero value, and to avoid confusion with
61 * RAM_SAVE_FLAG_COMPRESS_PAGE we renamed it.
64 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
65 #define RAM_SAVE_FLAG_ZERO 0x02
66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
67 #define RAM_SAVE_FLAG_PAGE 0x08
68 #define RAM_SAVE_FLAG_EOS 0x10
69 #define RAM_SAVE_FLAG_CONTINUE 0x20
70 #define RAM_SAVE_FLAG_XBZRLE 0x40
71 /* 0x80 is reserved in migration.h start with 0x100 next */
72 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
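/*
 * Illustration: these flags travel in the low bits of the page offset that
 * save_page_header() writes as a single be64. For example, a normal page at
 * page-aligned offset 0x2000, sent from the same RAMBlock as the previous
 * page, is encoded as
 *     0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE  ==  0x2028
 */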
74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
76 return buffer_is_zero(p, size);
79 XBZRLECacheStats xbzrle_counters;
81 /* This struct contains the XBZRLE cache and a static page
82 used by the compression */
83 static struct {
84 /* buffer used for XBZRLE encoding */
85 uint8_t *encoded_buf;
86 /* buffer for storing page content */
87 uint8_t *current_buf;
88 /* Cache for XBZRLE, Protected by lock. */
89 PageCache *cache;
90 QemuMutex lock;
91 /* it will store a page full of zeros */
92 uint8_t *zero_target_page;
93 /* buffer used for XBZRLE decoding */
94 uint8_t *decoded_buf;
95 } XBZRLE;
97 static void XBZRLE_cache_lock(void)
99 if (migrate_use_xbzrle())
100 qemu_mutex_lock(&XBZRLE.lock);
103 static void XBZRLE_cache_unlock(void)
105 if (migrate_use_xbzrle())
106 qemu_mutex_unlock(&XBZRLE.lock);
110 * xbzrle_cache_resize: resize the xbzrle cache
112 * This function is called from qmp_migrate_set_cache_size in the main
113 * thread, possibly while a migration is in progress. A running
114 * migration may be using the cache and might finish during this call,
115 * hence changes to the cache are protected by XBZRLE.lock().
117 * Returns 0 for success or -1 for error
119 * @new_size: new cache size
120 * @errp: set *errp if the check failed, with reason
122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
124 PageCache *new_cache;
125 int64_t ret = 0;
127 /* Check for truncation */
128 if (new_size != (size_t)new_size) {
129 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
130 "exceeding address space");
131 return -1;
134 if (new_size == migrate_xbzrle_cache_size()) {
135 /* nothing to do */
136 return 0;
139 XBZRLE_cache_lock();
141 if (XBZRLE.cache != NULL) {
142 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
143 if (!new_cache) {
144 ret = -1;
145 goto out;
148 cache_fini(XBZRLE.cache);
149 XBZRLE.cache = new_cache;
151 out:
152 XBZRLE_cache_unlock();
153 return ret;
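/*
 * Usage sketch (hypothetical caller, size chosen only for illustration):
 *
 *     Error *err = NULL;
 *     if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * A migration that is already running keeps working across the resize
 * because the PageCache is only swapped while holding XBZRLE.lock.
 */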
156 static void ramblock_recv_map_init(void)
158 RAMBlock *rb;
160 RAMBLOCK_FOREACH(rb) {
161 assert(!rb->receivedmap);
162 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
168 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
169 rb->receivedmap);
172 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
174 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
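/*
 * Example: receivedmap is indexed by target-page number within the block,
 * so with 4 KiB target pages (TARGET_PAGE_BITS == 12) a byte_offset of
 * 0x5000 maps to bit 5. The host_addr-based helpers derive the same index
 * from the address's offset within the RAMBlock.
 */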
177 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
179 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
182 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
183 size_t nr)
185 bitmap_set_atomic(rb->receivedmap,
186 ramblock_recv_bitmap_offset(host_addr, rb),
187 nr);
191 * An outstanding page request, on the source, having been received
192 * and queued
194 struct RAMSrcPageRequest {
195 RAMBlock *rb;
196 hwaddr offset;
197 hwaddr len;
199 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
202 /* State of RAM for migration */
203 struct RAMState {
204 /* QEMUFile used for this migration */
205 QEMUFile *f;
206 /* Last block that we have visited searching for dirty pages */
207 RAMBlock *last_seen_block;
208 /* Last block from where we have sent data */
209 RAMBlock *last_sent_block;
210 /* Last dirty target page we have sent */
211 ram_addr_t last_page;
212 /* last ram version we have seen */
213 uint32_t last_version;
214 /* We are in the first round */
215 bool ram_bulk_stage;
216 /* How many times we have dirty too many pages */
217 int dirty_rate_high_cnt;
218 /* these variables are used for bitmap sync */
219 /* last time we did a full bitmap_sync */
220 int64_t time_last_bitmap_sync;
221 /* bytes transferred at start_time */
222 uint64_t bytes_xfer_prev;
223 /* number of dirty pages since start_time */
224 uint64_t num_dirty_pages_period;
225 /* xbzrle misses since the beginning of the period */
226 uint64_t xbzrle_cache_miss_prev;
227 /* number of iterations at the beginning of period */
228 uint64_t iterations_prev;
229 /* Iterations since start */
230 uint64_t iterations;
231 /* number of dirty bits in the bitmap */
232 uint64_t migration_dirty_pages;
233 /* protects modification of the bitmap */
234 QemuMutex bitmap_mutex;
235 /* The RAMBlock used in the last src_page_requests */
236 RAMBlock *last_req_rb;
237 /* Queue of outstanding page requests from the destination */
238 QemuMutex src_page_req_mutex;
239 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
241 typedef struct RAMState RAMState;
243 static RAMState *ram_state;
245 uint64_t ram_bytes_remaining(void)
247 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
251 MigrationStats ram_counters;
253 /* used by the search for pages to send */
254 struct PageSearchStatus {
255 /* Current block being searched */
256 RAMBlock *block;
257 /* Current page to search from */
258 unsigned long page;
259 /* Set once we wrap around */
260 bool complete_round;
262 typedef struct PageSearchStatus PageSearchStatus;
264 struct CompressParam {
265 bool done;
266 bool quit;
267 QEMUFile *file;
268 QemuMutex mutex;
269 QemuCond cond;
270 RAMBlock *block;
271 ram_addr_t offset;
273 /* internally used fields */
274 z_stream stream;
275 uint8_t *originbuf;
277 typedef struct CompressParam CompressParam;
279 struct DecompressParam {
280 bool done;
281 bool quit;
282 QemuMutex mutex;
283 QemuCond cond;
284 void *des;
285 uint8_t *compbuf;
286 int len;
287 z_stream stream;
289 typedef struct DecompressParam DecompressParam;
291 static CompressParam *comp_param;
292 static QemuThread *compress_threads;
293 /* comp_done_cond is used to wake up the migration thread when
294 * one of the compression threads has finished the compression.
295 * comp_done_lock is used to co-work with comp_done_cond.
297 static QemuMutex comp_done_lock;
298 static QemuCond comp_done_cond;
299 /* The empty QEMUFileOps will be used by file in CompressParam */
300 static const QEMUFileOps empty_ops = { };
302 static QEMUFile *decomp_file;
303 static DecompressParam *decomp_param;
304 static QemuThread *decompress_threads;
305 static QemuMutex decomp_done_lock;
306 static QemuCond decomp_done_cond;
308 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
309 ram_addr_t offset, uint8_t *source_buf);
311 static void *do_data_compress(void *opaque)
313 CompressParam *param = opaque;
314 RAMBlock *block;
315 ram_addr_t offset;
317 qemu_mutex_lock(&param->mutex);
318 while (!param->quit) {
319 if (param->block) {
320 block = param->block;
321 offset = param->offset;
322 param->block = NULL;
323 qemu_mutex_unlock(&param->mutex);
325 do_compress_ram_page(param->file, &param->stream, block, offset,
326 param->originbuf);
328 qemu_mutex_lock(&comp_done_lock);
329 param->done = true;
330 qemu_cond_signal(&comp_done_cond);
331 qemu_mutex_unlock(&comp_done_lock);
333 qemu_mutex_lock(&param->mutex);
334 } else {
335 qemu_cond_wait(&param->cond, &param->mutex);
338 qemu_mutex_unlock(&param->mutex);
340 return NULL;
343 static inline void terminate_compression_threads(void)
345 int idx, thread_count;
347 thread_count = migrate_compress_threads();
349 for (idx = 0; idx < thread_count; idx++) {
350 qemu_mutex_lock(&comp_param[idx].mutex);
351 comp_param[idx].quit = true;
352 qemu_cond_signal(&comp_param[idx].cond);
353 qemu_mutex_unlock(&comp_param[idx].mutex);
357 static void compress_threads_save_cleanup(void)
359 int i, thread_count;
361 if (!migrate_use_compression()) {
362 return;
364 terminate_compression_threads();
365 thread_count = migrate_compress_threads();
366 for (i = 0; i < thread_count; i++) {
368 * we use it as an indicator of whether the thread is
369 * properly initialized or not
371 if (!comp_param[i].file) {
372 break;
374 qemu_thread_join(compress_threads + i);
375 qemu_mutex_destroy(&comp_param[i].mutex);
376 qemu_cond_destroy(&comp_param[i].cond);
377 deflateEnd(&comp_param[i].stream);
378 g_free(comp_param[i].originbuf);
379 qemu_fclose(comp_param[i].file);
380 comp_param[i].file = NULL;
382 qemu_mutex_destroy(&comp_done_lock);
383 qemu_cond_destroy(&comp_done_cond);
384 g_free(compress_threads);
385 g_free(comp_param);
386 compress_threads = NULL;
387 comp_param = NULL;
390 static int compress_threads_save_setup(void)
392 int i, thread_count;
394 if (!migrate_use_compression()) {
395 return 0;
397 thread_count = migrate_compress_threads();
398 compress_threads = g_new0(QemuThread, thread_count);
399 comp_param = g_new0(CompressParam, thread_count);
400 qemu_cond_init(&comp_done_cond);
401 qemu_mutex_init(&comp_done_lock);
402 for (i = 0; i < thread_count; i++) {
403 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
404 if (!comp_param[i].originbuf) {
405 goto exit;
408 if (deflateInit(&comp_param[i].stream,
409 migrate_compress_level()) != Z_OK) {
410 g_free(comp_param[i].originbuf);
411 goto exit;
414 /* comp_param[i].file is just used as a dummy buffer to save data,
415 * set its ops to empty.
417 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
418 comp_param[i].done = true;
419 comp_param[i].quit = false;
420 qemu_mutex_init(&comp_param[i].mutex);
421 qemu_cond_init(&comp_param[i].cond);
422 qemu_thread_create(compress_threads + i, "compress",
423 do_data_compress, comp_param + i,
424 QEMU_THREAD_JOINABLE);
426 return 0;
428 exit:
429 compress_threads_save_cleanup();
430 return -1;
433 /* Multiple fd's */
435 struct MultiFDSendParams {
436 uint8_t id;
437 char *name;
438 QemuThread thread;
439 QemuSemaphore sem;
440 QemuMutex mutex;
441 bool quit;
443 typedef struct MultiFDSendParams MultiFDSendParams;
445 struct {
446 MultiFDSendParams *params;
447 /* number of created threads */
448 int count;
449 } *multifd_send_state;
451 static void terminate_multifd_send_threads(Error *errp)
453 int i;
455 for (i = 0; i < multifd_send_state->count; i++) {
456 MultiFDSendParams *p = &multifd_send_state->params[i];
458 qemu_mutex_lock(&p->mutex);
459 p->quit = true;
460 qemu_sem_post(&p->sem);
461 qemu_mutex_unlock(&p->mutex);
465 int multifd_save_cleanup(Error **errp)
467 int i;
468 int ret = 0;
470 if (!migrate_use_multifd()) {
471 return 0;
473 terminate_multifd_send_threads(NULL);
474 for (i = 0; i < multifd_send_state->count; i++) {
475 MultiFDSendParams *p = &multifd_send_state->params[i];
477 qemu_thread_join(&p->thread);
478 qemu_mutex_destroy(&p->mutex);
479 qemu_sem_destroy(&p->sem);
480 g_free(p->name);
481 p->name = NULL;
483 g_free(multifd_send_state->params);
484 multifd_send_state->params = NULL;
485 g_free(multifd_send_state);
486 multifd_send_state = NULL;
487 return ret;
490 static void *multifd_send_thread(void *opaque)
492 MultiFDSendParams *p = opaque;
494 while (true) {
495 qemu_mutex_lock(&p->mutex);
496 if (p->quit) {
497 qemu_mutex_unlock(&p->mutex);
498 break;
500 qemu_mutex_unlock(&p->mutex);
501 qemu_sem_wait(&p->sem);
504 return NULL;
507 int multifd_save_setup(void)
509 int thread_count;
510 uint8_t i;
512 if (!migrate_use_multifd()) {
513 return 0;
515 thread_count = migrate_multifd_channels();
516 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
517 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
518 multifd_send_state->count = 0;
519 for (i = 0; i < thread_count; i++) {
520 MultiFDSendParams *p = &multifd_send_state->params[i];
522 qemu_mutex_init(&p->mutex);
523 qemu_sem_init(&p->sem, 0);
524 p->quit = false;
525 p->id = i;
526 p->name = g_strdup_printf("multifdsend_%d", i);
527 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
528 QEMU_THREAD_JOINABLE);
530 multifd_send_state->count++;
532 return 0;
535 struct MultiFDRecvParams {
536 uint8_t id;
537 char *name;
538 QemuThread thread;
539 QemuSemaphore sem;
540 QemuMutex mutex;
541 bool quit;
543 typedef struct MultiFDRecvParams MultiFDRecvParams;
545 struct {
546 MultiFDRecvParams *params;
547 /* number of created threads */
548 int count;
549 } *multifd_recv_state;
551 static void terminate_multifd_recv_threads(Error *errp)
553 int i;
555 for (i = 0; i < multifd_recv_state->count; i++) {
556 MultiFDRecvParams *p = &multifd_recv_state->params[i];
558 qemu_mutex_lock(&p->mutex);
559 p->quit = true;
560 qemu_sem_post(&p->sem);
561 qemu_mutex_unlock(&p->mutex);
565 int multifd_load_cleanup(Error **errp)
567 int i;
568 int ret = 0;
570 if (!migrate_use_multifd()) {
571 return 0;
573 terminate_multifd_recv_threads(NULL);
574 for (i = 0; i < multifd_recv_state->count; i++) {
575 MultiFDRecvParams *p = &multifd_recv_state->params[i];
577 qemu_thread_join(&p->thread);
578 qemu_mutex_destroy(&p->mutex);
579 qemu_sem_destroy(&p->sem);
580 g_free(p->name);
581 p->name = NULL;
583 g_free(multifd_recv_state->params);
584 multifd_recv_state->params = NULL;
585 g_free(multifd_recv_state);
586 multifd_recv_state = NULL;
588 return ret;
591 static void *multifd_recv_thread(void *opaque)
593 MultiFDRecvParams *p = opaque;
595 while (true) {
596 qemu_mutex_lock(&p->mutex);
597 if (p->quit) {
598 qemu_mutex_unlock(&p->mutex);
599 break;
601 qemu_mutex_unlock(&p->mutex);
602 qemu_sem_wait(&p->sem);
605 return NULL;
608 int multifd_load_setup(void)
610 int thread_count;
611 uint8_t i;
613 if (!migrate_use_multifd()) {
614 return 0;
616 thread_count = migrate_multifd_channels();
617 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
618 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
619 multifd_recv_state->count = 0;
620 for (i = 0; i < thread_count; i++) {
621 MultiFDRecvParams *p = &multifd_recv_state->params[i];
623 qemu_mutex_init(&p->mutex);
624 qemu_sem_init(&p->sem, 0);
625 p->quit = false;
626 p->id = i;
627 p->name = g_strdup_printf("multifdrecv_%d", i);
628 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
629 QEMU_THREAD_JOINABLE);
630 multifd_recv_state->count++;
632 return 0;
636 * save_page_header: write page header to wire
638 * If this is the 1st block, it also writes the block identification
640 * Returns the number of bytes written
642 * @f: QEMUFile where to send the data
643 * @block: block that contains the page we want to send
644 * @offset: offset inside the block for the page
645 * in the lower bits, it contains flags
647 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
648 ram_addr_t offset)
650 size_t size, len;
652 if (block == rs->last_sent_block) {
653 offset |= RAM_SAVE_FLAG_CONTINUE;
655 qemu_put_be64(f, offset);
656 size = 8;
658 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
659 len = strlen(block->idstr);
660 qemu_put_byte(f, len);
661 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
662 size += 1 + len;
663 rs->last_sent_block = block;
665 return size;
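/*
 * Wire layout produced above, as a sketch:
 *
 *     be64   offset | RAM_SAVE_FLAG_*      (always, 8 bytes)
 *     u8     strlen(block->idstr)          (only without FLAG_CONTINUE)
 *     bytes  block->idstr                  (only without FLAG_CONTINUE)
 *
 * e.g. the first page of a block named "pc.ram" costs 8 + 1 + 6 = 15 header
 * bytes, while later pages from the same block cost 8 header bytes each.
 */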
669 * mig_throttle_guest_down: throttle down the guest
671 * Reduce amount of guest cpu execution to hopefully slow down memory
672 * writes. If guest dirty memory rate is reduced below the rate at
673 * which we can transfer pages to the destination then we should be
674 * able to complete migration. Some workloads dirty memory way too
675 * fast and will not effectively converge, even with auto-converge.
677 static void mig_throttle_guest_down(void)
679 MigrationState *s = migrate_get_current();
680 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
681 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
683 /* We have not started throttling yet. Let's start it. */
684 if (!cpu_throttle_active()) {
685 cpu_throttle_set(pct_initial);
686 } else {
687 /* Throttling already on, just increase the rate */
688 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
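/*
 * Example, assuming the usual defaults of cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10: the first call sets the throttle percentage
 * to 20 (vCPUs are forced to sleep roughly 20% of the time), and every
 * further call while the dirty rate stays too high adds another 10 points:
 * 30, 40, ... until migration can converge.
 */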
693 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
695 * @rs: current RAM state
696 * @current_addr: address for the zero page
698 * Update the xbzrle cache to reflect a page that's been sent as all 0.
699 * The important thing is that a stale (not-yet-0'd) page be replaced
700 * by the new data.
701 * As a bonus, if the page wasn't in the cache it gets added so that
702 * when a small write is made into the 0'd page it gets XBZRLE sent.
704 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
706 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
707 return;
710 /* We don't care if this fails to allocate a new cache page
711 * as long as it updated an old one */
712 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
713 ram_counters.dirty_sync_count);
716 #define ENCODING_FLAG_XBZRLE 0x1
719 * save_xbzrle_page: compress and send current page
721 * Returns: 1 means that we wrote the page
722 * 0 means that page is identical to the one already sent
723 * -1 means that xbzrle would be longer than normal
725 * @rs: current RAM state
726 * @current_data: pointer to the address of the page contents
727 * @current_addr: addr of the page
728 * @block: block that contains the page we want to send
729 * @offset: offset inside the block for the page
730 * @last_stage: if we are at the completion stage
732 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
733 ram_addr_t current_addr, RAMBlock *block,
734 ram_addr_t offset, bool last_stage)
736 int encoded_len = 0, bytes_xbzrle;
737 uint8_t *prev_cached_page;
739 if (!cache_is_cached(XBZRLE.cache, current_addr,
740 ram_counters.dirty_sync_count)) {
741 xbzrle_counters.cache_miss++;
742 if (!last_stage) {
743 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
744 ram_counters.dirty_sync_count) == -1) {
745 return -1;
746 } else {
747 /* update *current_data when the page has been
748 inserted into cache */
749 *current_data = get_cached_data(XBZRLE.cache, current_addr);
752 return -1;
755 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
757 /* save current buffer into memory */
758 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
760 /* XBZRLE encoding (if there is no overflow) */
761 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
762 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
763 TARGET_PAGE_SIZE);
764 if (encoded_len == 0) {
765 trace_save_xbzrle_page_skipping();
766 return 0;
767 } else if (encoded_len == -1) {
768 trace_save_xbzrle_page_overflow();
769 xbzrle_counters.overflow++;
770 /* update data in the cache */
771 if (!last_stage) {
772 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
773 *current_data = prev_cached_page;
775 return -1;
778 /* we need to update the data in the cache, in order to get the same data */
779 if (!last_stage) {
780 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
783 /* Send XBZRLE based compressed page */
784 bytes_xbzrle = save_page_header(rs, rs->f, block,
785 offset | RAM_SAVE_FLAG_XBZRLE);
786 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
787 qemu_put_be16(rs->f, encoded_len);
788 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
789 bytes_xbzrle += encoded_len + 1 + 2;
790 xbzrle_counters.pages++;
791 xbzrle_counters.bytes += bytes_xbzrle;
792 ram_counters.transferred += bytes_xbzrle;
794 return 1;
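/*
 * Byte accounting sketch for the XBZRLE path above: a page that encodes to
 * encoded_len bytes costs
 *     save_page_header() + 1 (ENCODING_FLAG_XBZRLE) + 2 (be16 length)
 *                        + encoded_len
 * on the wire, which is what bytes_xbzrle accumulates before being added to
 * xbzrle_counters.bytes and ram_counters.transferred.
 */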
798 * migration_bitmap_find_dirty: find the next dirty page from start
800 * Called with rcu_read_lock() to protect migration_bitmap
802 * Returns the byte offset within memory region of the start of a dirty page
804 * @rs: current RAM state
805 * @rb: RAMBlock where to search for dirty pages
806 * @start: page where we start the search
808 static inline
809 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
810 unsigned long start)
812 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
813 unsigned long *bitmap = rb->bmap;
814 unsigned long next;
816 if (rs->ram_bulk_stage && start > 0) {
817 next = start + 1;
818 } else {
819 next = find_next_bit(bitmap, size, start);
822 return next;
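/*
 * Note on the bulk-stage shortcut above: during the first round every page
 * is still dirty, so for start > 0 the answer is simply start + 1 and the
 * bitmap scan is skipped; otherwise find_next_bit() scans rb->bmap for the
 * next set bit.
 */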
825 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
826 RAMBlock *rb,
827 unsigned long page)
829 bool ret;
831 ret = test_and_clear_bit(page, rb->bmap);
833 if (ret) {
834 rs->migration_dirty_pages--;
836 return ret;
839 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
840 ram_addr_t start, ram_addr_t length)
842 rs->migration_dirty_pages +=
843 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
844 &rs->num_dirty_pages_period);
848 * ram_pagesize_summary: calculate all the pagesizes of a VM
850 * Returns a summary bitmap of the page sizes of all RAMBlocks
852 * For VMs with just normal pages this is equivalent to the host page
853 * size. If it's got some huge pages then it's the OR of all the
854 * different page sizes.
856 uint64_t ram_pagesize_summary(void)
858 RAMBlock *block;
859 uint64_t summary = 0;
861 RAMBLOCK_FOREACH(block) {
862 summary |= block->page_size;
865 return summary;
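/*
 * Example: a guest with ordinary 4 KiB-backed RAM plus one RAMBlock backed
 * by 2 MiB hugepages yields summary == 0x1000 | 0x200000 == 0x201000, while
 * a guest with only 4 KiB pages yields plain 0x1000.
 */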
868 static void migration_bitmap_sync(RAMState *rs)
870 RAMBlock *block;
871 int64_t end_time;
872 uint64_t bytes_xfer_now;
874 ram_counters.dirty_sync_count++;
876 if (!rs->time_last_bitmap_sync) {
877 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
880 trace_migration_bitmap_sync_start();
881 memory_global_dirty_log_sync();
883 qemu_mutex_lock(&rs->bitmap_mutex);
884 rcu_read_lock();
885 RAMBLOCK_FOREACH(block) {
886 migration_bitmap_sync_range(rs, block, 0, block->used_length);
888 rcu_read_unlock();
889 qemu_mutex_unlock(&rs->bitmap_mutex);
891 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
893 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
895 /* more than 1 second = 1000 milliseconds */
896 if (end_time > rs->time_last_bitmap_sync + 1000) {
897 /* calculate period counters */
898 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
899 / (end_time - rs->time_last_bitmap_sync);
900 bytes_xfer_now = ram_counters.transferred;
902 /* During block migration the auto-converge logic incorrectly detects
903 * that ram migration makes no progress. Avoid this by disabling the
904 * throttling logic during the bulk phase of block migration. */
905 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
906 /* The following detection logic can be refined later. For now:
907 check whether the bytes dirtied in this period exceed half of the
908 approximate amount of bytes that just got transferred since the
909 last time we were in this routine. If that happens twice, start
910 or increase throttling */
912 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
913 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
914 (++rs->dirty_rate_high_cnt >= 2)) {
915 trace_migration_throttle();
916 rs->dirty_rate_high_cnt = 0;
917 mig_throttle_guest_down();
921 if (migrate_use_xbzrle()) {
922 if (rs->iterations_prev != rs->iterations) {
923 xbzrle_counters.cache_miss_rate =
924 (double)(xbzrle_counters.cache_miss -
925 rs->xbzrle_cache_miss_prev) /
926 (rs->iterations - rs->iterations_prev);
928 rs->iterations_prev = rs->iterations;
929 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
932 /* reset period counters */
933 rs->time_last_bitmap_sync = end_time;
934 rs->num_dirty_pages_period = 0;
935 rs->bytes_xfer_prev = bytes_xfer_now;
937 if (migrate_use_events()) {
938 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
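/*
 * Worked example for the auto-converge check above (numbers invented for
 * illustration): with 4 KiB target pages, 30000 pages dirtied in a period
 * is ~123 MB; if only 200 MB were transferred in the same period, then
 * 123 MB > 200 MB / 2, so dirty_rate_high_cnt is bumped, and a second such
 * period triggers mig_throttle_guest_down().
 */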
943 * save_zero_page: send the zero page to the stream
945 * Returns the number of pages written.
947 * @rs: current RAM state
948 * @block: block that contains the page we want to send
949 * @offset: offset inside the block for the page
951 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
953 uint8_t *p = block->host + offset;
954 int pages = -1;
956 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
957 ram_counters.duplicate++;
958 ram_counters.transferred +=
959 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
960 qemu_put_byte(rs->f, 0);
961 ram_counters.transferred += 1;
962 pages = 1;
965 return pages;
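/*
 * Wire cost sketch: a zero page costs only its header plus the single zero
 * byte written above, i.e. 9 bytes when RAM_SAVE_FLAG_CONTINUE applies,
 * versus header + TARGET_PAGE_SIZE for a normal page.
 */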
968 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
970 if (!migrate_release_ram() || !migration_in_postcopy()) {
971 return;
974 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
978 * @pages: the number of pages written by the control path,
979 * < 0 - error
980 * > 0 - number of pages written
982 * Return true if the page has been saved, otherwise false.
984 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
985 int *pages)
987 uint64_t bytes_xmit = 0;
988 int ret;
990 *pages = -1;
991 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
992 &bytes_xmit);
993 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
994 return false;
997 if (bytes_xmit) {
998 ram_counters.transferred += bytes_xmit;
999 *pages = 1;
1002 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1003 return true;
1006 if (bytes_xmit > 0) {
1007 ram_counters.normal++;
1008 } else if (bytes_xmit == 0) {
1009 ram_counters.duplicate++;
1012 return true;
1016 * directly send the page to the stream
1018 * Returns the number of pages written.
1020 * @rs: current RAM state
1021 * @block: block that contains the page we want to send
1022 * @offset: offset inside the block for the page
1023 * @buf: the page to be sent
1024 * @async: send the page asynchronously
1026 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1027 uint8_t *buf, bool async)
1029 ram_counters.transferred += save_page_header(rs, rs->f, block,
1030 offset | RAM_SAVE_FLAG_PAGE);
1031 if (async) {
1032 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1033 migrate_release_ram() &
1034 migration_in_postcopy());
1035 } else {
1036 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1038 ram_counters.transferred += TARGET_PAGE_SIZE;
1039 ram_counters.normal++;
1040 return 1;
1044 * ram_save_page: send the given page to the stream
1046 * Returns the number of pages written.
1047 * < 0 - error
1048 * >=0 - Number of pages written - this might legally be 0
1049 * if xbzrle noticed the page was the same.
1051 * @rs: current RAM state
1052 * @block: block that contains the page we want to send
1053 * @offset: offset inside the block for the page
1054 * @last_stage: if we are at the completion stage
1056 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1058 int pages = -1;
1059 uint8_t *p;
1060 bool send_async = true;
1061 RAMBlock *block = pss->block;
1062 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1063 ram_addr_t current_addr = block->offset + offset;
1065 p = block->host + offset;
1066 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1068 XBZRLE_cache_lock();
1069 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1070 migrate_use_xbzrle()) {
1071 pages = save_xbzrle_page(rs, &p, current_addr, block,
1072 offset, last_stage);
1073 if (!last_stage) {
1074 /* Can't send this cached data async, since the cache page
1075 * might get updated before it gets to the wire
1077 send_async = false;
1081 /* XBZRLE overflow or normal page */
1082 if (pages == -1) {
1083 pages = save_normal_page(rs, block, offset, p, send_async);
1086 XBZRLE_cache_unlock();
1088 return pages;
1091 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1092 ram_addr_t offset, uint8_t *source_buf)
1094 RAMState *rs = ram_state;
1095 int bytes_sent, blen;
1096 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1098 bytes_sent = save_page_header(rs, f, block, offset |
1099 RAM_SAVE_FLAG_COMPRESS_PAGE);
1102 * copy it to an internal buffer to avoid it being modified by the VM,
1103 * so that we can catch errors during compression and
1104 * decompression
1106 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1107 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1108 if (blen < 0) {
1109 bytes_sent = 0;
1110 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1111 error_report("compressed data failed!");
1112 } else {
1113 bytes_sent += blen;
1114 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1117 return bytes_sent;
1120 static void flush_compressed_data(RAMState *rs)
1122 int idx, len, thread_count;
1124 if (!migrate_use_compression()) {
1125 return;
1127 thread_count = migrate_compress_threads();
1129 qemu_mutex_lock(&comp_done_lock);
1130 for (idx = 0; idx < thread_count; idx++) {
1131 while (!comp_param[idx].done) {
1132 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1135 qemu_mutex_unlock(&comp_done_lock);
1137 for (idx = 0; idx < thread_count; idx++) {
1138 qemu_mutex_lock(&comp_param[idx].mutex);
1139 if (!comp_param[idx].quit) {
1140 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1141 ram_counters.transferred += len;
1143 qemu_mutex_unlock(&comp_param[idx].mutex);
1147 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1148 ram_addr_t offset)
1150 param->block = block;
1151 param->offset = offset;
1154 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1155 ram_addr_t offset)
1157 int idx, thread_count, bytes_xmit = -1, pages = -1;
1159 thread_count = migrate_compress_threads();
1160 qemu_mutex_lock(&comp_done_lock);
1161 while (true) {
1162 for (idx = 0; idx < thread_count; idx++) {
1163 if (comp_param[idx].done) {
1164 comp_param[idx].done = false;
1165 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1166 qemu_mutex_lock(&comp_param[idx].mutex);
1167 set_compress_params(&comp_param[idx], block, offset);
1168 qemu_cond_signal(&comp_param[idx].cond);
1169 qemu_mutex_unlock(&comp_param[idx].mutex);
1170 pages = 1;
1171 ram_counters.normal++;
1172 ram_counters.transferred += bytes_xmit;
1173 break;
1176 if (pages > 0) {
1177 break;
1178 } else {
1179 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1182 qemu_mutex_unlock(&comp_done_lock);
1184 return pages;
1188 * find_dirty_block: find the next dirty page and update any state
1189 * associated with the search process.
1191 * Returns true if a page is found
1193 * @rs: current RAM state
1194 * @pss: data about the state of the current dirty page scan
1195 * @again: set to false if the search has scanned the whole of RAM
1197 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1199 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1200 if (pss->complete_round && pss->block == rs->last_seen_block &&
1201 pss->page >= rs->last_page) {
1203 * We've been once around the RAM and haven't found anything.
1204 * Give up.
1206 *again = false;
1207 return false;
1209 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1210 /* Didn't find anything in this RAM Block */
1211 pss->page = 0;
1212 pss->block = QLIST_NEXT_RCU(pss->block, next);
1213 if (!pss->block) {
1214 /* Hit the end of the list */
1215 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1216 /* Flag that we've looped */
1217 pss->complete_round = true;
1218 rs->ram_bulk_stage = false;
1219 if (migrate_use_xbzrle()) {
1220 /* If xbzrle is on, stop using the data compression at this
1221 * point. In theory, xbzrle can do better than compression.
1223 flush_compressed_data(rs);
1226 /* Didn't find anything this time, but try again on the new block */
1227 *again = true;
1228 return false;
1229 } else {
1230 /* Can go around again, but... */
1231 *again = true;
1232 /* We've found something so probably don't need to */
1233 return true;
1238 * unqueue_page: gets a page off the queue
1240 * Helper for 'get_queued_page' - gets a page off the queue
1242 * Returns the block of the page (or NULL if none available)
1244 * @rs: current RAM state
1245 * @offset: used to return the offset within the RAMBlock
1247 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1249 RAMBlock *block = NULL;
1251 qemu_mutex_lock(&rs->src_page_req_mutex);
1252 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1253 struct RAMSrcPageRequest *entry =
1254 QSIMPLEQ_FIRST(&rs->src_page_requests);
1255 block = entry->rb;
1256 *offset = entry->offset;
1258 if (entry->len > TARGET_PAGE_SIZE) {
1259 entry->len -= TARGET_PAGE_SIZE;
1260 entry->offset += TARGET_PAGE_SIZE;
1261 } else {
1262 memory_region_unref(block->mr);
1263 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1264 g_free(entry);
1267 qemu_mutex_unlock(&rs->src_page_req_mutex);
1269 return block;
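/*
 * Example of how a queued request is consumed: a postcopy request covering,
 * say, 3 * TARGET_PAGE_SIZE bytes stays at the head of src_page_requests and
 * is handed out one target page per call; each call advances entry->offset
 * and shrinks entry->len until the last page, when the entry is removed and
 * freed.
 */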
1273 * get_queued_page: unqueue a page from the postcopy requests
1275 * Skips pages that are already sent (!dirty)
1277 * Returns true if a queued page is found
1279 * @rs: current RAM state
1280 * @pss: data about the state of the current dirty page scan
1282 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1284 RAMBlock *block;
1285 ram_addr_t offset;
1286 bool dirty;
1288 do {
1289 block = unqueue_page(rs, &offset);
1291 * We're sending this page, and since it's postcopy nothing else
1292 * will dirty it, and we must make sure it doesn't get sent again
1293 * even if this queue request was received after the background
1294 * search already sent it.
1296 if (block) {
1297 unsigned long page;
1299 page = offset >> TARGET_PAGE_BITS;
1300 dirty = test_bit(page, block->bmap);
1301 if (!dirty) {
1302 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1303 page, test_bit(page, block->unsentmap));
1304 } else {
1305 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1309 } while (block && !dirty);
1311 if (block) {
1313 * As soon as we start servicing pages out of order, then we have
1314 * to kill the bulk stage, since the bulk stage assumes
1315 * in (migration_bitmap_find_and_reset_dirty) that every page is
1316 * dirty, that's no longer true.
1318 rs->ram_bulk_stage = false;
1321 * We want the background search to continue from the queued page
1322 * since the guest is likely to want other pages near to the page
1323 * it just requested.
1325 pss->block = block;
1326 pss->page = offset >> TARGET_PAGE_BITS;
1329 return !!block;
1333 * migration_page_queue_free: drop any remaining pages in the ram
1334 * request queue
1336 * It should be empty at the end anyway, but in error cases there may
1337 * be some left. In case any page is left, we drop it.
1340 static void migration_page_queue_free(RAMState *rs)
1342 struct RAMSrcPageRequest *mspr, *next_mspr;
1343 /* This queue generally should be empty - but in the case of a failed
1344 * migration might have some droppings in.
1346 rcu_read_lock();
1347 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1348 memory_region_unref(mspr->rb->mr);
1349 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1350 g_free(mspr);
1352 rcu_read_unlock();
1356 * ram_save_queue_pages: queue the page for transmission
1358 * A request from postcopy destination for example.
1360 * Returns zero on success or negative on error
1362 * @rbname: Name of the RAMBlock of the request. NULL means the
1363 * same as the last one.
1364 * @start: starting address from the start of the RAMBlock
1365 * @len: length (in bytes) to send
1367 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1369 RAMBlock *ramblock;
1370 RAMState *rs = ram_state;
1372 ram_counters.postcopy_requests++;
1373 rcu_read_lock();
1374 if (!rbname) {
1375 /* Reuse last RAMBlock */
1376 ramblock = rs->last_req_rb;
1378 if (!ramblock) {
1380 * Shouldn't happen, we can't reuse the last RAMBlock if
1381 * it's the 1st request.
1383 error_report("ram_save_queue_pages no previous block");
1384 goto err;
1386 } else {
1387 ramblock = qemu_ram_block_by_name(rbname);
1389 if (!ramblock) {
1390 /* We shouldn't be asked for a non-existent RAMBlock */
1391 error_report("ram_save_queue_pages no block '%s'", rbname);
1392 goto err;
1394 rs->last_req_rb = ramblock;
1396 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1397 if (start+len > ramblock->used_length) {
1398 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1399 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1400 __func__, start, len, ramblock->used_length);
1401 goto err;
1404 struct RAMSrcPageRequest *new_entry =
1405 g_malloc0(sizeof(struct RAMSrcPageRequest));
1406 new_entry->rb = ramblock;
1407 new_entry->offset = start;
1408 new_entry->len = len;
1410 memory_region_ref(ramblock->mr);
1411 qemu_mutex_lock(&rs->src_page_req_mutex);
1412 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1413 qemu_mutex_unlock(&rs->src_page_req_mutex);
1414 rcu_read_unlock();
1416 return 0;
1418 err:
1419 rcu_read_unlock();
1420 return -1;
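/*
 * Usage sketch (block name and offset are purely illustrative): when the
 * destination faults on a page it has not received yet, its request arrives
 * over the return path and the source ends up doing something like
 *
 *     ram_save_queue_pages("pc.ram", 0x7f0000, TARGET_PAGE_SIZE);
 *
 * a follow-up request in the same block may pass rbname == NULL to reuse
 * rs->last_req_rb.
 */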
1423 static bool save_page_use_compression(RAMState *rs)
1425 if (!migrate_use_compression()) {
1426 return false;
1430 * If xbzrle is on, stop using the data compression after first
1431 * round of migration even if compression is enabled. In theory,
1432 * xbzrle can do better than compression.
1434 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1435 return true;
1438 return false;
1442 * ram_save_target_page: save one target page
1444 * Returns the number of pages written
1446 * @rs: current RAM state
1447 * @pss: data about the page we want to send
1448 * @last_stage: if we are at the completion stage
1450 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1451 bool last_stage)
1453 RAMBlock *block = pss->block;
1454 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1455 int res;
1457 if (control_save_page(rs, block, offset, &res)) {
1458 return res;
1462 * When starting the process of a new block, the first page of
1463 * the block should be sent out before other pages in the same
1464 * block, and all the pages in the last block should have been sent
1465 * out. Keeping this order is important, because the 'cont' flag
1466 * is used to avoid resending the block name.
1468 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1469 flush_compressed_data(rs);
1472 res = save_zero_page(rs, block, offset);
1473 if (res > 0) {
1474 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1475 * page would be stale
1477 if (!save_page_use_compression(rs)) {
1478 XBZRLE_cache_lock();
1479 xbzrle_cache_zero_page(rs, block->offset + offset);
1480 XBZRLE_cache_unlock();
1482 ram_release_pages(block->idstr, offset, res);
1483 return res;
1487 * Make sure the first page is sent out before other pages.
1489 * We post it as a normal page, as compression will take much
1490 * CPU resource.
1492 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1493 res = compress_page_with_multi_thread(rs, block, offset);
1496 return ram_save_page(rs, pss, last_stage);
1500 * ram_save_host_page: save a whole host page
1502 * Starting at *offset send pages up to the end of the current host
1503 * page. It's valid for the initial offset to point into the middle of
1504 * a host page in which case the remainder of the hostpage is sent.
1505 * Only dirty target pages are sent. Note that the host page size may
1506 * be a huge page for this block.
1507 * The saving stops at the boundary of the used_length of the block
1508 * if the RAMBlock isn't a multiple of the host page size.
1510 * Returns the number of pages written or negative on error
1512 * @rs: current RAM state
1513 * @ms: current migration state
1514 * @pss: data about the page we want to send
1515 * @last_stage: if we are at the completion stage
1517 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1518 bool last_stage)
1520 int tmppages, pages = 0;
1521 size_t pagesize_bits =
1522 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1524 do {
1525 /* Check if the page is dirty and if it is, send it */
1526 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1527 pss->page++;
1528 continue;
1531 tmppages = ram_save_target_page(rs, pss, last_stage);
1532 if (tmppages < 0) {
1533 return tmppages;
1536 pages += tmppages;
1537 if (pss->block->unsentmap) {
1538 clear_bit(pss->page, pss->block->unsentmap);
1541 pss->page++;
1542 } while ((pss->page & (pagesize_bits - 1)) &&
1543 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1545 /* The offset we leave with is the last one we looked at */
1546 pss->page--;
1547 return pages;
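/*
 * Example: for a RAMBlock backed by 2 MiB hugepages with 4 KiB target pages,
 * pagesize_bits == 512, so one call walks up to 512 consecutive target pages
 * and stops at the 2 MiB boundary (or at used_length); for an ordinary block
 * pagesize_bits == 1 and exactly one target page is considered.
 */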
1551 * ram_find_and_save_block: finds a dirty page and sends it to f
1553 * Called within an RCU critical section.
1555 * Returns the number of pages written where zero means no dirty pages
1557 * @rs: current RAM state
1558 * @last_stage: if we are at the completion stage
1560 * On systems where host-page-size > target-page-size it will send all the
1561 * pages in a host page that are dirty.
1564 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1566 PageSearchStatus pss;
1567 int pages = 0;
1568 bool again, found;
1570 /* No dirty page as there is zero RAM */
1571 if (!ram_bytes_total()) {
1572 return pages;
1575 pss.block = rs->last_seen_block;
1576 pss.page = rs->last_page;
1577 pss.complete_round = false;
1579 if (!pss.block) {
1580 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1583 do {
1584 again = true;
1585 found = get_queued_page(rs, &pss);
1587 if (!found) {
1588 /* priority queue empty, so just search for something dirty */
1589 found = find_dirty_block(rs, &pss, &again);
1592 if (found) {
1593 pages = ram_save_host_page(rs, &pss, last_stage);
1595 } while (!pages && again);
1597 rs->last_seen_block = pss.block;
1598 rs->last_page = pss.page;
1600 return pages;
1603 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1605 uint64_t pages = size / TARGET_PAGE_SIZE;
1607 if (zero) {
1608 ram_counters.duplicate += pages;
1609 } else {
1610 ram_counters.normal += pages;
1611 ram_counters.transferred += size;
1612 qemu_update_position(f, size);
1616 uint64_t ram_bytes_total(void)
1618 RAMBlock *block;
1619 uint64_t total = 0;
1621 rcu_read_lock();
1622 RAMBLOCK_FOREACH(block) {
1623 total += block->used_length;
1625 rcu_read_unlock();
1626 return total;
1629 static void xbzrle_load_setup(void)
1631 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1634 static void xbzrle_load_cleanup(void)
1636 g_free(XBZRLE.decoded_buf);
1637 XBZRLE.decoded_buf = NULL;
1640 static void ram_state_cleanup(RAMState **rsp)
1642 if (*rsp) {
1643 migration_page_queue_free(*rsp);
1644 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1645 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1646 g_free(*rsp);
1647 *rsp = NULL;
1651 static void xbzrle_cleanup(void)
1653 XBZRLE_cache_lock();
1654 if (XBZRLE.cache) {
1655 cache_fini(XBZRLE.cache);
1656 g_free(XBZRLE.encoded_buf);
1657 g_free(XBZRLE.current_buf);
1658 g_free(XBZRLE.zero_target_page);
1659 XBZRLE.cache = NULL;
1660 XBZRLE.encoded_buf = NULL;
1661 XBZRLE.current_buf = NULL;
1662 XBZRLE.zero_target_page = NULL;
1664 XBZRLE_cache_unlock();
1667 static void ram_save_cleanup(void *opaque)
1669 RAMState **rsp = opaque;
1670 RAMBlock *block;
1672 /* The caller holds the iothread lock or is in a bh, so there is
1673 * no writing race against this migration_bitmap
1675 memory_global_dirty_log_stop();
1677 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1678 g_free(block->bmap);
1679 block->bmap = NULL;
1680 g_free(block->unsentmap);
1681 block->unsentmap = NULL;
1684 xbzrle_cleanup();
1685 compress_threads_save_cleanup();
1686 ram_state_cleanup(rsp);
1689 static void ram_state_reset(RAMState *rs)
1691 rs->last_seen_block = NULL;
1692 rs->last_sent_block = NULL;
1693 rs->last_page = 0;
1694 rs->last_version = ram_list.version;
1695 rs->ram_bulk_stage = true;
1698 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1701 * 'expected' is the value you expect the bitmap mostly to be full
1702 * of; it won't bother printing lines that are all this value.
1703 * If 'todump' is null the migration bitmap is dumped.
1705 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1706 unsigned long pages)
1708 int64_t cur;
1709 int64_t linelen = 128;
1710 char linebuf[129];
1712 for (cur = 0; cur < pages; cur += linelen) {
1713 int64_t curb;
1714 bool found = false;
1716 * Last line; catch the case where the line length
1717 * is longer than remaining ram
1719 if (cur + linelen > pages) {
1720 linelen = pages - cur;
1722 for (curb = 0; curb < linelen; curb++) {
1723 bool thisbit = test_bit(cur + curb, todump);
1724 linebuf[curb] = thisbit ? '1' : '.';
1725 found = found || (thisbit != expected);
1727 if (found) {
1728 linebuf[curb] = '\0';
1729 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1734 /* **** functions for postcopy ***** */
1736 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1738 struct RAMBlock *block;
1740 RAMBLOCK_FOREACH(block) {
1741 unsigned long *bitmap = block->bmap;
1742 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1743 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1745 while (run_start < range) {
1746 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1747 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1748 (run_end - run_start) << TARGET_PAGE_BITS);
1749 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1755 * postcopy_send_discard_bm_ram: discard a RAMBlock
1757 * Returns zero on success
1759 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1760 * Note: At this point the 'unsentmap' is the processed bitmap combined
1761 * with the dirtymap; so a '1' means it's either dirty or unsent.
1763 * @ms: current migration state
1764 * @pds: state for postcopy
1765 * @start: RAMBlock starting page
1766 * @length: RAMBlock size
1768 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1769 PostcopyDiscardState *pds,
1770 RAMBlock *block)
1772 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1773 unsigned long current;
1774 unsigned long *unsentmap = block->unsentmap;
1776 for (current = 0; current < end; ) {
1777 unsigned long one = find_next_bit(unsentmap, end, current);
1779 if (one <= end) {
1780 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1781 unsigned long discard_length;
1783 if (zero >= end) {
1784 discard_length = end - one;
1785 } else {
1786 discard_length = zero - one;
1788 if (discard_length) {
1789 postcopy_discard_send_range(ms, pds, one, discard_length);
1791 current = one + discard_length;
1792 } else {
1793 current = one;
1797 return 0;
1801 * postcopy_each_ram_send_discard: discard all RAMBlocks
1803 * Returns 0 for success or negative for error
1805 * Utility for the outgoing postcopy code.
1806 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1807 * passing it bitmap indexes and name.
1808 * (qemu_ram_foreach_block ends up passing unscaled lengths
1809 * which would mean postcopy code would have to deal with target page)
1811 * @ms: current migration state
1813 static int postcopy_each_ram_send_discard(MigrationState *ms)
1815 struct RAMBlock *block;
1816 int ret;
1818 RAMBLOCK_FOREACH(block) {
1819 PostcopyDiscardState *pds =
1820 postcopy_discard_send_init(ms, block->idstr);
1823 * Postcopy sends chunks of bitmap over the wire, but it
1824 * just needs indexes at this point, avoids it having
1825 * target page specific code.
1827 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1828 postcopy_discard_send_finish(ms, pds);
1829 if (ret) {
1830 return ret;
1834 return 0;
1838 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1840 * Helper for postcopy_chunk_hostpages; it's called twice to
1841 * canonicalize the two bitmaps, that are similar, but one is
1842 * inverted.
1844 * Postcopy requires that all target pages in a hostpage are dirty or
1845 * clean, not a mix. This function canonicalizes the bitmaps.
1847 * @ms: current migration state
1848 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1849 * otherwise we need to canonicalize partially dirty host pages
1850 * @block: block that contains the page we want to canonicalize
1851 * @pds: state for postcopy
1853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1854 RAMBlock *block,
1855 PostcopyDiscardState *pds)
1857 RAMState *rs = ram_state;
1858 unsigned long *bitmap = block->bmap;
1859 unsigned long *unsentmap = block->unsentmap;
1860 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1861 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1862 unsigned long run_start;
1864 if (block->page_size == TARGET_PAGE_SIZE) {
1865 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1866 return;
1869 if (unsent_pass) {
1870 /* Find a sent page */
1871 run_start = find_next_zero_bit(unsentmap, pages, 0);
1872 } else {
1873 /* Find a dirty page */
1874 run_start = find_next_bit(bitmap, pages, 0);
1877 while (run_start < pages) {
1878 bool do_fixup = false;
1879 unsigned long fixup_start_addr;
1880 unsigned long host_offset;
1883 * If the start of this run of pages is in the middle of a host
1884 * page, then we need to fixup this host page.
1886 host_offset = run_start % host_ratio;
1887 if (host_offset) {
1888 do_fixup = true;
1889 run_start -= host_offset;
1890 fixup_start_addr = run_start;
1891 /* For the next pass */
1892 run_start = run_start + host_ratio;
1893 } else {
1894 /* Find the end of this run */
1895 unsigned long run_end;
1896 if (unsent_pass) {
1897 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1898 } else {
1899 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1902 * If the end isn't at the start of a host page, then the
1903 * run doesn't finish at the end of a host page
1904 * and we need to discard.
1906 host_offset = run_end % host_ratio;
1907 if (host_offset) {
1908 do_fixup = true;
1909 fixup_start_addr = run_end - host_offset;
1911 * This host page has gone, the next loop iteration starts
1912 * from after the fixup
1914 run_start = fixup_start_addr + host_ratio;
1915 } else {
1917 * No discards on this iteration, next loop starts from
1918 * next sent/dirty page
1920 run_start = run_end + 1;
1924 if (do_fixup) {
1925 unsigned long page;
1927 /* Tell the destination to discard this page */
1928 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1929 /* For the unsent_pass we:
1930 * discard partially sent pages
1931 * For the !unsent_pass (dirty) we:
1932 * discard partially dirty pages that were sent
1933 * (any partially sent pages were already discarded
1934 * by the previous unsent_pass)
1936 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1937 host_ratio);
1940 /* Clean up the bitmap */
1941 for (page = fixup_start_addr;
1942 page < fixup_start_addr + host_ratio; page++) {
1943 /* All pages in this host page are now not sent */
1944 set_bit(page, unsentmap);
1947 * Remark them as dirty, updating the count for any pages
1948 * that weren't previously dirty.
1950 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1954 if (unsent_pass) {
1955 /* Find the next sent page for the next iteration */
1956 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1957 } else {
1958 /* Find the next dirty page for the next iteration */
1959 run_start = find_next_bit(bitmap, pages, run_start);
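/*
 * Worked example for the fixup above (illustrative numbers): with 2 MiB host
 * pages and 4 KiB target pages, host_ratio == 512. A dirty run starting at
 * target page 1000 sits in the middle of the host page spanning pages
 * 512..1023 (1000 % 512 == 488), so the run is rounded down to 512 and the
 * whole host page is treated as a unit: it is discarded on the destination
 * where needed, and all 512 of its target pages are re-marked dirty and
 * unsent on the source.
 */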
1965 * postcopy_chunk_hostpages: discard any partially sent host page
1967 * Utility for the outgoing postcopy code.
1969 * Discard any partially sent host-page size chunks, mark any partially
1970 * dirty host-page size chunks as all dirty. In this case the host-page
1971 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1973 * Returns zero on success
1975 * @ms: current migration state
1976 * @block: block we want to work with
1978 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1980 PostcopyDiscardState *pds =
1981 postcopy_discard_send_init(ms, block->idstr);
1983 /* First pass: Discard all partially sent host pages */
1984 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1986 * Second pass: Ensure that all partially dirty host pages are made
1987 * fully dirty.
1989 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1991 postcopy_discard_send_finish(ms, pds);
1992 return 0;
1996 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1998 * Returns zero on success
2000 * Transmit the set of pages to be discarded after precopy to the target;
2001 * these are pages that:
2002 * a) Have been previously transmitted but are now dirty again
2003 * b) Pages that have never been transmitted, this ensures that
2004 * any pages on the destination that have been mapped by background
2005 * tasks get discarded (transparent huge pages is the specific concern)
2006 * Hopefully this is pretty sparse
2008 * @ms: current migration state
2010 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2012 RAMState *rs = ram_state;
2013 RAMBlock *block;
2014 int ret;
2016 rcu_read_lock();
2018 /* This should be our last sync, the src is now paused */
2019 migration_bitmap_sync(rs);
2021 /* Easiest way to make sure we don't resume in the middle of a host-page */
2022 rs->last_seen_block = NULL;
2023 rs->last_sent_block = NULL;
2024 rs->last_page = 0;
2026 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2027 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2028 unsigned long *bitmap = block->bmap;
2029 unsigned long *unsentmap = block->unsentmap;
2031 if (!unsentmap) {
2032 /* We don't have a safe way to resize the sentmap, so
2033 * if the bitmap was resized it will be NULL at this
2034 * point.
2036 error_report("migration ram resized during precopy phase");
2037 rcu_read_unlock();
2038 return -EINVAL;
2040 /* Deal with TPS != HPS and huge pages */
2041 ret = postcopy_chunk_hostpages(ms, block);
2042 if (ret) {
2043 rcu_read_unlock();
2044 return ret;
2048 * Update the unsentmap to be unsentmap = unsentmap | dirty
2050 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2051 #ifdef DEBUG_POSTCOPY
2052 ram_debug_dump_bitmap(unsentmap, true, pages);
2053 #endif
2055 trace_ram_postcopy_send_discard_bitmap();
2057 ret = postcopy_each_ram_send_discard(ms);
2058 rcu_read_unlock();
2060 return ret;
2064 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2066 * Returns zero on success
2068 * @rbname: name of the RAMBlock of the request. NULL means the
2069 * same as the last one.
2070 * @start: RAMBlock starting page
2071 * @length: RAMBlock size
2073 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2075 int ret = -1;
2077 trace_ram_discard_range(rbname, start, length);
2079 rcu_read_lock();
2080 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2082 if (!rb) {
2083 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2084 goto err;
2087 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2088 length >> qemu_target_page_bits());
2089 ret = ram_block_discard_range(rb, start, length);
2091 err:
2092 rcu_read_unlock();
2094 return ret;
2098 * For every allocation, we try not to crash the VM if the
2099 * allocation fails.
2101 static int xbzrle_init(void)
2103 Error *local_err = NULL;
2105 if (!migrate_use_xbzrle()) {
2106 return 0;
2109 XBZRLE_cache_lock();
2111 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2112 if (!XBZRLE.zero_target_page) {
2113 error_report("%s: Error allocating zero page", __func__);
2114 goto err_out;
2117 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2118 TARGET_PAGE_SIZE, &local_err);
2119 if (!XBZRLE.cache) {
2120 error_report_err(local_err);
2121 goto free_zero_page;
2124 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2125 if (!XBZRLE.encoded_buf) {
2126 error_report("%s: Error allocating encoded_buf", __func__);
2127 goto free_cache;
2130 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2131 if (!XBZRLE.current_buf) {
2132 error_report("%s: Error allocating current_buf", __func__);
2133 goto free_encoded_buf;
2136 /* We are all good */
2137 XBZRLE_cache_unlock();
2138 return 0;
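/*
 * Error unwind: each label below frees the resources allocated before the
 * failure point, in reverse order of allocation, and the function returns
 * -ENOMEM with the cache lock released.
 */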
2140 free_encoded_buf:
2141 g_free(XBZRLE.encoded_buf);
2142 XBZRLE.encoded_buf = NULL;
2143 free_cache:
2144 cache_fini(XBZRLE.cache);
2145 XBZRLE.cache = NULL;
2146 free_zero_page:
2147 g_free(XBZRLE.zero_target_page);
2148 XBZRLE.zero_target_page = NULL;
2149 err_out:
2150 XBZRLE_cache_unlock();
2151 return -ENOMEM;
2154 static int ram_state_init(RAMState **rsp)
2156 *rsp = g_try_new0(RAMState, 1);
2158 if (!*rsp) {
2159 error_report("%s: Init ramstate fail", __func__);
2160 return -1;
2163 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2164 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2165 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2168 * Count the total number of pages used by ram blocks not including any
2169 * gaps due to alignment or unplugs.
2171 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2173 ram_state_reset(*rsp);
2175 return 0;
2178 static void ram_list_init_bitmaps(void)
2180 RAMBlock *block;
2181 unsigned long pages;
2183 /* Skip setting bitmap if there is no RAM */
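/*
 * Every bit starts out set: all pages are considered dirty (and, for
 * postcopy, unsent), so the first migration pass transmits everything.
 */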
2184 if (ram_bytes_total()) {
2185 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2186 pages = block->max_length >> TARGET_PAGE_BITS;
2187 block->bmap = bitmap_new(pages);
2188 bitmap_set(block->bmap, 0, pages);
2189 if (migrate_postcopy_ram()) {
2190 block->unsentmap = bitmap_new(pages);
2191 bitmap_set(block->unsentmap, 0, pages);
2197 static void ram_init_bitmaps(RAMState *rs)
2199 /* For memory_global_dirty_log_start below. */
2200 qemu_mutex_lock_iothread();
2201 qemu_mutex_lock_ramlist();
2202 rcu_read_lock();
2204 ram_list_init_bitmaps();
2205 memory_global_dirty_log_start();
2206 migration_bitmap_sync(rs);
2208 rcu_read_unlock();
2209 qemu_mutex_unlock_ramlist();
2210 qemu_mutex_unlock_iothread();
2213 static int ram_init_all(RAMState **rsp)
2215 if (ram_state_init(rsp)) {
2216 return -1;
2219 if (xbzrle_init()) {
2220 ram_state_cleanup(rsp);
2221 return -1;
2224 ram_init_bitmaps(*rsp);
2226 return 0;
2230 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2231 * long-running RCU critical section. When RCU reclaims in the code
2232 * start to become numerous it will be necessary to reduce the
2233 * granularity of these critical sections.
2237 * ram_save_setup: Setup RAM for migration
2239 * Returns zero to indicate success and negative for error
2241 * @f: QEMUFile where to send the data
2242 * @opaque: RAMState pointer
2244 static int ram_save_setup(QEMUFile *f, void *opaque)
2246 RAMState **rsp = opaque;
2247 RAMBlock *block;
2249 if (compress_threads_save_setup()) {
2250 return -1;
2253 /* migration has already set up the bitmap, reuse it. */
2254 if (!migration_in_colo_state()) {
2255 if (ram_init_all(rsp) != 0) {
2256 compress_threads_save_cleanup();
2257 return -1;
2260 (*rsp)->f = f;
2262 rcu_read_lock();
2264 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
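/*
 * For each RAMBlock the setup stage emits: a 1-byte idstr length, the
 * idstr itself, the used length as a big-endian 64-bit value and, when
 * postcopy is enabled and the block's page size differs from the host
 * page size, that page size as a further big-endian 64-bit value.
 */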
2266 RAMBLOCK_FOREACH(block) {
2267 qemu_put_byte(f, strlen(block->idstr));
2268 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2269 qemu_put_be64(f, block->used_length);
2270 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2271 qemu_put_be64(f, block->page_size);
2275 rcu_read_unlock();
2277 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2278 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2280 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2282 return 0;
2286 * ram_save_iterate: iterative stage for migration
2288 * Returns zero to indicate success and negative for error
2290 * @f: QEMUFile where to send the data
2291 * @opaque: RAMState pointer
2293 static int ram_save_iterate(QEMUFile *f, void *opaque)
2295 RAMState **temp = opaque;
2296 RAMState *rs = *temp;
2297 int ret;
2298 int i;
2299 int64_t t0;
2300 int done = 0;
2302 if (blk_mig_bulk_active()) {
2303 /* Avoid transferring ram during bulk phase of block migration as
2304 * the bulk phase will usually take a long time and transferring
2305 * ram updates during that time is pointless. */
2306 goto out;
2309 rcu_read_lock();
2310 if (ram_list.version != rs->last_version) {
2311 ram_state_reset(rs);
2314 /* Read version before ram_list.blocks */
2315 smp_rmb();
2317 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2319 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2320 i = 0;
2321 while ((ret = qemu_file_rate_limit(f)) == 0) {
2322 int pages;
2324 pages = ram_find_and_save_block(rs, false);
2325 /* no more pages to send */
2326 if (pages == 0) {
2327 done = 1;
2328 break;
2330 rs->iterations++;
2332 /* we want to check in the 1st loop, just in case it was the 1st time
2333 and we had to sync the dirty bitmap.
2334 qemu_clock_get_ns() is a bit expensive, so we only check once every
2335 few iterations
2337 if ((i & 63) == 0) {
2338 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2339 if (t1 > MAX_WAIT) {
2340 trace_ram_save_iterate_big_wait(t1, i);
2341 break;
2344 i++;
2346 flush_compressed_data(rs);
2347 rcu_read_unlock();
2350 * Must occur before EOS (or any QEMUFile operation)
2351 * because of RDMA protocol.
2353 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2355 out:
2356 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
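/* Account for the 8-byte EOS marker written just above */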
2357 ram_counters.transferred += 8;
2359 ret = qemu_file_get_error(f);
2360 if (ret < 0) {
2361 return ret;
2364 return done;
2368 * ram_save_complete: function called to send the remaining amount of ram
2370 * Returns zero to indicate success
2372 * Called with the iothread lock held
2374 * @f: QEMUFile where to send the data
2375 * @opaque: RAMState pointer
2377 static int ram_save_complete(QEMUFile *f, void *opaque)
2379 RAMState **temp = opaque;
2380 RAMState *rs = *temp;
2382 rcu_read_lock();
2384 if (!migration_in_postcopy()) {
2385 migration_bitmap_sync(rs);
2388 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2390 /* try transferring iterative blocks of memory */
2392 /* flush all remaining blocks regardless of rate limiting */
2393 while (true) {
2394 int pages;
2396 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2398 /* no more blocks to send */
2398 if (pages == 0) {
2399 break;
2403 flush_compressed_data(rs);
2404 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2406 rcu_read_unlock();
2408 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2410 return 0;
2413 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2414 uint64_t *res_precopy_only,
2415 uint64_t *res_compatible,
2416 uint64_t *res_postcopy_only)
2418 RAMState **temp = opaque;
2419 RAMState *rs = *temp;
2420 uint64_t remaining_size;
2422 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
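/*
 * The remaining work is estimated as the number of dirty pages times the
 * target page size; once it drops below max_size we resync the dirty
 * bitmap (under the iothread lock) so the final estimate is accurate.
 */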
2424 if (!migration_in_postcopy() &&
2425 remaining_size < max_size) {
2426 qemu_mutex_lock_iothread();
2427 rcu_read_lock();
2428 migration_bitmap_sync(rs);
2429 rcu_read_unlock();
2430 qemu_mutex_unlock_iothread();
2431 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2434 if (migrate_postcopy_ram()) {
2435 /* We can do postcopy, and all the data is postcopiable */
2436 *res_compatible += remaining_size;
2437 } else {
2438 *res_precopy_only += remaining_size;
2442 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2444 unsigned int xh_len;
2445 int xh_flags;
2446 uint8_t *loaded_data;
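/*
 * XBZRLE page wire format: a 1-byte encoding flag followed by a 16-bit
 * big-endian length and then the encoded data itself.
 */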
2448 /* extract RLE header */
2449 xh_flags = qemu_get_byte(f);
2450 xh_len = qemu_get_be16(f);
2452 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2453 error_report("Failed to load XBZRLE page - wrong compression!");
2454 return -1;
2457 if (xh_len > TARGET_PAGE_SIZE) {
2458 error_report("Failed to load XBZRLE page - len overflow!");
2459 return -1;
2461 loaded_data = XBZRLE.decoded_buf;
2462 /* load data and decode */
2463 /* it can change loaded_data to point to an internal buffer */
2464 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2466 /* decode RLE */
2467 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2468 TARGET_PAGE_SIZE) == -1) {
2469 error_report("Failed to load XBZRLE page - decode error!");
2470 return -1;
2473 return 0;
2477 * ram_block_from_stream: read a RAMBlock id from the migration stream
2479 * Must be called from within an RCU critical section.
2481 * Returns a pointer from within the RCU-protected ram_list.
2483 * @f: QEMUFile where to read the data from
2484 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2486 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2488 static RAMBlock *block = NULL;
2489 char id[256];
2490 uint8_t len;
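/*
 * Unless RAM_SAVE_FLAG_CONTINUE is set, the stream carries a 1-byte idstr
 * length followed by the idstr, which is looked up in the RAM block list;
 * with CONTINUE the previously resolved block is reused.
 */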
2492 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2493 if (!block) {
2494 error_report("Ack, bad migration stream!");
2495 return NULL;
2497 return block;
2500 len = qemu_get_byte(f);
2501 qemu_get_buffer(f, (uint8_t *)id, len);
2502 id[len] = 0;
2504 block = qemu_ram_block_by_name(id);
2505 if (!block) {
2506 error_report("Can't find block %s", id);
2507 return NULL;
2510 return block;
2513 static inline void *host_from_ram_block_offset(RAMBlock *block,
2514 ram_addr_t offset)
2516 if (!offset_in_ramblock(block, offset)) {
2517 return NULL;
2520 return block->host + offset;
2524 * ram_handle_compressed: handle the zero page case
2526 * If a page (or a whole RDMA chunk) has been
2527 * determined to be zero, then zap it.
2529 * @host: host address for the zero page
2530 * @ch: the byte the page is filled with; only zero is supported
2531 * @size: size of the zero page
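/*
 * Note that the memset is skipped when the page is already zero, so
 * untouched destination pages are not written to unnecessarily.
 */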
2533 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2535 if (ch != 0 || !is_zero_range(host, size)) {
2536 memset(host, ch, size);
2540 /* return the size after decompression, or a negative value on error */
2541 static int
2542 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2543 const uint8_t *source, size_t source_len)
2545 int err;
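/*
 * The z_stream is long-lived (one per decompress thread, set up with
 * inflateInit() and torn down with inflateEnd()); inflateReset() simply
 * reuses it for each new page.
 */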
2547 err = inflateReset(stream);
2548 if (err != Z_OK) {
2549 return -1;
2552 stream->avail_in = source_len;
2553 stream->next_in = (uint8_t *)source;
2554 stream->avail_out = dest_len;
2555 stream->next_out = dest;
2557 err = inflate(stream, Z_NO_FLUSH);
2558 if (err != Z_STREAM_END) {
2559 return -1;
2562 return stream->total_out;
2565 static void *do_data_decompress(void *opaque)
2567 DecompressParam *param = opaque;
2568 unsigned long pagesize;
2569 uint8_t *des;
2570 int len, ret;
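/*
 * Worker loop: sleep on param->cond until the feeder publishes work by
 * setting param->des, decompress the page into that destination, then
 * mark the slot done and signal decomp_done_cond so the feeder (or
 * wait_for_decompress_done) can make progress.
 */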
2572 qemu_mutex_lock(&param->mutex);
2573 while (!param->quit) {
2574 if (param->des) {
2575 des = param->des;
2576 len = param->len;
2577 param->des = 0;
2578 qemu_mutex_unlock(&param->mutex);
2580 pagesize = TARGET_PAGE_SIZE;
2582 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2583 param->compbuf, len);
2584 if (ret < 0) {
2585 error_report("decompress data failed");
2586 qemu_file_set_error(decomp_file, ret);
2589 qemu_mutex_lock(&decomp_done_lock);
2590 param->done = true;
2591 qemu_cond_signal(&decomp_done_cond);
2592 qemu_mutex_unlock(&decomp_done_lock);
2594 qemu_mutex_lock(&param->mutex);
2595 } else {
2596 qemu_cond_wait(&param->cond, &param->mutex);
2599 qemu_mutex_unlock(&param->mutex);
2601 return NULL;
2604 static int wait_for_decompress_done(void)
2606 int idx, thread_count;
2608 if (!migrate_use_compression()) {
2609 return 0;
2612 thread_count = migrate_decompress_threads();
2613 qemu_mutex_lock(&decomp_done_lock);
2614 for (idx = 0; idx < thread_count; idx++) {
2615 while (!decomp_param[idx].done) {
2616 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2619 qemu_mutex_unlock(&decomp_done_lock);
2620 return qemu_file_get_error(decomp_file);
2623 static void compress_threads_load_cleanup(void)
2625 int i, thread_count;
2627 if (!migrate_use_compression()) {
2628 return;
2630 thread_count = migrate_decompress_threads();
2631 for (i = 0; i < thread_count; i++) {
2633 * we use it as an indicator of whether the thread was
2634 * properly initialized or not
2636 if (!decomp_param[i].compbuf) {
2637 break;
2640 qemu_mutex_lock(&decomp_param[i].mutex);
2641 decomp_param[i].quit = true;
2642 qemu_cond_signal(&decomp_param[i].cond);
2643 qemu_mutex_unlock(&decomp_param[i].mutex);
2645 for (i = 0; i < thread_count; i++) {
2646 if (!decomp_param[i].compbuf) {
2647 break;
2650 qemu_thread_join(decompress_threads + i);
2651 qemu_mutex_destroy(&decomp_param[i].mutex);
2652 qemu_cond_destroy(&decomp_param[i].cond);
2653 inflateEnd(&decomp_param[i].stream);
2654 g_free(decomp_param[i].compbuf);
2655 decomp_param[i].compbuf = NULL;
2657 g_free(decompress_threads);
2658 g_free(decomp_param);
2659 decompress_threads = NULL;
2660 decomp_param = NULL;
2661 decomp_file = NULL;
2664 static int compress_threads_load_setup(QEMUFile *f)
2666 int i, thread_count;
2668 if (!migrate_use_compression()) {
2669 return 0;
2672 thread_count = migrate_decompress_threads();
2673 decompress_threads = g_new0(QemuThread, thread_count);
2674 decomp_param = g_new0(DecompressParam, thread_count);
2675 qemu_mutex_init(&decomp_done_lock);
2676 qemu_cond_init(&decomp_done_cond);
2677 decomp_file = f;
2678 for (i = 0; i < thread_count; i++) {
2679 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2680 goto exit;
2683 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2684 qemu_mutex_init(&decomp_param[i].mutex);
2685 qemu_cond_init(&decomp_param[i].cond);
2686 decomp_param[i].done = true;
2687 decomp_param[i].quit = false;
2688 qemu_thread_create(decompress_threads + i, "decompress",
2689 do_data_decompress, decomp_param + i,
2690 QEMU_THREAD_JOINABLE);
2692 return 0;
2693 exit:
2694 compress_threads_load_cleanup();
2695 return -1;
2698 static void decompress_data_with_multi_threads(QEMUFile *f,
2699 void *host, int len)
2701 int idx, thread_count;
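/*
 * Find an idle decompress thread (done == true), hand it the compressed
 * buffer and destination, and wake it; if every thread is busy, wait on
 * decomp_done_cond until one of them finishes.
 */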
2703 thread_count = migrate_decompress_threads();
2704 qemu_mutex_lock(&decomp_done_lock);
2705 while (true) {
2706 for (idx = 0; idx < thread_count; idx++) {
2707 if (decomp_param[idx].done) {
2708 decomp_param[idx].done = false;
2709 qemu_mutex_lock(&decomp_param[idx].mutex);
2710 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2711 decomp_param[idx].des = host;
2712 decomp_param[idx].len = len;
2713 qemu_cond_signal(&decomp_param[idx].cond);
2714 qemu_mutex_unlock(&decomp_param[idx].mutex);
2715 break;
2718 if (idx < thread_count) {
2719 break;
2720 } else {
2721 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2724 qemu_mutex_unlock(&decomp_done_lock);
2728 * ram_load_setup: Setup RAM for migration incoming side
2730 * Returns zero to indicate success and negative for error
2732 * @f: QEMUFile where to receive the data
2733 * @opaque: RAMState pointer
2735 static int ram_load_setup(QEMUFile *f, void *opaque)
2737 if (compress_threads_load_setup(f)) {
2738 return -1;
2741 xbzrle_load_setup();
2742 ramblock_recv_map_init();
2743 return 0;
2746 static int ram_load_cleanup(void *opaque)
2748 RAMBlock *rb;
2749 xbzrle_load_cleanup();
2750 compress_threads_load_cleanup();
2752 RAMBLOCK_FOREACH(rb) {
2753 g_free(rb->receivedmap);
2754 rb->receivedmap = NULL;
2756 return 0;
2760 * ram_postcopy_incoming_init: allocate postcopy data structures
2762 * Returns 0 for success and negative if there was one error
2764 * @mis: current migration incoming state
2766 * Allocate data structures etc needed by incoming migration with
2767 * postcopy-ram. postcopy-ram's similarly named
2768 * postcopy_ram_incoming_init does the work.
2770 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2772 unsigned long ram_pages = last_ram_page();
2774 return postcopy_ram_incoming_init(mis, ram_pages);
2778 * ram_load_postcopy: load a page in postcopy case
2780 * Returns 0 for success or -errno in case of error
2782 * Called in postcopy mode by ram_load().
2783 * rcu_read_lock is taken prior to this being called.
2785 * @f: QEMUFile where to send the data
2787 static int ram_load_postcopy(QEMUFile *f)
2789 int flags = 0, ret = 0;
2790 bool place_needed = false;
2791 bool matching_page_sizes = false;
2792 MigrationIncomingState *mis = migration_incoming_get_current();
2793 /* Temporary page that is later 'placed' */
2794 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2795 void *last_host = NULL;
2796 bool all_zero = false;
2798 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2799 ram_addr_t addr;
2800 void *host = NULL;
2801 void *page_buffer = NULL;
2802 void *place_source = NULL;
2803 RAMBlock *block = NULL;
2804 uint8_t ch;
2806 addr = qemu_get_be64(f);
2809 * If there is a QEMUFile error, we should stop here; "addr"
2810 * may then be invalid
2812 ret = qemu_file_get_error(f);
2813 if (ret) {
2814 break;
2817 flags = addr & ~TARGET_PAGE_MASK;
2818 addr &= TARGET_PAGE_MASK;
2820 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2821 place_needed = false;
2822 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2823 block = ram_block_from_stream(f, flags);
2825 host = host_from_ram_block_offset(block, addr);
2826 if (!host) {
2827 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2828 ret = -EINVAL;
2829 break;
2831 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2833 * Postcopy requires that we place whole host pages atomically;
2834 * these may be huge pages for RAMBlocks that are backed by
2835 * hugetlbfs.
2836 * To make it atomic, the data is read into a temporary page
2837 * that's moved into place later.
2838 * The migration protocol uses (possibly smaller) target pages;
2839 * however, the source ensures it always sends all the components
2840 * of a host page in order.
2842 page_buffer = postcopy_host_page +
2843 ((uintptr_t)host & (block->page_size - 1));
2844 /* If all target pages are zero then we can optimise the placement */
2845 if (!((uintptr_t)host & (block->page_size - 1))) {
2846 all_zero = true;
2847 } else {
2848 /* not the first target page within the host page */
2849 if (host != (last_host + TARGET_PAGE_SIZE)) {
2850 error_report("Non-sequential target page %p/%p",
2851 host, last_host);
2852 ret = -EINVAL;
2853 break;
2859 * If it's the last part of a host page then we place the host
2860 * page
2862 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2863 (block->page_size - 1)) == 0;
2864 place_source = postcopy_host_page;
2866 last_host = host;
2868 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2869 case RAM_SAVE_FLAG_ZERO:
2870 ch = qemu_get_byte(f);
2871 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2872 if (ch) {
2873 all_zero = false;
2875 break;
2877 case RAM_SAVE_FLAG_PAGE:
2878 all_zero = false;
2879 if (!place_needed || !matching_page_sizes) {
2880 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2881 } else {
2882 /* Avoid the extra qemu_file copy during postcopy; the data
2883 * will be copied into place later anyway. We can only do this
2884 * when the read is done in one go (matching page sizes)
2886 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2887 TARGET_PAGE_SIZE);
2889 break;
2890 case RAM_SAVE_FLAG_EOS:
2891 /* normal exit */
2892 break;
2893 default:
2894 error_report("Unknown combination of migration flags: %#x"
2895 " (postcopy mode)", flags);
2896 ret = -EINVAL;
2897 break;
2900 /* Check for any possible file errors */
2901 if (!ret && qemu_file_get_error(f)) {
2902 ret = qemu_file_get_error(f);
2905 if (!ret && place_needed) {
2906 /* This gets called at the last target page in the host page */
2907 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
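/*
 * "host" points at the last target page of the host page, so stepping
 * forward one target page and back one host page yields the start of
 * the host page being placed.
 */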
2909 if (all_zero) {
2910 ret = postcopy_place_page_zero(mis, place_dest,
2911 block);
2912 } else {
2913 ret = postcopy_place_page(mis, place_dest,
2914 place_source, block);
2919 return ret;
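/*
 * The two helpers below rely on the PostcopyState values being ordered,
 * so a simple range check identifies whether postcopy has been advised
 * or is actively running.
 */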
2922 static bool postcopy_is_advised(void)
2924 PostcopyState ps = postcopy_state_get();
2925 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2928 static bool postcopy_is_running(void)
2930 PostcopyState ps = postcopy_state_get();
2931 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2934 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2936 int flags = 0, ret = 0, invalid_flags = 0;
2937 static uint64_t seq_iter;
2938 int len = 0;
2940 * If the system is running in postcopy mode, page inserts to host memory must
2941 * be atomic
2943 bool postcopy_running = postcopy_is_running();
2944 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2945 bool postcopy_advised = postcopy_is_advised();
2947 seq_iter++;
2949 if (version_id != 4) {
2950 ret = -EINVAL;
2953 if (!migrate_use_compression()) {
2954 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2956 /* This RCU critical section can be very long-running.
2957 * When RCU reclaims in the code start to become numerous,
2958 * it will be necessary to reduce the granularity of this
2959 * critical section.
2961 rcu_read_lock();
2963 if (postcopy_running) {
2964 ret = ram_load_postcopy(f);
2967 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2968 ram_addr_t addr, total_ram_bytes;
2969 void *host = NULL;
2970 uint8_t ch;
2972 addr = qemu_get_be64(f);
2973 flags = addr & ~TARGET_PAGE_MASK;
2974 addr &= TARGET_PAGE_MASK;
2976 if (flags & invalid_flags) {
2977 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2978 error_report("Received an unexpected compressed page");
2981 ret = -EINVAL;
2982 break;
2985 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2986 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2987 RAMBlock *block = ram_block_from_stream(f, flags);
2989 host = host_from_ram_block_offset(block, addr);
2990 if (!host) {
2991 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2992 ret = -EINVAL;
2993 break;
2995 ramblock_recv_bitmap_set(block, host);
2996 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2999 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3000 case RAM_SAVE_FLAG_MEM_SIZE:
3001 /* Synchronize RAM block list */
3002 total_ram_bytes = addr;
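/*
 * Each block record mirrors the save side: a 1-byte idstr length, the
 * idstr, the block's used length (big-endian 64-bit) and, when postcopy
 * was advised and the block uses non-host-sized pages, the source's
 * page size.
 */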
3003 while (!ret && total_ram_bytes) {
3004 RAMBlock *block;
3005 char id[256];
3006 ram_addr_t length;
3008 len = qemu_get_byte(f);
3009 qemu_get_buffer(f, (uint8_t *)id, len);
3010 id[len] = 0;
3011 length = qemu_get_be64(f);
3013 block = qemu_ram_block_by_name(id);
3014 if (block) {
3015 if (length != block->used_length) {
3016 Error *local_err = NULL;
3018 ret = qemu_ram_resize(block, length,
3019 &local_err);
3020 if (local_err) {
3021 error_report_err(local_err);
3024 /* For postcopy we need to check that hugepage sizes match */
3025 if (postcopy_advised &&
3026 block->page_size != qemu_host_page_size) {
3027 uint64_t remote_page_size = qemu_get_be64(f);
3028 if (remote_page_size != block->page_size) {
3029 error_report("Mismatched RAM page size %s "
3030 "(local) %zd != %" PRId64,
3031 id, block->page_size,
3032 remote_page_size);
3033 ret = -EINVAL;
3036 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3037 block->idstr);
3038 } else {
3039 error_report("Unknown ramblock \"%s\", cannot "
3040 "accept migration", id);
3041 ret = -EINVAL;
3044 total_ram_bytes -= length;
3046 break;
3048 case RAM_SAVE_FLAG_ZERO:
3049 ch = qemu_get_byte(f);
3050 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3051 break;
3053 case RAM_SAVE_FLAG_PAGE:
3054 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3055 break;
3057 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3058 len = qemu_get_be32(f);
3059 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3060 error_report("Invalid compressed data length: %d", len);
3061 ret = -EINVAL;
3062 break;
3064 decompress_data_with_multi_threads(f, host, len);
3065 break;
3067 case RAM_SAVE_FLAG_XBZRLE:
3068 if (load_xbzrle(f, addr, host) < 0) {
3069 error_report("Failed to decompress XBZRLE page at "
3070 RAM_ADDR_FMT, addr);
3071 ret = -EINVAL;
3072 break;
3074 break;
3075 case RAM_SAVE_FLAG_EOS:
3076 /* normal exit */
3077 break;
3078 default:
3079 if (flags & RAM_SAVE_FLAG_HOOK) {
3080 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3081 } else {
3082 error_report("Unknown combination of migration flags: %#x",
3083 flags);
3084 ret = -EINVAL;
3087 if (!ret) {
3088 ret = qemu_file_get_error(f);
3092 ret |= wait_for_decompress_done();
3093 rcu_read_unlock();
3094 trace_ram_load_complete(ret, seq_iter);
3095 return ret;
3098 static bool ram_has_postcopy(void *opaque)
3100 return migrate_postcopy_ram();
3103 static SaveVMHandlers savevm_ram_handlers = {
3104 .save_setup = ram_save_setup,
3105 .save_live_iterate = ram_save_iterate,
3106 .save_live_complete_postcopy = ram_save_complete,
3107 .save_live_complete_precopy = ram_save_complete,
3108 .has_postcopy = ram_has_postcopy,
3109 .save_live_pending = ram_save_pending,
3110 .load_state = ram_load,
3111 .save_cleanup = ram_save_cleanup,
3112 .load_setup = ram_load_setup,
3113 .load_cleanup = ram_load_cleanup,
3116 void ram_mig_init(void)
3118 qemu_mutex_init(&XBZRLE.lock);
3119 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);