tests/boot-serial: Check the 40p machine, too
[qemu.git] / migration / ram.c
blob590fceb7e93f9c575d712d481355ff220c447ca2
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "migration/block.h"
55 /***********************************************************/
56 /* ram save/restore */
58 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
59 * worked for pages that were filled with the same char. We switched
60 * it to only search for the zero value. And to avoid confusion with
61 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
64 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
65 #define RAM_SAVE_FLAG_ZERO 0x02
66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
67 #define RAM_SAVE_FLAG_PAGE 0x08
68 #define RAM_SAVE_FLAG_EOS 0x10
69 #define RAM_SAVE_FLAG_CONTINUE 0x20
70 #define RAM_SAVE_FLAG_XBZRLE 0x40
71 /* 0x80 is reserved in migration.h start with 0x100 next */
72 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
/* Report whether the @size bytes starting at @p are all zero. */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool all_zero = buffer_is_zero(p, size);

    return all_zero;
}
79 XBZRLECacheStats xbzrle_counters;
81 /* struct contains XBZRLE cache and a static page
82 used by the compression */
83 static struct {
84 /* buffer used for XBZRLE encoding */
85 uint8_t *encoded_buf;
86 /* buffer for storing page content */
87 uint8_t *current_buf;
88 /* Cache for XBZRLE, Protected by lock. */
89 PageCache *cache;
90 QemuMutex lock;
91 /* it will store a page full of zeros */
92 uint8_t *zero_target_page;
93 /* buffer used for XBZRLE decoding */
94 uint8_t *decoded_buf;
95 } XBZRLE;
97 static void XBZRLE_cache_lock(void)
99 if (migrate_use_xbzrle())
100 qemu_mutex_lock(&XBZRLE.lock);
103 static void XBZRLE_cache_unlock(void)
105 if (migrate_use_xbzrle())
106 qemu_mutex_unlock(&XBZRLE.lock);
110 * xbzrle_cache_resize: resize the xbzrle cache
112 * This function is called from qmp_migrate_set_cache_size in main
113 * thread, possibly while a migration is in progress. A running
114 * migration may be using the cache and might finish during this call,
115 * hence changes to the cache are protected by XBZRLE.lock().
117 * Returns 0 for success or -1 for error
119 * @new_size: new cache size
120 * @errp: set *errp if the check failed, with reason
122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
124 PageCache *new_cache;
125 int64_t ret = 0;
127 /* Check for truncation */
128 if (new_size != (size_t)new_size) {
129 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
130 "exceeding address space");
131 return -1;
134 if (new_size == migrate_xbzrle_cache_size()) {
135 /* nothing to do */
136 return 0;
139 XBZRLE_cache_lock();
141 if (XBZRLE.cache != NULL) {
142 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
143 if (!new_cache) {
144 ret = -1;
145 goto out;
148 cache_fini(XBZRLE.cache);
149 XBZRLE.cache = new_cache;
151 out:
152 XBZRLE_cache_unlock();
153 return ret;
156 static void ramblock_recv_map_init(void)
158 RAMBlock *rb;
160 RAMBLOCK_FOREACH(rb) {
161 assert(!rb->receivedmap);
162 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
168 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
169 rb->receivedmap);
172 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
174 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
177 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
178 size_t nr)
180 bitmap_set_atomic(rb->receivedmap,
181 ramblock_recv_bitmap_offset(host_addr, rb),
182 nr);
186 * An outstanding page request, on the source, having been received
187 * and queued
189 struct RAMSrcPageRequest {
190 RAMBlock *rb;
191 hwaddr offset;
192 hwaddr len;
194 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
197 /* State of RAM for migration */
198 struct RAMState {
199 /* QEMUFile used for this migration */
200 QEMUFile *f;
201 /* Last block that we have visited searching for dirty pages */
202 RAMBlock *last_seen_block;
203 /* Last block from where we have sent data */
204 RAMBlock *last_sent_block;
205 /* Last dirty target page we have sent */
206 ram_addr_t last_page;
207 /* last ram version we have seen */
208 uint32_t last_version;
209 /* We are in the first round */
210 bool ram_bulk_stage;
211 /* How many times we have dirty too many pages */
212 int dirty_rate_high_cnt;
213 /* these variables are used for bitmap sync */
214 /* last time we did a full bitmap_sync */
215 int64_t time_last_bitmap_sync;
216 /* bytes transferred at start_time */
217 uint64_t bytes_xfer_prev;
218 /* number of dirty pages since start_time */
219 uint64_t num_dirty_pages_period;
220 /* xbzrle misses since the beginning of the period */
221 uint64_t xbzrle_cache_miss_prev;
222 /* number of iterations at the beginning of period */
223 uint64_t iterations_prev;
224 /* Iterations since start */
225 uint64_t iterations;
226 /* number of dirty bits in the bitmap */
227 uint64_t migration_dirty_pages;
228 /* protects modification of the bitmap */
229 QemuMutex bitmap_mutex;
230 /* The RAMBlock used in the last src_page_requests */
231 RAMBlock *last_req_rb;
232 /* Queue of outstanding page requests from the destination */
233 QemuMutex src_page_req_mutex;
234 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
236 typedef struct RAMState RAMState;
238 static RAMState *ram_state;
240 uint64_t ram_bytes_remaining(void)
242 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
246 MigrationStats ram_counters;
248 /* used by the search for pages to send */
249 struct PageSearchStatus {
250 /* Current block being searched */
251 RAMBlock *block;
252 /* Current page to search from */
253 unsigned long page;
254 /* Set once we wrap around */
255 bool complete_round;
257 typedef struct PageSearchStatus PageSearchStatus;
259 struct CompressParam {
260 bool done;
261 bool quit;
262 QEMUFile *file;
263 QemuMutex mutex;
264 QemuCond cond;
265 RAMBlock *block;
266 ram_addr_t offset;
268 typedef struct CompressParam CompressParam;
270 struct DecompressParam {
271 bool done;
272 bool quit;
273 QemuMutex mutex;
274 QemuCond cond;
275 void *des;
276 uint8_t *compbuf;
277 int len;
279 typedef struct DecompressParam DecompressParam;
281 static CompressParam *comp_param;
282 static QemuThread *compress_threads;
283 /* comp_done_cond is used to wake up the migration thread when
284 * one of the compression threads has finished the compression.
285 * comp_done_lock is used to co-work with comp_done_cond.
287 static QemuMutex comp_done_lock;
288 static QemuCond comp_done_cond;
289 /* The empty QEMUFileOps will be used by file in CompressParam */
290 static const QEMUFileOps empty_ops = { };
292 static DecompressParam *decomp_param;
293 static QemuThread *decompress_threads;
294 static QemuMutex decomp_done_lock;
295 static QemuCond decomp_done_cond;
297 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
298 ram_addr_t offset);
300 static void *do_data_compress(void *opaque)
302 CompressParam *param = opaque;
303 RAMBlock *block;
304 ram_addr_t offset;
306 qemu_mutex_lock(&param->mutex);
307 while (!param->quit) {
308 if (param->block) {
309 block = param->block;
310 offset = param->offset;
311 param->block = NULL;
312 qemu_mutex_unlock(&param->mutex);
314 do_compress_ram_page(param->file, block, offset);
316 qemu_mutex_lock(&comp_done_lock);
317 param->done = true;
318 qemu_cond_signal(&comp_done_cond);
319 qemu_mutex_unlock(&comp_done_lock);
321 qemu_mutex_lock(&param->mutex);
322 } else {
323 qemu_cond_wait(&param->cond, &param->mutex);
326 qemu_mutex_unlock(&param->mutex);
328 return NULL;
331 static inline void terminate_compression_threads(void)
333 int idx, thread_count;
335 thread_count = migrate_compress_threads();
337 for (idx = 0; idx < thread_count; idx++) {
338 qemu_mutex_lock(&comp_param[idx].mutex);
339 comp_param[idx].quit = true;
340 qemu_cond_signal(&comp_param[idx].cond);
341 qemu_mutex_unlock(&comp_param[idx].mutex);
345 static void compress_threads_save_cleanup(void)
347 int i, thread_count;
349 if (!migrate_use_compression()) {
350 return;
352 terminate_compression_threads();
353 thread_count = migrate_compress_threads();
354 for (i = 0; i < thread_count; i++) {
355 qemu_thread_join(compress_threads + i);
356 qemu_fclose(comp_param[i].file);
357 qemu_mutex_destroy(&comp_param[i].mutex);
358 qemu_cond_destroy(&comp_param[i].cond);
360 qemu_mutex_destroy(&comp_done_lock);
361 qemu_cond_destroy(&comp_done_cond);
362 g_free(compress_threads);
363 g_free(comp_param);
364 compress_threads = NULL;
365 comp_param = NULL;
368 static void compress_threads_save_setup(void)
370 int i, thread_count;
372 if (!migrate_use_compression()) {
373 return;
375 thread_count = migrate_compress_threads();
376 compress_threads = g_new0(QemuThread, thread_count);
377 comp_param = g_new0(CompressParam, thread_count);
378 qemu_cond_init(&comp_done_cond);
379 qemu_mutex_init(&comp_done_lock);
380 for (i = 0; i < thread_count; i++) {
381 /* comp_param[i].file is just used as a dummy buffer to save data,
382 * set its ops to empty.
384 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
385 comp_param[i].done = true;
386 comp_param[i].quit = false;
387 qemu_mutex_init(&comp_param[i].mutex);
388 qemu_cond_init(&comp_param[i].cond);
389 qemu_thread_create(compress_threads + i, "compress",
390 do_data_compress, comp_param + i,
391 QEMU_THREAD_JOINABLE);
395 /* Multiple fd's */
397 struct MultiFDSendParams {
398 uint8_t id;
399 char *name;
400 QemuThread thread;
401 QemuSemaphore sem;
402 QemuMutex mutex;
403 bool quit;
405 typedef struct MultiFDSendParams MultiFDSendParams;
407 struct {
408 MultiFDSendParams *params;
409 /* number of created threads */
410 int count;
411 } *multifd_send_state;
413 static void terminate_multifd_send_threads(Error *errp)
415 int i;
417 for (i = 0; i < multifd_send_state->count; i++) {
418 MultiFDSendParams *p = &multifd_send_state->params[i];
420 qemu_mutex_lock(&p->mutex);
421 p->quit = true;
422 qemu_sem_post(&p->sem);
423 qemu_mutex_unlock(&p->mutex);
427 int multifd_save_cleanup(Error **errp)
429 int i;
430 int ret = 0;
432 if (!migrate_use_multifd()) {
433 return 0;
435 terminate_multifd_send_threads(NULL);
436 for (i = 0; i < multifd_send_state->count; i++) {
437 MultiFDSendParams *p = &multifd_send_state->params[i];
439 qemu_thread_join(&p->thread);
440 qemu_mutex_destroy(&p->mutex);
441 qemu_sem_destroy(&p->sem);
442 g_free(p->name);
443 p->name = NULL;
445 g_free(multifd_send_state->params);
446 multifd_send_state->params = NULL;
447 g_free(multifd_send_state);
448 multifd_send_state = NULL;
449 return ret;
452 static void *multifd_send_thread(void *opaque)
454 MultiFDSendParams *p = opaque;
456 while (true) {
457 qemu_mutex_lock(&p->mutex);
458 if (p->quit) {
459 qemu_mutex_unlock(&p->mutex);
460 break;
462 qemu_mutex_unlock(&p->mutex);
463 qemu_sem_wait(&p->sem);
466 return NULL;
469 int multifd_save_setup(void)
471 int thread_count;
472 uint8_t i;
474 if (!migrate_use_multifd()) {
475 return 0;
477 thread_count = migrate_multifd_channels();
478 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
479 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
480 multifd_send_state->count = 0;
481 for (i = 0; i < thread_count; i++) {
482 MultiFDSendParams *p = &multifd_send_state->params[i];
484 qemu_mutex_init(&p->mutex);
485 qemu_sem_init(&p->sem, 0);
486 p->quit = false;
487 p->id = i;
488 p->name = g_strdup_printf("multifdsend_%d", i);
489 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
490 QEMU_THREAD_JOINABLE);
492 multifd_send_state->count++;
494 return 0;
497 struct MultiFDRecvParams {
498 uint8_t id;
499 char *name;
500 QemuThread thread;
501 QemuSemaphore sem;
502 QemuMutex mutex;
503 bool quit;
505 typedef struct MultiFDRecvParams MultiFDRecvParams;
507 struct {
508 MultiFDRecvParams *params;
509 /* number of created threads */
510 int count;
511 } *multifd_recv_state;
513 static void terminate_multifd_recv_threads(Error *errp)
515 int i;
517 for (i = 0; i < multifd_recv_state->count; i++) {
518 MultiFDRecvParams *p = &multifd_recv_state->params[i];
520 qemu_mutex_lock(&p->mutex);
521 p->quit = true;
522 qemu_sem_post(&p->sem);
523 qemu_mutex_unlock(&p->mutex);
527 int multifd_load_cleanup(Error **errp)
529 int i;
530 int ret = 0;
532 if (!migrate_use_multifd()) {
533 return 0;
535 terminate_multifd_recv_threads(NULL);
536 for (i = 0; i < multifd_recv_state->count; i++) {
537 MultiFDRecvParams *p = &multifd_recv_state->params[i];
539 qemu_thread_join(&p->thread);
540 qemu_mutex_destroy(&p->mutex);
541 qemu_sem_destroy(&p->sem);
542 g_free(p->name);
543 p->name = NULL;
545 g_free(multifd_recv_state->params);
546 multifd_recv_state->params = NULL;
547 g_free(multifd_recv_state);
548 multifd_recv_state = NULL;
550 return ret;
553 static void *multifd_recv_thread(void *opaque)
555 MultiFDRecvParams *p = opaque;
557 while (true) {
558 qemu_mutex_lock(&p->mutex);
559 if (p->quit) {
560 qemu_mutex_unlock(&p->mutex);
561 break;
563 qemu_mutex_unlock(&p->mutex);
564 qemu_sem_wait(&p->sem);
567 return NULL;
570 int multifd_load_setup(void)
572 int thread_count;
573 uint8_t i;
575 if (!migrate_use_multifd()) {
576 return 0;
578 thread_count = migrate_multifd_channels();
579 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
580 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
581 multifd_recv_state->count = 0;
582 for (i = 0; i < thread_count; i++) {
583 MultiFDRecvParams *p = &multifd_recv_state->params[i];
585 qemu_mutex_init(&p->mutex);
586 qemu_sem_init(&p->sem, 0);
587 p->quit = false;
588 p->id = i;
589 p->name = g_strdup_printf("multifdrecv_%d", i);
590 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
591 QEMU_THREAD_JOINABLE);
592 multifd_recv_state->count++;
594 return 0;
598 * save_page_header: write page header to wire
600 * If this is the 1st block, it also writes the block identification
602 * Returns the number of bytes written
604 * @f: QEMUFile where to send the data
605 * @block: block that contains the page we want to send
606 * @offset: offset inside the block for the page
607 * in the lower bits, it contains flags
609 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
610 ram_addr_t offset)
612 size_t size, len;
614 if (block == rs->last_sent_block) {
615 offset |= RAM_SAVE_FLAG_CONTINUE;
617 qemu_put_be64(f, offset);
618 size = 8;
620 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
621 len = strlen(block->idstr);
622 qemu_put_byte(f, len);
623 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
624 size += 1 + len;
625 rs->last_sent_block = block;
627 return size;
631 * mig_throttle_guest_down: throotle down the guest
633 * Reduce amount of guest cpu execution to hopefully slow down memory
634 * writes. If guest dirty memory rate is reduced below the rate at
635 * which we can transfer pages to the destination then we should be
636 * able to complete migration. Some workloads dirty memory way too
637 * fast and will not effectively converge, even with auto-converge.
639 static void mig_throttle_guest_down(void)
641 MigrationState *s = migrate_get_current();
642 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
643 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
645 /* We have not started throttling yet. Let's start it. */
646 if (!cpu_throttle_active()) {
647 cpu_throttle_set(pct_initial);
648 } else {
649 /* Throttling already on, just increase the rate */
650 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
655 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
657 * @rs: current RAM state
658 * @current_addr: address for the zero page
660 * Update the xbzrle cache to reflect a page that's been sent as all 0.
661 * The important thing is that a stale (not-yet-0'd) page be replaced
662 * by the new data.
663 * As a bonus, if the page wasn't in the cache it gets added so that
664 * when a small write is made into the 0'd page it gets XBZRLE sent.
666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
668 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
669 return;
672 /* We don't care if this fails to allocate a new cache page
673 * as long as it updated an old one */
674 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
675 ram_counters.dirty_sync_count);
678 #define ENCODING_FLAG_XBZRLE 0x1
681 * save_xbzrle_page: compress and send current page
683 * Returns: 1 means that we wrote the page
684 * 0 means that page is identical to the one already sent
685 * -1 means that xbzrle would be longer than normal
687 * @rs: current RAM state
688 * @current_data: pointer to the address of the page contents
689 * @current_addr: addr of the page
690 * @block: block that contains the page we want to send
691 * @offset: offset inside the block for the page
692 * @last_stage: if we are at the completion stage
694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
695 ram_addr_t current_addr, RAMBlock *block,
696 ram_addr_t offset, bool last_stage)
698 int encoded_len = 0, bytes_xbzrle;
699 uint8_t *prev_cached_page;
701 if (!cache_is_cached(XBZRLE.cache, current_addr,
702 ram_counters.dirty_sync_count)) {
703 xbzrle_counters.cache_miss++;
704 if (!last_stage) {
705 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
706 ram_counters.dirty_sync_count) == -1) {
707 return -1;
708 } else {
709 /* update *current_data when the page has been
710 inserted into cache */
711 *current_data = get_cached_data(XBZRLE.cache, current_addr);
714 return -1;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
726 if (encoded_len == 0) {
727 trace_save_xbzrle_page_skipping();
728 return 0;
729 } else if (encoded_len == -1) {
730 trace_save_xbzrle_page_overflow();
731 xbzrle_counters.overflow++;
732 /* update data in the cache */
733 if (!last_stage) {
734 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
735 *current_data = prev_cached_page;
737 return -1;
740 /* we need to update the data in the cache, in order to get the same data */
741 if (!last_stage) {
742 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
745 /* Send XBZRLE based compressed page */
746 bytes_xbzrle = save_page_header(rs, rs->f, block,
747 offset | RAM_SAVE_FLAG_XBZRLE);
748 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
749 qemu_put_be16(rs->f, encoded_len);
750 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
751 bytes_xbzrle += encoded_len + 1 + 2;
752 xbzrle_counters.pages++;
753 xbzrle_counters.bytes += bytes_xbzrle;
754 ram_counters.transferred += bytes_xbzrle;
756 return 1;
760 * migration_bitmap_find_dirty: find the next dirty page from start
762 * Called with rcu_read_lock() to protect migration_bitmap
764 * Returns the byte offset within memory region of the start of a dirty page
766 * @rs: current RAM state
767 * @rb: RAMBlock where to search for dirty pages
768 * @start: page where we start the search
770 static inline
771 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
772 unsigned long start)
774 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
775 unsigned long *bitmap = rb->bmap;
776 unsigned long next;
778 if (rs->ram_bulk_stage && start > 0) {
779 next = start + 1;
780 } else {
781 next = find_next_bit(bitmap, size, start);
784 return next;
787 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
788 RAMBlock *rb,
789 unsigned long page)
791 bool ret;
793 ret = test_and_clear_bit(page, rb->bmap);
795 if (ret) {
796 rs->migration_dirty_pages--;
798 return ret;
801 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
802 ram_addr_t start, ram_addr_t length)
804 rs->migration_dirty_pages +=
805 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
806 &rs->num_dirty_pages_period);
810 * ram_pagesize_summary: calculate all the pagesizes of a VM
812 * Returns a summary bitmap of the page sizes of all RAMBlocks
814 * For VMs with just normal pages this is equivalent to the host page
815 * size. If it's got some huge pages then it's the OR of all the
816 * different page sizes.
818 uint64_t ram_pagesize_summary(void)
820 RAMBlock *block;
821 uint64_t summary = 0;
823 RAMBLOCK_FOREACH(block) {
824 summary |= block->page_size;
827 return summary;
830 static void migration_bitmap_sync(RAMState *rs)
832 RAMBlock *block;
833 int64_t end_time;
834 uint64_t bytes_xfer_now;
836 ram_counters.dirty_sync_count++;
838 if (!rs->time_last_bitmap_sync) {
839 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
842 trace_migration_bitmap_sync_start();
843 memory_global_dirty_log_sync();
845 qemu_mutex_lock(&rs->bitmap_mutex);
846 rcu_read_lock();
847 RAMBLOCK_FOREACH(block) {
848 migration_bitmap_sync_range(rs, block, 0, block->used_length);
850 rcu_read_unlock();
851 qemu_mutex_unlock(&rs->bitmap_mutex);
853 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
855 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
857 /* more than 1 second = 1000 millisecons */
858 if (end_time > rs->time_last_bitmap_sync + 1000) {
859 /* calculate period counters */
860 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
861 / (end_time - rs->time_last_bitmap_sync);
862 bytes_xfer_now = ram_counters.transferred;
864 /* During block migration the auto-converge logic incorrectly detects
865 * that ram migration makes no progress. Avoid this by disabling the
866 * throttling logic during the bulk phase of block migration. */
867 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
868 /* The following detection logic can be refined later. For now:
869 Check to see if the dirtied bytes is 50% more than the approx.
870 amount of bytes that just got transferred since the last time we
871 were in this routine. If that happens twice, start or increase
872 throttling */
874 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
875 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
876 (++rs->dirty_rate_high_cnt >= 2)) {
877 trace_migration_throttle();
878 rs->dirty_rate_high_cnt = 0;
879 mig_throttle_guest_down();
883 if (migrate_use_xbzrle()) {
884 if (rs->iterations_prev != rs->iterations) {
885 xbzrle_counters.cache_miss_rate =
886 (double)(xbzrle_counters.cache_miss -
887 rs->xbzrle_cache_miss_prev) /
888 (rs->iterations - rs->iterations_prev);
890 rs->iterations_prev = rs->iterations;
891 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
894 /* reset period counters */
895 rs->time_last_bitmap_sync = end_time;
896 rs->num_dirty_pages_period = 0;
897 rs->bytes_xfer_prev = bytes_xfer_now;
899 if (migrate_use_events()) {
900 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
905 * save_zero_page: send the zero page to the stream
907 * Returns the number of pages written.
909 * @rs: current RAM state
910 * @block: block that contains the page we want to send
911 * @offset: offset inside the block for the page
913 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
915 uint8_t *p = block->host + offset;
916 int pages = -1;
918 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
919 ram_counters.duplicate++;
920 ram_counters.transferred +=
921 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
922 qemu_put_byte(rs->f, 0);
923 ram_counters.transferred += 1;
924 pages = 1;
927 return pages;
930 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
932 if (!migrate_release_ram() || !migration_in_postcopy()) {
933 return;
936 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
940 * ram_save_page: send the given page to the stream
942 * Returns the number of pages written.
943 * < 0 - error
944 * >=0 - Number of pages written - this might legally be 0
945 * if xbzrle noticed the page was the same.
947 * @rs: current RAM state
948 * @block: block that contains the page we want to send
949 * @offset: offset inside the block for the page
950 * @last_stage: if we are at the completion stage
952 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
954 int pages = -1;
955 uint64_t bytes_xmit;
956 ram_addr_t current_addr;
957 uint8_t *p;
958 int ret;
959 bool send_async = true;
960 RAMBlock *block = pss->block;
961 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
963 p = block->host + offset;
964 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
966 /* In doubt sent page as normal */
967 bytes_xmit = 0;
968 ret = ram_control_save_page(rs->f, block->offset,
969 offset, TARGET_PAGE_SIZE, &bytes_xmit);
970 if (bytes_xmit) {
971 ram_counters.transferred += bytes_xmit;
972 pages = 1;
975 XBZRLE_cache_lock();
977 current_addr = block->offset + offset;
979 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
980 if (ret != RAM_SAVE_CONTROL_DELAYED) {
981 if (bytes_xmit > 0) {
982 ram_counters.normal++;
983 } else if (bytes_xmit == 0) {
984 ram_counters.duplicate++;
987 } else {
988 pages = save_zero_page(rs, block, offset);
989 if (pages > 0) {
990 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
991 * page would be stale
993 xbzrle_cache_zero_page(rs, current_addr);
994 ram_release_pages(block->idstr, offset, pages);
995 } else if (!rs->ram_bulk_stage &&
996 !migration_in_postcopy() && migrate_use_xbzrle()) {
997 pages = save_xbzrle_page(rs, &p, current_addr, block,
998 offset, last_stage);
999 if (!last_stage) {
1000 /* Can't send this cached data async, since the cache page
1001 * might get updated before it gets to the wire
1003 send_async = false;
1008 /* XBZRLE overflow or normal page */
1009 if (pages == -1) {
1010 ram_counters.transferred +=
1011 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1012 if (send_async) {
1013 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1014 migrate_release_ram() &
1015 migration_in_postcopy());
1016 } else {
1017 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1019 ram_counters.transferred += TARGET_PAGE_SIZE;
1020 pages = 1;
1021 ram_counters.normal++;
1024 XBZRLE_cache_unlock();
1026 return pages;
1029 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1030 ram_addr_t offset)
1032 RAMState *rs = ram_state;
1033 int bytes_sent, blen;
1034 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1036 bytes_sent = save_page_header(rs, f, block, offset |
1037 RAM_SAVE_FLAG_COMPRESS_PAGE);
1038 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1039 migrate_compress_level());
1040 if (blen < 0) {
1041 bytes_sent = 0;
1042 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1043 error_report("compressed data failed!");
1044 } else {
1045 bytes_sent += blen;
1046 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1049 return bytes_sent;
1052 static void flush_compressed_data(RAMState *rs)
1054 int idx, len, thread_count;
1056 if (!migrate_use_compression()) {
1057 return;
1059 thread_count = migrate_compress_threads();
1061 qemu_mutex_lock(&comp_done_lock);
1062 for (idx = 0; idx < thread_count; idx++) {
1063 while (!comp_param[idx].done) {
1064 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1067 qemu_mutex_unlock(&comp_done_lock);
1069 for (idx = 0; idx < thread_count; idx++) {
1070 qemu_mutex_lock(&comp_param[idx].mutex);
1071 if (!comp_param[idx].quit) {
1072 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1073 ram_counters.transferred += len;
1075 qemu_mutex_unlock(&comp_param[idx].mutex);
1079 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1080 ram_addr_t offset)
1082 param->block = block;
1083 param->offset = offset;
1086 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1087 ram_addr_t offset)
1089 int idx, thread_count, bytes_xmit = -1, pages = -1;
1091 thread_count = migrate_compress_threads();
1092 qemu_mutex_lock(&comp_done_lock);
1093 while (true) {
1094 for (idx = 0; idx < thread_count; idx++) {
1095 if (comp_param[idx].done) {
1096 comp_param[idx].done = false;
1097 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1098 qemu_mutex_lock(&comp_param[idx].mutex);
1099 set_compress_params(&comp_param[idx], block, offset);
1100 qemu_cond_signal(&comp_param[idx].cond);
1101 qemu_mutex_unlock(&comp_param[idx].mutex);
1102 pages = 1;
1103 ram_counters.normal++;
1104 ram_counters.transferred += bytes_xmit;
1105 break;
1108 if (pages > 0) {
1109 break;
1110 } else {
1111 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1114 qemu_mutex_unlock(&comp_done_lock);
1116 return pages;
1120 * ram_save_compressed_page: compress the given page and send it to the stream
1122 * Returns the number of pages written.
1124 * @rs: current RAM state
1125 * @block: block that contains the page we want to send
1126 * @offset: offset inside the block for the page
1127 * @last_stage: if we are at the completion stage
1129 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1130 bool last_stage)
1132 int pages = -1;
1133 uint64_t bytes_xmit = 0;
1134 uint8_t *p;
1135 int ret, blen;
1136 RAMBlock *block = pss->block;
1137 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1139 p = block->host + offset;
1141 ret = ram_control_save_page(rs->f, block->offset,
1142 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1143 if (bytes_xmit) {
1144 ram_counters.transferred += bytes_xmit;
1145 pages = 1;
1147 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1148 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1149 if (bytes_xmit > 0) {
1150 ram_counters.normal++;
1151 } else if (bytes_xmit == 0) {
1152 ram_counters.duplicate++;
1155 } else {
1156 /* When starting the process of a new block, the first page of
1157 * the block should be sent out before other pages in the same
1158 * block, and all the pages in last block should have been sent
1159 * out, keeping this order is important, because the 'cont' flag
1160 * is used to avoid resending the block name.
1162 if (block != rs->last_sent_block) {
1163 flush_compressed_data(rs);
1164 pages = save_zero_page(rs, block, offset);
1165 if (pages == -1) {
1166 /* Make sure the first page is sent out before other pages */
1167 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1168 RAM_SAVE_FLAG_COMPRESS_PAGE);
1169 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1170 migrate_compress_level());
1171 if (blen > 0) {
1172 ram_counters.transferred += bytes_xmit + blen;
1173 ram_counters.normal++;
1174 pages = 1;
1175 } else {
1176 qemu_file_set_error(rs->f, blen);
1177 error_report("compressed data failed!");
1180 if (pages > 0) {
1181 ram_release_pages(block->idstr, offset, pages);
1183 } else {
1184 pages = save_zero_page(rs, block, offset);
1185 if (pages == -1) {
1186 pages = compress_page_with_multi_thread(rs, block, offset);
1187 } else {
1188 ram_release_pages(block->idstr, offset, pages);
1193 return pages;
1197 * find_dirty_block: find the next dirty page and update any state
1198 * associated with the search process.
1200 * Returns if a page is found
1202 * @rs: current RAM state
1203 * @pss: data about the state of the current dirty page scan
1204 * @again: set to false if the search has scanned the whole of RAM
1206 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1208 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1209 if (pss->complete_round && pss->block == rs->last_seen_block &&
1210 pss->page >= rs->last_page) {
1212 * We've been once around the RAM and haven't found anything.
1213 * Give up.
1215 *again = false;
1216 return false;
1218 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1219 /* Didn't find anything in this RAM Block */
1220 pss->page = 0;
1221 pss->block = QLIST_NEXT_RCU(pss->block, next);
1222 if (!pss->block) {
1223 /* Hit the end of the list */
1224 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1225 /* Flag that we've looped */
1226 pss->complete_round = true;
1227 rs->ram_bulk_stage = false;
1228 if (migrate_use_xbzrle()) {
1229 /* If xbzrle is on, stop using the data compression at this
1230 * point. In theory, xbzrle can do better than compression.
1232 flush_compressed_data(rs);
1235 /* Didn't find anything this time, but try again on the new block */
1236 *again = true;
1237 return false;
1238 } else {
1239 /* Can go around again, but... */
1240 *again = true;
1241 /* We've found something so probably don't need to */
1242 return true;
1247 * unqueue_page: gets a page of the queue
1249 * Helper for 'get_queued_page' - gets a page off the queue
1251 * Returns the block of the page (or NULL if none available)
1253 * @rs: current RAM state
1254 * @offset: used to return the offset within the RAMBlock
1256 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1258 RAMBlock *block = NULL;
1260 qemu_mutex_lock(&rs->src_page_req_mutex);
1261 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1262 struct RAMSrcPageRequest *entry =
1263 QSIMPLEQ_FIRST(&rs->src_page_requests);
1264 block = entry->rb;
1265 *offset = entry->offset;
1267 if (entry->len > TARGET_PAGE_SIZE) {
1268 entry->len -= TARGET_PAGE_SIZE;
1269 entry->offset += TARGET_PAGE_SIZE;
1270 } else {
1271 memory_region_unref(block->mr);
1272 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1273 g_free(entry);
1276 qemu_mutex_unlock(&rs->src_page_req_mutex);
1278 return block;
1282 * get_queued_page: unqueue a page from the postocpy requests
1284 * Skips pages that are already sent (!dirty)
1286 * Returns if a queued page is found
1288 * @rs: current RAM state
1289 * @pss: data about the state of the current dirty page scan
1291 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1293 RAMBlock *block;
1294 ram_addr_t offset;
1295 bool dirty;
1297 do {
1298 block = unqueue_page(rs, &offset);
1300 * We're sending this page, and since it's postcopy nothing else
1301 * will dirty it, and we must make sure it doesn't get sent again
1302 * even if this queue request was received after the background
1303 * search already sent it.
1305 if (block) {
1306 unsigned long page;
1308 page = offset >> TARGET_PAGE_BITS;
1309 dirty = test_bit(page, block->bmap);
1310 if (!dirty) {
1311 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1312 page, test_bit(page, block->unsentmap));
1313 } else {
1314 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1318 } while (block && !dirty);
1320 if (block) {
1322 * As soon as we start servicing pages out of order, then we have
1323 * to kill the bulk stage, since the bulk stage assumes
1324 * in (migration_bitmap_find_and_reset_dirty) that every page is
1325 * dirty, that's no longer true.
1327 rs->ram_bulk_stage = false;
1330 * We want the background search to continue from the queued page
1331 * since the guest is likely to want other pages near to the page
1332 * it just requested.
1334 pss->block = block;
1335 pss->page = offset >> TARGET_PAGE_BITS;
1338 return !!block;
1342 * migration_page_queue_free: drop any remaining pages in the ram
1343 * request queue
1345 * It should be empty at the end anyway, but in error cases there may
1346 * be some left. in case that there is any page left, we drop it.
1349 static void migration_page_queue_free(RAMState *rs)
1351 struct RAMSrcPageRequest *mspr, *next_mspr;
1352 /* This queue generally should be empty - but in the case of a failed
1353 * migration might have some droppings in.
1355 rcu_read_lock();
1356 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1357 memory_region_unref(mspr->rb->mr);
1358 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1359 g_free(mspr);
1361 rcu_read_unlock();
1365 * ram_save_queue_pages: queue the page for transmission
1367 * A request from postcopy destination for example.
1369 * Returns zero on success or negative on error
1371 * @rbname: Name of the RAMBLock of the request. NULL means the
1372 * same that last one.
1373 * @start: starting address from the start of the RAMBlock
1374 * @len: length (in bytes) to send
1376 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1378 RAMBlock *ramblock;
1379 RAMState *rs = ram_state;
1381 ram_counters.postcopy_requests++;
1382 rcu_read_lock();
1383 if (!rbname) {
1384 /* Reuse last RAMBlock */
1385 ramblock = rs->last_req_rb;
1387 if (!ramblock) {
1389 * Shouldn't happen, we can't reuse the last RAMBlock if
1390 * it's the 1st request.
1392 error_report("ram_save_queue_pages no previous block");
1393 goto err;
1395 } else {
1396 ramblock = qemu_ram_block_by_name(rbname);
1398 if (!ramblock) {
1399 /* We shouldn't be asked for a non-existent RAMBlock */
1400 error_report("ram_save_queue_pages no block '%s'", rbname);
1401 goto err;
1403 rs->last_req_rb = ramblock;
1405 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1406 if (start+len > ramblock->used_length) {
1407 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1408 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1409 __func__, start, len, ramblock->used_length);
1410 goto err;
1413 struct RAMSrcPageRequest *new_entry =
1414 g_malloc0(sizeof(struct RAMSrcPageRequest));
1415 new_entry->rb = ramblock;
1416 new_entry->offset = start;
1417 new_entry->len = len;
1419 memory_region_ref(ramblock->mr);
1420 qemu_mutex_lock(&rs->src_page_req_mutex);
1421 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1422 qemu_mutex_unlock(&rs->src_page_req_mutex);
1423 rcu_read_unlock();
1425 return 0;
1427 err:
1428 rcu_read_unlock();
1429 return -1;
1433 * ram_save_target_page: save one target page
1435 * Returns the number of pages written
1437 * @rs: current RAM state
1438 * @ms: current migration state
1439 * @pss: data about the page we want to send
1440 * @last_stage: if we are at the completion stage
1442 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1443 bool last_stage)
1445 int res = 0;
1447 /* Check the pages is dirty and if it is send it */
1448 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1450 * If xbzrle is on, stop using the data compression after first
1451 * round of migration even if compression is enabled. In theory,
1452 * xbzrle can do better than compression.
1454 if (migrate_use_compression() &&
1455 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1456 res = ram_save_compressed_page(rs, pss, last_stage);
1457 } else {
1458 res = ram_save_page(rs, pss, last_stage);
1461 if (res < 0) {
1462 return res;
1464 if (pss->block->unsentmap) {
1465 clear_bit(pss->page, pss->block->unsentmap);
1469 return res;
1473 * ram_save_host_page: save a whole host page
1475 * Starting at *offset send pages up to the end of the current host
1476 * page. It's valid for the initial offset to point into the middle of
1477 * a host page in which case the remainder of the hostpage is sent.
1478 * Only dirty target pages are sent. Note that the host page size may
1479 * be a huge page for this block.
1480 * The saving stops at the boundary of the used_length of the block
1481 * if the RAMBlock isn't a multiple of the host page size.
1483 * Returns the number of pages written or negative on error
1485 * @rs: current RAM state
1486 * @ms: current migration state
1487 * @pss: data about the page we want to send
1488 * @last_stage: if we are at the completion stage
1490 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1491 bool last_stage)
1493 int tmppages, pages = 0;
1494 size_t pagesize_bits =
1495 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1497 do {
1498 tmppages = ram_save_target_page(rs, pss, last_stage);
1499 if (tmppages < 0) {
1500 return tmppages;
1503 pages += tmppages;
1504 pss->page++;
1505 } while ((pss->page & (pagesize_bits - 1)) &&
1506 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1508 /* The offset we leave with is the last one we looked at */
1509 pss->page--;
1510 return pages;
1514 * ram_find_and_save_block: finds a dirty page and sends it to f
1516 * Called within an RCU critical section.
1518 * Returns the number of pages written where zero means no dirty pages
1520 * @rs: current RAM state
1521 * @last_stage: if we are at the completion stage
1523 * On systems where host-page-size > target-page-size it will send all the
1524 * pages in a host page that are dirty.
1527 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1529 PageSearchStatus pss;
1530 int pages = 0;
1531 bool again, found;
1533 /* No dirty page as there is zero RAM */
1534 if (!ram_bytes_total()) {
1535 return pages;
1538 pss.block = rs->last_seen_block;
1539 pss.page = rs->last_page;
1540 pss.complete_round = false;
1542 if (!pss.block) {
1543 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1546 do {
1547 again = true;
1548 found = get_queued_page(rs, &pss);
1550 if (!found) {
1551 /* priority queue empty, so just search for something dirty */
1552 found = find_dirty_block(rs, &pss, &again);
1555 if (found) {
1556 pages = ram_save_host_page(rs, &pss, last_stage);
1558 } while (!pages && again);
1560 rs->last_seen_block = pss.block;
1561 rs->last_page = pss.page;
1563 return pages;
1566 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1568 uint64_t pages = size / TARGET_PAGE_SIZE;
1570 if (zero) {
1571 ram_counters.duplicate += pages;
1572 } else {
1573 ram_counters.normal += pages;
1574 ram_counters.transferred += size;
1575 qemu_update_position(f, size);
1579 uint64_t ram_bytes_total(void)
1581 RAMBlock *block;
1582 uint64_t total = 0;
1584 rcu_read_lock();
1585 RAMBLOCK_FOREACH(block) {
1586 total += block->used_length;
1588 rcu_read_unlock();
1589 return total;
1592 static void xbzrle_load_setup(void)
1594 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1597 static void xbzrle_load_cleanup(void)
1599 g_free(XBZRLE.decoded_buf);
1600 XBZRLE.decoded_buf = NULL;
1603 static void ram_state_cleanup(RAMState **rsp)
1605 if (*rsp) {
1606 migration_page_queue_free(*rsp);
1607 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1608 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1609 g_free(*rsp);
1610 *rsp = NULL;
1614 static void xbzrle_cleanup(void)
1616 XBZRLE_cache_lock();
1617 if (XBZRLE.cache) {
1618 cache_fini(XBZRLE.cache);
1619 g_free(XBZRLE.encoded_buf);
1620 g_free(XBZRLE.current_buf);
1621 g_free(XBZRLE.zero_target_page);
1622 XBZRLE.cache = NULL;
1623 XBZRLE.encoded_buf = NULL;
1624 XBZRLE.current_buf = NULL;
1625 XBZRLE.zero_target_page = NULL;
1627 XBZRLE_cache_unlock();
1630 static void ram_save_cleanup(void *opaque)
1632 RAMState **rsp = opaque;
1633 RAMBlock *block;
1635 /* caller have hold iothread lock or is in a bh, so there is
1636 * no writing race against this migration_bitmap
1638 memory_global_dirty_log_stop();
1640 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1641 g_free(block->bmap);
1642 block->bmap = NULL;
1643 g_free(block->unsentmap);
1644 block->unsentmap = NULL;
1647 xbzrle_cleanup();
1648 compress_threads_save_cleanup();
1649 ram_state_cleanup(rsp);
1652 static void ram_state_reset(RAMState *rs)
1654 rs->last_seen_block = NULL;
1655 rs->last_sent_block = NULL;
1656 rs->last_page = 0;
1657 rs->last_version = ram_list.version;
1658 rs->ram_bulk_stage = true;
1661 #define MAX_WAIT 50 /* ms, half buffered_file limit */
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * Set bits are printed as '1', clear bits as '.'.
 *
 * @todump: bitmap to dump; a NULL pointer is ignored (nothing is printed)
 * @expected: the value most bits are expected to have
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { DUMP_LINE_BITS = 128 };
    char linebuf[DUMP_LINE_BITS + 1];
    unsigned long cur = 0;

    if (!todump) {
        /* There is no bitmap to fall back to; do not dereference NULL. */
        return;
    }

    while (cur < pages) {
        unsigned long remaining = pages - cur;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        unsigned long linelen =
            remaining < DUMP_LINE_BITS ? remaining : DUMP_LINE_BITS;
        unsigned long curb;
        bool found = false;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }

        /* linelen <= remaining, so this can never step past 'pages'. */
        cur += linelen;
    }
}
1697 /* **** functions for postcopy ***** */
1699 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1701 struct RAMBlock *block;
1703 RAMBLOCK_FOREACH(block) {
1704 unsigned long *bitmap = block->bmap;
1705 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1706 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1708 while (run_start < range) {
1709 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1710 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1711 (run_end - run_start) << TARGET_PAGE_BITS);
1712 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1718 * postcopy_send_discard_bm_ram: discard a RAMBlock
1720 * Returns zero on success
1722 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1723 * Note: At this point the 'unsentmap' is the processed bitmap combined
1724 * with the dirtymap; so a '1' means it's either dirty or unsent.
1726 * @ms: current migration state
1727 * @pds: state for postcopy
1728 * @start: RAMBlock starting page
1729 * @length: RAMBlock size
1731 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1732 PostcopyDiscardState *pds,
1733 RAMBlock *block)
1735 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1736 unsigned long current;
1737 unsigned long *unsentmap = block->unsentmap;
1739 for (current = 0; current < end; ) {
1740 unsigned long one = find_next_bit(unsentmap, end, current);
1742 if (one <= end) {
1743 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1744 unsigned long discard_length;
1746 if (zero >= end) {
1747 discard_length = end - one;
1748 } else {
1749 discard_length = zero - one;
1751 if (discard_length) {
1752 postcopy_discard_send_range(ms, pds, one, discard_length);
1754 current = one + discard_length;
1755 } else {
1756 current = one;
1760 return 0;
1764 * postcopy_each_ram_send_discard: discard all RAMBlocks
1766 * Returns 0 for success or negative for error
1768 * Utility for the outgoing postcopy code.
1769 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1770 * passing it bitmap indexes and name.
1771 * (qemu_ram_foreach_block ends up passing unscaled lengths
1772 * which would mean postcopy code would have to deal with target page)
1774 * @ms: current migration state
1776 static int postcopy_each_ram_send_discard(MigrationState *ms)
1778 struct RAMBlock *block;
1779 int ret;
1781 RAMBLOCK_FOREACH(block) {
1782 PostcopyDiscardState *pds =
1783 postcopy_discard_send_init(ms, block->idstr);
1786 * Postcopy sends chunks of bitmap over the wire, but it
1787 * just needs indexes at this point, avoids it having
1788 * target page specific code.
1790 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1791 postcopy_discard_send_finish(ms, pds);
1792 if (ret) {
1793 return ret;
1797 return 0;
1801 * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1803 * Helper for postcopy_chunk_hostpages; it's called twice to
1804 * canonicalize the two bitmaps, that are similar, but one is
1805 * inverted.
1807 * Postcopy requires that all target pages in a hostpage are dirty or
1808 * clean, not a mix. This function canonicalizes the bitmaps.
1810 * @ms: current migration state
1811 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1812 * otherwise we need to canonicalize partially dirty host pages
1813 * @block: block that contains the page we want to canonicalize
1814 * @pds: state for postcopy
1816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1817 RAMBlock *block,
1818 PostcopyDiscardState *pds)
1820 RAMState *rs = ram_state;
1821 unsigned long *bitmap = block->bmap;
1822 unsigned long *unsentmap = block->unsentmap;
1823 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1824 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1825 unsigned long run_start;
1827 if (block->page_size == TARGET_PAGE_SIZE) {
1828 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1829 return;
1832 if (unsent_pass) {
1833 /* Find a sent page */
1834 run_start = find_next_zero_bit(unsentmap, pages, 0);
1835 } else {
1836 /* Find a dirty page */
1837 run_start = find_next_bit(bitmap, pages, 0);
1840 while (run_start < pages) {
1841 bool do_fixup = false;
1842 unsigned long fixup_start_addr;
1843 unsigned long host_offset;
1846 * If the start of this run of pages is in the middle of a host
1847 * page, then we need to fixup this host page.
1849 host_offset = run_start % host_ratio;
1850 if (host_offset) {
1851 do_fixup = true;
1852 run_start -= host_offset;
1853 fixup_start_addr = run_start;
1854 /* For the next pass */
1855 run_start = run_start + host_ratio;
1856 } else {
1857 /* Find the end of this run */
1858 unsigned long run_end;
1859 if (unsent_pass) {
1860 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1861 } else {
1862 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1865 * If the end isn't at the start of a host page, then the
1866 * run doesn't finish at the end of a host page
1867 * and we need to discard.
1869 host_offset = run_end % host_ratio;
1870 if (host_offset) {
1871 do_fixup = true;
1872 fixup_start_addr = run_end - host_offset;
1874 * This host page has gone, the next loop iteration starts
1875 * from after the fixup
1877 run_start = fixup_start_addr + host_ratio;
1878 } else {
1880 * No discards on this iteration, next loop starts from
1881 * next sent/dirty page
1883 run_start = run_end + 1;
1887 if (do_fixup) {
1888 unsigned long page;
1890 /* Tell the destination to discard this page */
1891 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1892 /* For the unsent_pass we:
1893 * discard partially sent pages
1894 * For the !unsent_pass (dirty) we:
1895 * discard partially dirty pages that were sent
1896 * (any partially sent pages were already discarded
1897 * by the previous unsent_pass)
1899 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1900 host_ratio);
1903 /* Clean up the bitmap */
1904 for (page = fixup_start_addr;
1905 page < fixup_start_addr + host_ratio; page++) {
1906 /* All pages in this host page are now not sent */
1907 set_bit(page, unsentmap);
1910 * Remark them as dirty, updating the count for any pages
1911 * that weren't previously dirty.
1913 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1917 if (unsent_pass) {
1918 /* Find the next sent page for the next iteration */
1919 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1920 } else {
1921 /* Find the next dirty page for the next iteration */
1922 run_start = find_next_bit(bitmap, pages, run_start);
1928 * postcopy_chuck_hostpages: discrad any partially sent host page
1930 * Utility for the outgoing postcopy code.
1932 * Discard any partially sent host-page size chunks, mark any partially
1933 * dirty host-page size chunks as all dirty. In this case the host-page
1934 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1936 * Returns zero on success
1938 * @ms: current migration state
1939 * @block: block we want to work with
1941 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1943 PostcopyDiscardState *pds =
1944 postcopy_discard_send_init(ms, block->idstr);
1946 /* First pass: Discard all partially sent host pages */
1947 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1949 * Second pass: Ensure that all partially dirty host pages are made
1950 * fully dirty.
1952 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1954 postcopy_discard_send_finish(ms, pds);
1955 return 0;
1959 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1961 * Returns zero on success
1963 * Transmit the set of pages to be discarded after precopy to the target
1964 * these are pages that:
1965 * a) Have been previously transmitted but are now dirty again
1966 * b) Pages that have never been transmitted, this ensures that
1967 * any pages on the destination that have been mapped by background
1968 * tasks get discarded (transparent huge pages is the specific concern)
1969 * Hopefully this is pretty sparse
1971 * @ms: current migration state
1973 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1975 RAMState *rs = ram_state;
1976 RAMBlock *block;
1977 int ret;
1979 rcu_read_lock();
1981 /* This should be our last sync, the src is now paused */
1982 migration_bitmap_sync(rs);
1984 /* Easiest way to make sure we don't resume in the middle of a host-page */
1985 rs->last_seen_block = NULL;
1986 rs->last_sent_block = NULL;
1987 rs->last_page = 0;
1989 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1990 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1991 unsigned long *bitmap = block->bmap;
1992 unsigned long *unsentmap = block->unsentmap;
1994 if (!unsentmap) {
1995 /* We don't have a safe way to resize the sentmap, so
1996 * if the bitmap was resized it will be NULL at this
1997 * point.
1999 error_report("migration ram resized during precopy phase");
2000 rcu_read_unlock();
2001 return -EINVAL;
2003 /* Deal with TPS != HPS and huge pages */
2004 ret = postcopy_chunk_hostpages(ms, block);
2005 if (ret) {
2006 rcu_read_unlock();
2007 return ret;
2011 * Update the unsentmap to be unsentmap = unsentmap | dirty
2013 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2014 #ifdef DEBUG_POSTCOPY
2015 ram_debug_dump_bitmap(unsentmap, true, pages);
2016 #endif
2018 trace_ram_postcopy_send_discard_bitmap();
2020 ret = postcopy_each_ram_send_discard(ms);
2021 rcu_read_unlock();
2023 return ret;
2027 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2029 * Returns zero on success
2031 * @rbname: name of the RAMBlock of the request. NULL means the
2032 * same that last one.
2033 * @start: RAMBlock starting page
2034 * @length: RAMBlock size
2036 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2038 int ret = -1;
2040 trace_ram_discard_range(rbname, start, length);
2042 rcu_read_lock();
2043 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2045 if (!rb) {
2046 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2047 goto err;
2050 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2051 length >> qemu_target_page_bits());
2052 ret = ram_block_discard_range(rb, start, length);
2054 err:
2055 rcu_read_unlock();
2057 return ret;
2061 * For every allocation, we will try not to crash the VM if the
2062 * allocation failed.
2064 static int xbzrle_init(void)
2066 Error *local_err = NULL;
2068 if (!migrate_use_xbzrle()) {
2069 return 0;
2072 XBZRLE_cache_lock();
2074 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2075 if (!XBZRLE.zero_target_page) {
2076 error_report("%s: Error allocating zero page", __func__);
2077 goto err_out;
2080 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2081 TARGET_PAGE_SIZE, &local_err);
2082 if (!XBZRLE.cache) {
2083 error_report_err(local_err);
2084 goto free_zero_page;
2087 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2088 if (!XBZRLE.encoded_buf) {
2089 error_report("%s: Error allocating encoded_buf", __func__);
2090 goto free_cache;
2093 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2094 if (!XBZRLE.current_buf) {
2095 error_report("%s: Error allocating current_buf", __func__);
2096 goto free_encoded_buf;
2099 /* We are all good */
2100 XBZRLE_cache_unlock();
2101 return 0;
2103 free_encoded_buf:
2104 g_free(XBZRLE.encoded_buf);
2105 XBZRLE.encoded_buf = NULL;
2106 free_cache:
2107 cache_fini(XBZRLE.cache);
2108 XBZRLE.cache = NULL;
2109 free_zero_page:
2110 g_free(XBZRLE.zero_target_page);
2111 XBZRLE.zero_target_page = NULL;
2112 err_out:
2113 XBZRLE_cache_unlock();
2114 return -ENOMEM;
2117 static int ram_state_init(RAMState **rsp)
2119 *rsp = g_try_new0(RAMState, 1);
2121 if (!*rsp) {
2122 error_report("%s: Init ramstate fail", __func__);
2123 return -1;
2126 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2127 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2128 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2131 * Count the total number of pages used by ram blocks not including any
2132 * gaps due to alignment or unplugs.
2134 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2136 ram_state_reset(*rsp);
2138 return 0;
2141 static void ram_list_init_bitmaps(void)
2143 RAMBlock *block;
2144 unsigned long pages;
2146 /* Skip setting bitmap if there is no RAM */
2147 if (ram_bytes_total()) {
2148 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2149 pages = block->max_length >> TARGET_PAGE_BITS;
2150 block->bmap = bitmap_new(pages);
2151 bitmap_set(block->bmap, 0, pages);
2152 if (migrate_postcopy_ram()) {
2153 block->unsentmap = bitmap_new(pages);
2154 bitmap_set(block->unsentmap, 0, pages);
2160 static void ram_init_bitmaps(RAMState *rs)
2162 /* For memory_global_dirty_log_start below. */
2163 qemu_mutex_lock_iothread();
2164 qemu_mutex_lock_ramlist();
2165 rcu_read_lock();
2167 ram_list_init_bitmaps();
2168 memory_global_dirty_log_start();
2169 migration_bitmap_sync(rs);
2171 rcu_read_unlock();
2172 qemu_mutex_unlock_ramlist();
2173 qemu_mutex_unlock_iothread();
2176 static int ram_init_all(RAMState **rsp)
2178 if (ram_state_init(rsp)) {
2179 return -1;
2182 if (xbzrle_init()) {
2183 ram_state_cleanup(rsp);
2184 return -1;
2187 ram_init_bitmaps(*rsp);
2189 return 0;
2193 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2194 * long-running RCU critical section. When rcu-reclaims in the code
2195 * start to become numerous it will be necessary to reduce the
2196 * granularity of these critical sections.
2200 * ram_save_setup: Setup RAM for migration
2202 * Returns zero to indicate success and negative for error
2204 * @f: QEMUFile where to send the data
2205 * @opaque: RAMState pointer
2207 static int ram_save_setup(QEMUFile *f, void *opaque)
2209 RAMState **rsp = opaque;
2210 RAMBlock *block;
2212 /* migration has already setup the bitmap, reuse it. */
2213 if (!migration_in_colo_state()) {
2214 if (ram_init_all(rsp) != 0) {
2215 return -1;
2218 (*rsp)->f = f;
2220 rcu_read_lock();
2222 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2224 RAMBLOCK_FOREACH(block) {
2225 qemu_put_byte(f, strlen(block->idstr));
2226 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2227 qemu_put_be64(f, block->used_length);
2228 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2229 qemu_put_be64(f, block->page_size);
2233 rcu_read_unlock();
2234 compress_threads_save_setup();
2236 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2237 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2239 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2241 return 0;
2245 * ram_save_iterate: iterative stage for migration
2247 * Returns zero to indicate success and negative for error
2249 * @f: QEMUFile where to send the data
2250 * @opaque: RAMState pointer
2252 static int ram_save_iterate(QEMUFile *f, void *opaque)
2254 RAMState **temp = opaque;
2255 RAMState *rs = *temp;
2256 int ret;
2257 int i;
2258 int64_t t0;
2259 int done = 0;
2261 if (blk_mig_bulk_active()) {
2262 /* Avoid transferring ram during bulk phase of block migration as
2263 * the bulk phase will usually take a long time and transferring
2264 * ram updates during that time is pointless. */
2265 goto out;
2268 rcu_read_lock();
2269 if (ram_list.version != rs->last_version) {
2270 ram_state_reset(rs);
2273 /* Read version before ram_list.blocks */
2274 smp_rmb();
2276 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2278 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2279 i = 0;
2280 while ((ret = qemu_file_rate_limit(f)) == 0) {
2281 int pages;
2283 pages = ram_find_and_save_block(rs, false);
2284 /* no more pages to sent */
2285 if (pages == 0) {
2286 done = 1;
2287 break;
2289 rs->iterations++;
2291 /* we want to check in the 1st loop, just in case it was the 1st time
2292 and we had to sync the dirty bitmap.
2293 qemu_get_clock_ns() is a bit expensive, so we only check each some
2294 iterations
2296 if ((i & 63) == 0) {
2297 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2298 if (t1 > MAX_WAIT) {
2299 trace_ram_save_iterate_big_wait(t1, i);
2300 break;
2303 i++;
2305 flush_compressed_data(rs);
2306 rcu_read_unlock();
2309 * Must occur before EOS (or any QEMUFile operation)
2310 * because of RDMA protocol.
2312 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2314 out:
2315 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2316 ram_counters.transferred += 8;
2318 ret = qemu_file_get_error(f);
2319 if (ret < 0) {
2320 return ret;
2323 return done;
2327 * ram_save_complete: function called to send the remaining amount of ram
2329 * Returns zero to indicate success
2331 * Called with iothread lock
2333 * @f: QEMUFile where to send the data
2334 * @opaque: RAMState pointer
2336 static int ram_save_complete(QEMUFile *f, void *opaque)
2338 RAMState **temp = opaque;
2339 RAMState *rs = *temp;
2341 rcu_read_lock();
2343 if (!migration_in_postcopy()) {
2344 migration_bitmap_sync(rs);
2347 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2349 /* try transferring iterative blocks of memory */
2351 /* flush all remaining blocks regardless of rate limiting */
2352 while (true) {
2353 int pages;
2355 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2356 /* no more blocks to sent */
2357 if (pages == 0) {
2358 break;
2362 flush_compressed_data(rs);
2363 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2365 rcu_read_unlock();
2367 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2369 return 0;
2372 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2373 uint64_t *res_precopy_only,
2374 uint64_t *res_compatible,
2375 uint64_t *res_postcopy_only)
2377 RAMState **temp = opaque;
2378 RAMState *rs = *temp;
2379 uint64_t remaining_size;
2381 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2383 if (!migration_in_postcopy() &&
2384 remaining_size < max_size) {
2385 qemu_mutex_lock_iothread();
2386 rcu_read_lock();
2387 migration_bitmap_sync(rs);
2388 rcu_read_unlock();
2389 qemu_mutex_unlock_iothread();
2390 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2393 if (migrate_postcopy_ram()) {
2394 /* We can do postcopy, and all the data is postcopiable */
2395 *res_compatible += remaining_size;
2396 } else {
2397 *res_precopy_only += remaining_size;
2401 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2403 unsigned int xh_len;
2404 int xh_flags;
2405 uint8_t *loaded_data;
2407 /* extract RLE header */
2408 xh_flags = qemu_get_byte(f);
2409 xh_len = qemu_get_be16(f);
2411 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2412 error_report("Failed to load XBZRLE page - wrong compression!");
2413 return -1;
2416 if (xh_len > TARGET_PAGE_SIZE) {
2417 error_report("Failed to load XBZRLE page - len overflow!");
2418 return -1;
2420 loaded_data = XBZRLE.decoded_buf;
2421 /* load data and decode */
2422 /* it can change loaded_data to point to an internal buffer */
2423 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2425 /* decode RLE */
2426 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2427 TARGET_PAGE_SIZE) == -1) {
2428 error_report("Failed to load XBZRLE page - decode error!");
2429 return -1;
2432 return 0;
2436 * ram_block_from_stream: read a RAMBlock id from the migration stream
2438 * Must be called from within a rcu critical section.
2440 * Returns a pointer from within the RCU-protected ram_list.
2442 * @f: QEMUFile where to read the data from
2443 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2445 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2447 static RAMBlock *block = NULL;
2448 char id[256];
2449 uint8_t len;
2451 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2452 if (!block) {
2453 error_report("Ack, bad migration stream!");
2454 return NULL;
2456 return block;
2459 len = qemu_get_byte(f);
2460 qemu_get_buffer(f, (uint8_t *)id, len);
2461 id[len] = 0;
2463 block = qemu_ram_block_by_name(id);
2464 if (!block) {
2465 error_report("Can't find block %s", id);
2466 return NULL;
2469 return block;
2472 static inline void *host_from_ram_block_offset(RAMBlock *block,
2473 ram_addr_t offset)
2475 if (!offset_in_ramblock(block, offset)) {
2476 return NULL;
2479 return block->host + offset;
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when @ch is zero and the range already
 * reads as zero, which avoids writing to it needlessly.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already all zeroes: nothing to do */
        return;
    }

    memset(host, ch, size);
}
2499 static void *do_data_decompress(void *opaque)
2501 DecompressParam *param = opaque;
2502 unsigned long pagesize;
2503 uint8_t *des;
2504 int len;
2506 qemu_mutex_lock(&param->mutex);
2507 while (!param->quit) {
2508 if (param->des) {
2509 des = param->des;
2510 len = param->len;
2511 param->des = 0;
2512 qemu_mutex_unlock(&param->mutex);
2514 pagesize = TARGET_PAGE_SIZE;
2515 /* uncompress() will return failed in some case, especially
2516 * when the page is dirted when doing the compression, it's
2517 * not a problem because the dirty page will be retransferred
2518 * and uncompress() won't break the data in other pages.
2520 uncompress((Bytef *)des, &pagesize,
2521 (const Bytef *)param->compbuf, len);
2523 qemu_mutex_lock(&decomp_done_lock);
2524 param->done = true;
2525 qemu_cond_signal(&decomp_done_cond);
2526 qemu_mutex_unlock(&decomp_done_lock);
2528 qemu_mutex_lock(&param->mutex);
2529 } else {
2530 qemu_cond_wait(&param->cond, &param->mutex);
2533 qemu_mutex_unlock(&param->mutex);
2535 return NULL;
2538 static void wait_for_decompress_done(void)
2540 int idx, thread_count;
2542 if (!migrate_use_compression()) {
2543 return;
2546 thread_count = migrate_decompress_threads();
2547 qemu_mutex_lock(&decomp_done_lock);
2548 for (idx = 0; idx < thread_count; idx++) {
2549 while (!decomp_param[idx].done) {
2550 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2553 qemu_mutex_unlock(&decomp_done_lock);
2556 static void compress_threads_load_setup(void)
2558 int i, thread_count;
2560 if (!migrate_use_compression()) {
2561 return;
2563 thread_count = migrate_decompress_threads();
2564 decompress_threads = g_new0(QemuThread, thread_count);
2565 decomp_param = g_new0(DecompressParam, thread_count);
2566 qemu_mutex_init(&decomp_done_lock);
2567 qemu_cond_init(&decomp_done_cond);
2568 for (i = 0; i < thread_count; i++) {
2569 qemu_mutex_init(&decomp_param[i].mutex);
2570 qemu_cond_init(&decomp_param[i].cond);
2571 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2572 decomp_param[i].done = true;
2573 decomp_param[i].quit = false;
2574 qemu_thread_create(decompress_threads + i, "decompress",
2575 do_data_decompress, decomp_param + i,
2576 QEMU_THREAD_JOINABLE);
2580 static void compress_threads_load_cleanup(void)
2582 int i, thread_count;
2584 if (!migrate_use_compression()) {
2585 return;
2587 thread_count = migrate_decompress_threads();
2588 for (i = 0; i < thread_count; i++) {
2589 qemu_mutex_lock(&decomp_param[i].mutex);
2590 decomp_param[i].quit = true;
2591 qemu_cond_signal(&decomp_param[i].cond);
2592 qemu_mutex_unlock(&decomp_param[i].mutex);
2594 for (i = 0; i < thread_count; i++) {
2595 qemu_thread_join(decompress_threads + i);
2596 qemu_mutex_destroy(&decomp_param[i].mutex);
2597 qemu_cond_destroy(&decomp_param[i].cond);
2598 g_free(decomp_param[i].compbuf);
2600 g_free(decompress_threads);
2601 g_free(decomp_param);
2602 decompress_threads = NULL;
2603 decomp_param = NULL;
2606 static void decompress_data_with_multi_threads(QEMUFile *f,
2607 void *host, int len)
2609 int idx, thread_count;
2611 thread_count = migrate_decompress_threads();
2612 qemu_mutex_lock(&decomp_done_lock);
2613 while (true) {
2614 for (idx = 0; idx < thread_count; idx++) {
2615 if (decomp_param[idx].done) {
2616 decomp_param[idx].done = false;
2617 qemu_mutex_lock(&decomp_param[idx].mutex);
2618 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2619 decomp_param[idx].des = host;
2620 decomp_param[idx].len = len;
2621 qemu_cond_signal(&decomp_param[idx].cond);
2622 qemu_mutex_unlock(&decomp_param[idx].mutex);
2623 break;
2626 if (idx < thread_count) {
2627 break;
2628 } else {
2629 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2632 qemu_mutex_unlock(&decomp_done_lock);
2636 * ram_load_setup: Setup RAM for migration incoming side
2638 * Returns zero to indicate success and negative for error
2640 * @f: QEMUFile where to receive the data
2641 * @opaque: RAMState pointer
2643 static int ram_load_setup(QEMUFile *f, void *opaque)
2645 xbzrle_load_setup();
2646 compress_threads_load_setup();
2647 ramblock_recv_map_init();
2648 return 0;
2651 static int ram_load_cleanup(void *opaque)
2653 RAMBlock *rb;
2654 xbzrle_load_cleanup();
2655 compress_threads_load_cleanup();
2657 RAMBLOCK_FOREACH(rb) {
2658 g_free(rb->receivedmap);
2659 rb->receivedmap = NULL;
2661 return 0;
2665 * ram_postcopy_incoming_init: allocate postcopy data structures
2667 * Returns 0 for success and negative if there was one error
2669 * @mis: current migration incoming state
2671 * Allocate data structures etc needed by incoming migration with
2672 * postcopy-ram. postcopy-ram's similarly names
2673 * postcopy_ram_incoming_init does the work.
2675 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2677 unsigned long ram_pages = last_ram_page();
2679 return postcopy_ram_incoming_init(mis, ram_pages);
2683 * ram_load_postcopy: load a page in postcopy case
2685 * Returns 0 for success or -errno in case of error
2687 * Called in postcopy mode by ram_load().
2688 * rcu_read_lock is taken prior to this being called.
2690 * @f: QEMUFile where to send the data
2692 static int ram_load_postcopy(QEMUFile *f)
2694 int flags = 0, ret = 0;
2695 bool place_needed = false;
2696 bool matching_page_sizes = false;
2697 MigrationIncomingState *mis = migration_incoming_get_current();
2698 /* Temporary page that is later 'placed' */
2699 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2700 void *last_host = NULL;
2701 bool all_zero = false;
2703 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2704 ram_addr_t addr;
2705 void *host = NULL;
2706 void *page_buffer = NULL;
2707 void *place_source = NULL;
2708 RAMBlock *block = NULL;
2709 uint8_t ch;
2711 addr = qemu_get_be64(f);
2714 * If qemu file error, we should stop here, and then "addr"
2715 * may be invalid
2717 ret = qemu_file_get_error(f);
2718 if (ret) {
2719 break;
2722 flags = addr & ~TARGET_PAGE_MASK;
2723 addr &= TARGET_PAGE_MASK;
2725 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2726 place_needed = false;
2727 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2728 block = ram_block_from_stream(f, flags);
2730 host = host_from_ram_block_offset(block, addr);
2731 if (!host) {
2732 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2733 ret = -EINVAL;
2734 break;
2736 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2738 * Postcopy requires that we place whole host pages atomically;
2739 * these may be huge pages for RAMBlocks that are backed by
2740 * hugetlbfs.
2741 * To make it atomic, the data is read into a temporary page
2742 * that's moved into place later.
2743 * The migration protocol uses, possibly smaller, target-pages
2744 * however the source ensures it always sends all the components
2745 * of a host page in order.
2747 page_buffer = postcopy_host_page +
2748 ((uintptr_t)host & (block->page_size - 1));
2749 /* If all TP are zero then we can optimise the place */
2750 if (!((uintptr_t)host & (block->page_size - 1))) {
2751 all_zero = true;
2752 } else {
2753 /* not the 1st TP within the HP */
2754 if (host != (last_host + TARGET_PAGE_SIZE)) {
2755 error_report("Non-sequential target page %p/%p",
2756 host, last_host);
2757 ret = -EINVAL;
2758 break;
2764 * If it's the last part of a host page then we place the host
2765 * page
2767 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2768 (block->page_size - 1)) == 0;
2769 place_source = postcopy_host_page;
2771 last_host = host;
2773 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2774 case RAM_SAVE_FLAG_ZERO:
2775 ch = qemu_get_byte(f);
2776 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2777 if (ch) {
2778 all_zero = false;
2780 break;
2782 case RAM_SAVE_FLAG_PAGE:
2783 all_zero = false;
2784 if (!place_needed || !matching_page_sizes) {
2785 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2786 } else {
2787 /* Avoids the qemu_file copy during postcopy, which is
2788 * going to do a copy later; can only do it when we
2789 * do this read in one go (matching page sizes)
2791 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2792 TARGET_PAGE_SIZE);
2794 break;
2795 case RAM_SAVE_FLAG_EOS:
2796 /* normal exit */
2797 break;
2798 default:
2799 error_report("Unknown combination of migration flags: %#x"
2800 " (postcopy mode)", flags);
2801 ret = -EINVAL;
2802 break;
2805 /* Detect for any possible file errors */
2806 if (!ret && qemu_file_get_error(f)) {
2807 ret = qemu_file_get_error(f);
2810 if (!ret && place_needed) {
2811 /* This gets called at the last target page in the host page */
2812 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2814 if (all_zero) {
2815 ret = postcopy_place_page_zero(mis, place_dest,
2816 block);
2817 } else {
2818 ret = postcopy_place_page(mis, place_dest,
2819 place_source, block);
2824 return ret;
2827 static bool postcopy_is_advised(void)
2829 PostcopyState ps = postcopy_state_get();
2830 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2833 static bool postcopy_is_running(void)
2835 PostcopyState ps = postcopy_state_get();
2836 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2839 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2841 int flags = 0, ret = 0, invalid_flags = 0;
2842 static uint64_t seq_iter;
2843 int len = 0;
2845 * If system is running in postcopy mode, page inserts to host memory must
2846 * be atomic
2848 bool postcopy_running = postcopy_is_running();
2849 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2850 bool postcopy_advised = postcopy_is_advised();
2852 seq_iter++;
2854 if (version_id != 4) {
2855 ret = -EINVAL;
2858 if (!migrate_use_compression()) {
2859 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2861 /* This RCU critical section can be very long running.
2862 * When RCU reclaims in the code start to become numerous,
2863 * it will be necessary to reduce the granularity of this
2864 * critical section.
2866 rcu_read_lock();
2868 if (postcopy_running) {
2869 ret = ram_load_postcopy(f);
2872 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2873 ram_addr_t addr, total_ram_bytes;
2874 void *host = NULL;
2875 uint8_t ch;
2877 addr = qemu_get_be64(f);
2878 flags = addr & ~TARGET_PAGE_MASK;
2879 addr &= TARGET_PAGE_MASK;
2881 if (flags & invalid_flags) {
2882 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2883 error_report("Received an unexpected compressed page");
2886 ret = -EINVAL;
2887 break;
2890 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2891 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2892 RAMBlock *block = ram_block_from_stream(f, flags);
2894 host = host_from_ram_block_offset(block, addr);
2895 if (!host) {
2896 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2897 ret = -EINVAL;
2898 break;
2900 ramblock_recv_bitmap_set(block, host);
2901 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2904 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2905 case RAM_SAVE_FLAG_MEM_SIZE:
2906 /* Synchronize RAM block list */
2907 total_ram_bytes = addr;
2908 while (!ret && total_ram_bytes) {
2909 RAMBlock *block;
2910 char id[256];
2911 ram_addr_t length;
2913 len = qemu_get_byte(f);
2914 qemu_get_buffer(f, (uint8_t *)id, len);
2915 id[len] = 0;
2916 length = qemu_get_be64(f);
2918 block = qemu_ram_block_by_name(id);
2919 if (block) {
2920 if (length != block->used_length) {
2921 Error *local_err = NULL;
2923 ret = qemu_ram_resize(block, length,
2924 &local_err);
2925 if (local_err) {
2926 error_report_err(local_err);
2929 /* For postcopy we need to check hugepage sizes match */
2930 if (postcopy_advised &&
2931 block->page_size != qemu_host_page_size) {
2932 uint64_t remote_page_size = qemu_get_be64(f);
2933 if (remote_page_size != block->page_size) {
2934 error_report("Mismatched RAM page size %s "
2935 "(local) %zd != %" PRId64,
2936 id, block->page_size,
2937 remote_page_size);
2938 ret = -EINVAL;
2941 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2942 block->idstr);
2943 } else {
2944 error_report("Unknown ramblock \"%s\", cannot "
2945 "accept migration", id);
2946 ret = -EINVAL;
2949 total_ram_bytes -= length;
2951 break;
2953 case RAM_SAVE_FLAG_ZERO:
2954 ch = qemu_get_byte(f);
2955 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2956 break;
2958 case RAM_SAVE_FLAG_PAGE:
2959 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2960 break;
2962 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2963 len = qemu_get_be32(f);
2964 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2965 error_report("Invalid compressed data length: %d", len);
2966 ret = -EINVAL;
2967 break;
2969 decompress_data_with_multi_threads(f, host, len);
2970 break;
2972 case RAM_SAVE_FLAG_XBZRLE:
2973 if (load_xbzrle(f, addr, host) < 0) {
2974 error_report("Failed to decompress XBZRLE page at "
2975 RAM_ADDR_FMT, addr);
2976 ret = -EINVAL;
2977 break;
2979 break;
2980 case RAM_SAVE_FLAG_EOS:
2981 /* normal exit */
2982 break;
2983 default:
2984 if (flags & RAM_SAVE_FLAG_HOOK) {
2985 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2986 } else {
2987 error_report("Unknown combination of migration flags: %#x",
2988 flags);
2989 ret = -EINVAL;
2992 if (!ret) {
2993 ret = qemu_file_get_error(f);
2997 wait_for_decompress_done();
2998 rcu_read_unlock();
2999 trace_ram_load_complete(ret, seq_iter);
3000 return ret;
/*
 * SaveVMHandlers.has_postcopy callback: reports whether postcopy-ram is
 * enabled.  @opaque is unused.
 */
static bool ram_has_postcopy(void *opaque)
{
    return migrate_postcopy_ram();
}
3008 static SaveVMHandlers savevm_ram_handlers = {
3009 .save_setup = ram_save_setup,
3010 .save_live_iterate = ram_save_iterate,
3011 .save_live_complete_postcopy = ram_save_complete,
3012 .save_live_complete_precopy = ram_save_complete,
3013 .has_postcopy = ram_has_postcopy,
3014 .save_live_pending = ram_save_pending,
3015 .load_state = ram_load,
3016 .save_cleanup = ram_save_cleanup,
3017 .load_setup = ram_load_setup,
3018 .load_cleanup = ram_load_cleanup,
3021 void ram_mig_init(void)
3023 qemu_mutex_init(&XBZRLE.lock);
3024 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);