/* qemu.git: migration/ram.c */
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include <zlib.h>
#include "qapi-event.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "migration/block.h"

/***********************************************************/
/* ram save/restore */
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
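
/*
 * Illustrative note (editor's sketch, not from the original source):
 * these flags travel in the low bits of the 64-bit page offset written
 * by save_page_header() below.  Page offsets are TARGET_PAGE_SIZE
 * aligned, so the low bits are always free.  For example, sending page
 * offset 0x2000 of the current block as a normal page puts a single
 * be64 on the wire:
 *
 *     qemu_put_be64(f, 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 *     // i.e. the value 0x2028
 */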
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from qmp_migrate_set_cache_size in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(int64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out:
    XBZRLE_cache_unlock();
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

struct CompressParam {
    bool done;
    bool quit;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

static void compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
/* Multiple fd's */

struct MultiFDSendParams {
    uint8_t id;
    char *name;
    QemuThread thread;
    QemuSemaphore sem;
    QemuMutex mutex;
    bool quit;
};
typedef struct MultiFDSendParams MultiFDSendParams;

struct {
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
} *multifd_send_state;

static void terminate_multifd_send_threads(Error *errp)
{
    int i;

    for (i = 0; i < multifd_send_state->count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_save_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    terminate_multifd_send_threads(NULL);
    for (i = 0; i < multifd_send_state->count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_thread_join(&p->thread);
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
    return ret;
}

static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

    return NULL;
}

int multifd_save_setup(void)
{
    int thread_count;
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
    multifd_send_state->count = 0;
    for (i = 0; i < thread_count; i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        p->quit = false;
        p->id = i;
        p->name = g_strdup_printf("multifdsend_%d", i);
        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                           QEMU_THREAD_JOINABLE);

        multifd_send_state->count++;
    }
    return 0;
}
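
/*
 * Observation (editor's note, not from the original source): at this
 * stage of the multifd work the send threads above, and the receive
 * threads below, are skeletons.  Each thread merely blocks on its
 * semaphore and exits once ->quit is set by the terminate helpers; no
 * page data flows over the extra channels yet.
 */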
struct MultiFDRecvParams {
    uint8_t id;
    char *name;
    QemuThread thread;
    QemuSemaphore sem;
    QemuMutex mutex;
    bool quit;
};
typedef struct MultiFDRecvParams MultiFDRecvParams;

struct {
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
} *multifd_recv_state;

static void terminate_multifd_recv_threads(Error *errp)
{
    int i;

    for (i = 0; i < multifd_recv_state->count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_lock(&p->mutex);
        p->quit = true;
        qemu_sem_post(&p->sem);
        qemu_mutex_unlock(&p->mutex);
    }
}

int multifd_load_cleanup(Error **errp)
{
    int i;
    int ret = 0;

    if (!migrate_use_multifd()) {
        return 0;
    }
    terminate_multifd_recv_threads(NULL);
    for (i = 0; i < multifd_recv_state->count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_thread_join(&p->thread);
        qemu_mutex_destroy(&p->mutex);
        qemu_sem_destroy(&p->sem);
        g_free(p->name);
        p->name = NULL;
    }
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state);
    multifd_recv_state = NULL;

    return ret;
}

static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;

    while (true) {
        qemu_mutex_lock(&p->mutex);
        if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        }
        qemu_mutex_unlock(&p->mutex);
        qemu_sem_wait(&p->sem);
    }

    return NULL;
}

int multifd_load_setup(void)
{
    int thread_count;
    uint8_t i;

    if (!migrate_use_multifd()) {
        return 0;
    }
    thread_count = migrate_multifd_channels();
    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
    multifd_recv_state->count = 0;
    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem, 0);
        p->quit = false;
        p->id = i;
        p->name = g_strdup_printf("multifdrecv_%d", i);
        qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                           QEMU_THREAD_JOINABLE);
        multifd_recv_state->count++;
    }
    return 0;
}
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
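
/*
 * Worked example (editor's sketch, not from the original source): the
 * first page sent from a block named "pc.ram" costs 8 (be64
 * offset+flags) + 1 (idstr length) + 6 (idstr) = 15 header bytes;
 * every following page from the same block carries
 * RAM_SAVE_FLAG_CONTINUE and costs just the 8-byte be64.
 */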
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}
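
/*
 * Example (editor's note; the parameter defaults of 20% initial and 10%
 * increment are an assumption based on the migration defaults of this
 * era, not something stated in this file): the first call throttles the
 * guest to 20%, and each later call while migration still fails to
 * converge raises it by 10 points - 20, 30, 40, ... up to the maximum
 * the throttle code allows.
 */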
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    xbzrle_counters.pages++;
    xbzrle_counters.bytes += bytes_xbzrle;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
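
/*
 * Wire layout sketch (editor's note, not from the original source): an
 * XBZRLE page is the usual page header with RAM_SAVE_FLAG_XBZRLE set,
 * then one byte of ENCODING_FLAG_XBZRLE, a be16 encoded length, and the
 * encoded bytes - which is why the accounting above adds
 * encoded_len + 1 + 2 on top of the header size.
 */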
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Called with rcu_read_lock() to protect migration_bitmap
 *
 * Returns the byte offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}
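
/*
 * Note (editor's observation, not from the original source): during the
 * bulk stage every page is still marked dirty, so "start + 1" skips the
 * bitmap scan entirely; the search only falls back to find_next_bit()
 * once the first full pass over RAM has completed.
 */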
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    return ret;
}

static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
                                        ram_addr_t start, ram_addr_t length)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
                                              &rs->num_dirty_pages_period);
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH(block) {
        summary |= block->page_size;
    }

    return summary;
}
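
/*
 * Example (editor's sketch, not from the original source): a guest with
 * normal 4 KiB pages plus a 2 MiB hugepage-backed block yields
 * 0x1000 | 0x200000 = 0x201000, so a non-power-of-two summary signals
 * mixed page sizes.
 */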
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        migration_bitmap_sync_range(rs, block, 0, block->used_length);
    }
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        /* calculate period counters */
        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
            / (end_time - rs->time_last_bitmap_sync);
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens twice, start or increase
               throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                trace_migration_throttle();
                rs->dirty_rate_high_cnt = 0;
                mig_throttle_guest_down();
            }
        }

        if (migrate_use_xbzrle()) {
            if (rs->iterations_prev != rs->iterations) {
                xbzrle_counters.cache_miss_rate =
                   (double)(xbzrle_counters.cache_miss -
                            rs->xbzrle_cache_miss_prev) /
                   (rs->iterations - rs->iterations_prev);
            }
            rs->iterations_prev = rs->iterations;
            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        }

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
    }
}
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        ram_counters.duplicate++;
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(rs->f, 0);
        ram_counters.transferred += 1;
        pages = 1;
    }

    return pages;
}
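
/*
 * Cost sketch (editor's note, not from the original source): a zero page
 * from an already announced block is just the 8-byte header plus one
 * zero byte, i.e. about 9 bytes instead of TARGET_PAGE_SIZE (typically
 * 4 KiB) - which is why zero detection runs before any other send path.
 */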
static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    /* When in doubt, send the page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        pages = save_zero_page(rs, block, offset);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(rs, current_addr);
            ram_release_pages(block->idstr, offset, pages);
        } else if (!rs->ram_bulk_stage &&
                   !migration_in_postcopy() && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(rs, &p, current_addr, block,
                                     offset, last_stage);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        ram_counters.transferred +=
            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
        }
        ram_counters.transferred += TARGET_PAGE_SIZE;
        pages = 1;
        ram_counters.normal++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
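
/*
 * Decision summary (editor's note, not from the original source): the
 * send paths above are tried in order - a transport-specific hook
 * (ram_control_save_page, e.g. for RDMA), then zero-page detection,
 * then XBZRLE delta encoding, and only if all of those decline is the
 * full TARGET_PAGE_SIZE buffer written.
 */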
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    RAMState *rs = ram_state;
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(rs, f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            ram_counters.transferred += len;
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
                qemu_mutex_lock(&comp_param[idx].mutex);
                set_compress_params(&comp_param[idx], block, offset);
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                pages = 1;
                ram_counters.normal++;
                ram_counters.transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
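
/*
 * Flow note (editor's observation, not from the original source): each
 * worker compresses into its own buffered QEMUFile.  The migration
 * thread claims an idle worker, first draining whatever that worker
 * produced last time into rs->f via qemu_put_qemu_file(), then hands it
 * the next (block, offset) pair, and blocks on comp_done_cond only when
 * every worker is busy.
 */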
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
                                    bool last_stage)
{
    int pages = -1;
    uint64_t bytes_xmit = 0;
    uint8_t *p;
    int ret, blen;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;

    p = block->host + offset;

    ret = ram_control_save_page(rs->f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        pages = 1;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                ram_counters.normal++;
            } else if (bytes_xmit == 0) {
                ram_counters.duplicate++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != rs->last_sent_block) {
            flush_compressed_data(rs);
            pages = save_zero_page(rs, block, offset);
            if (pages == -1) {
                /* Make sure the first page is sent out before other pages */
                bytes_xmit = save_page_header(rs, rs->f, block, offset |
                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
                                                 migrate_compress_level());
                if (blen > 0) {
                    ram_counters.transferred += bytes_xmit + blen;
                    ram_counters.normal++;
                    pages = 1;
                } else {
                    qemu_file_set_error(rs->f, blen);
                    error_report("compressed data failed!");
                }
            }
            if (pages > 0) {
                ram_release_pages(block->idstr, offset, pages);
            }
        } else {
            pages = save_zero_page(rs, block, offset);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(rs, block, offset);
            } else {
                ram_release_pages(block->idstr, offset, pages);
            }
        }
    }

    return pages;
}
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found, false otherwise
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
            if (migrate_use_xbzrle()) {
                /* If xbzrle is on, stop using the data compression at this
                 * point. In theory, xbzrle can do better than compression.
                 */
                flush_compressed_data(rs);
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    qemu_mutex_lock(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
            QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
        }
    }
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return block;
}
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found, false otherwise
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                       page, test_bit(page, block->unsentmap));
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;
    }

    return !!block;
}
/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any pages are left, we drop them.
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}
/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&rs->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    int res = 0;

    /* Check whether the page is dirty and, if it is, send it */
    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
        /*
         * If xbzrle is on, stop using the data compression after first
         * round of migration even if compression is enabled. In theory,
         * xbzrle can do better than compression.
         */
        if (migrate_use_compression() &&
            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
            res = ram_save_compressed_page(rs, pss, last_stage);
        } else {
            res = ram_save_page(rs, pss, last_stage);
        }

        if (res < 0) {
            return res;
        }
        if (pss->block->unsentmap) {
            clear_bit(pss->page, pss->block->unsentmap);
        }
    }

    return res;
}
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    do {
        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
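
/*
 * Worked example (editor's sketch, not from the original source): for a
 * block backed by 2 MiB huge pages with 4 KiB target pages,
 * pagesize_bits is 512, so the loop keeps sending target pages until
 * pss->page reaches a 512-page boundary - the whole host page goes out
 * as a unit, which postcopy relies on.
 */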
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    RAMBLOCK_FOREACH(block) {
        total += block->used_length;
    }
    rcu_read_unlock();
    return total;
}

static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

static void ram_state_cleanup(RAMState **rsp)
{
    migration_page_queue_free(*rsp);
    qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
    qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
    g_free(*rsp);
    *rsp = NULL;
}

static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* The caller holds the iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap
     */
    memory_global_dirty_log_stop();

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        g_free(block->bmap);
        block->bmap = NULL;
        g_free(block->unsentmap);
        block->unsentmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
 *
 * @ms: current migration state
 * @pds: state for postcopy
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms,
                                        PostcopyDiscardState *pds,
                                        RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *unsentmap = block->unsentmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(unsentmap, end, current);

        if (one <= end) {
            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
            unsigned long discard_length;

            if (zero >= end) {
                discard_length = end - one;
            } else {
                discard_length = zero - one;
            }
            if (discard_length) {
                postcopy_discard_send_range(ms, pds, one, discard_length);
            }
            current = one + discard_length;
        } else {
            current = one;
        }
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH(block) {
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, pds, block);
        postcopy_discard_send_finish(ms, pds);
        if (ret) {
            return ret;
        }
    }

    return 0;
}
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
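
/*
 * Worked example (editor's sketch, not from the original source): with
 * 2 MiB host pages and 4 KiB target pages, host_ratio is 512.  A dirty
 * run starting at target page 1000 sits mid host page
 * (1000 % 512 == 488), so the fixup rewinds run_start to page 512 and
 * the whole host page 512..1023 is discarded, marked unsent, and
 * re-dirtied as one unit.
 */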
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, block->idstr);

    /* First pass: Discard all partially sent host pages */
    postcopy_chunk_hostpages_pass(ms, true, block, pds);
    /*
     * Second pass: Ensure that all partially dirty host pages are made
     * fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, false, block, pds);

    postcopy_discard_send_finish(ms, pds);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    rcu_read_lock();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
        unsigned long *bitmap = block->bmap;
        unsigned long *unsentmap = block->unsentmap;

        if (!unsentmap) {
            /* We don't have a safe way to resize the sentmap, so
             * if the bitmap was resized it will be NULL at this
             * point.
             */
            error_report("migration ram resized during precopy phase");
            rcu_read_unlock();
            return -EINVAL;
        }
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            rcu_read_unlock();
            return ret;
        }

        /*
         * Update the unsentmap to be unsentmap = unsentmap | dirty
         */
        bitmap_or(unsentmap, unsentmap, bitmap, pages);
#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(unsentmap, true, pages);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    ret = postcopy_each_ram_send_discard(ms);
    rcu_read_unlock();

    return ret;
}
/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address within the RAMBlock
 * @length: length (in bytes) to discard
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(rbname, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        goto err;
    }

    bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                 length >> qemu_target_page_bits());
    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}
/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Failed to initialize RAMState", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    ram_state_reset(*rsp);

    return 0;
}
static void ram_list_init_bitmaps(void)
{
    RAMBlock *block;
    unsigned long pages;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            if (migrate_postcopy_ram()) {
                block->unsentmap = bitmap_new(pages);
                bitmap_set(block->unsentmap, 0, pages);
            }
        }
    }
}
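/*
 * Note that both bitmaps start fully set: every page is initially
 * considered dirty (and, for postcopy, unsent), which guarantees that
 * each page is transmitted at least once during the first pass over RAM.
 */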
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* In COLO state, migration has already set up the bitmap; reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            return -1;
        }
    }
    (*rsp)->f = f;

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    RAMBLOCK_FOREACH(block) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();
    compress_threads_save_setup();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
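/*
 * Sketch of the setup section emitted above (derived from the calls, not a
 * separate spec): a be64 carrying the total RAM size with
 * RAM_SAVE_FLAG_MEM_SIZE set in its low bits; then, for each RAMBlock, a
 * one-byte idstr length, the idstr bytes, a be64 used_length and, for
 * postcopy with a non-host page size, a be64 page_size.  The section ends
 * with a be64 RAM_SAVE_FLAG_EOS.
 */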
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        rs->iterations++;

        /* We want to check in the 1st loop, just in case it was the 1st
         * time and we had to sync the dirty bitmap.
         * qemu_clock_get_ns() is a bit expensive, so we only check it
         * every few iterations.
         */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(rs);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success
 *
 * Called with the iothread lock held
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;

    rcu_read_lock();

    if (!migration_in_postcopy()) {
        migration_bitmap_sync(rs);
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(rs);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *postcopiable_pending += remaining_size;
    } else {
        *non_postcopiable_pending += remaining_size;
    }
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
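/*
 * Wire format consumed by load_xbzrle() above (as implied by the reads,
 * not a separate spec): one byte of flags that must equal
 * ENCODING_FLAG_XBZRLE, a be16 encoded length capped at TARGET_PAGE_SIZE,
 * then the encoded bytes, which xbzrle_decode_buffer() applies as a delta
 * against the destination's previous copy of the page.
 */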
/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of the
 *         previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}
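/*
 * The RAM_SAVE_FLAG_CONTINUE handling above is a bandwidth optimisation:
 * the sender only emits the length-prefixed block idstr when the block
 * changes, and sets the CONTINUE bit on subsequent pages so the receiver
 * can reuse the cached RAMBlock pointer instead of re-reading the name.
 */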
static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from. We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
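/*
 * The read-before-write check above is deliberate: if the destination
 * page is already zero, skipping the memset() avoids dirtying it, so
 * anonymous memory that was never touched stays unallocated on the
 * receiving host.
 */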
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() may fail in some cases, especially when the
             * page was dirtied while it was being compressed.  That is
             * not a problem: the dirty page will be retransferred, and
             * uncompress() won't corrupt the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
static void compress_threads_load_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
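/*
 * Hand-off protocol between the loader and the decompression threads, as
 * implemented above: 'done' (under decomp_done_lock) marks a thread idle,
 * while each thread's own mutex/cond pair guards its work item ('des',
 * 'len', 'compbuf').  The loader claims an idle slot, fills in the work
 * item, signals the thread, and blocks on decomp_done_cond only when
 * every thread is busy.
 */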
/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    xbzrle_load_setup();
    compress_threads_load_setup();
    ramblock_recv_map_init();
    return 0;
}
static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }
    return 0;
}
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative on error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    unsigned long ram_pages = last_ram_page();

    return postcopy_ram_incoming_init(mis, ram_pages);
}
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                 host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                            (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
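/*
 * Worked example for the assembly loop above (illustrative numbers): with
 * 4KiB target pages and a 2MiB hugetlbfs-backed block, 512 consecutive
 * target pages are accumulated in postcopy_host_page; only when the last
 * one arrives is the whole 2MiB page placed, which on Linux happens
 * atomically through the userfaultfd mechanism so the guest never sees a
 * partially filled host page.
 */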
static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();

    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();

    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}
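/*
 * Sketch of the stream format consumed by ram_load() below (as implied by
 * the reads, not a separate spec): each chunk starts with a be64 whose
 * bits below TARGET_PAGE_MASK carry the RAM_SAVE_FLAG_* flags and whose
 * upper bits carry the page address.  Depending on the flags, the payload
 * is the RAMBlock list (MEM_SIZE), a fill byte (ZERO), a raw page (PAGE),
 * a be32 length plus compressed data (COMPRESS_PAGE), or an XBZRLE-encoded
 * delta; EOS terminates the section.
 */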
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0, invalid_flags = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }
    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            ramblock_recv_bitmap_set(block, host);
            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}
static bool ram_has_postcopy(void *opaque)
{
    return migrate_postcopy_ram();
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
};
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}