Merge remote-tracking branch 'remotes/dgilbert/tags/pull-hmp-20180320' into staging
[qemu/ar7.git] / migration / ram.c
blob0e90efa09236af7ec32bbee78a4c66ebecda7b95
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "migration/block.h"
/***********************************************************/
/* ram save/restore */

/*
 * Flags carried in the low bits of the page offset written to the stream.
 *
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
/* Report whether the @size bytes starting at @p are all zero. */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool all_zero = buffer_is_zero(p, size);

    return all_zero;
}
79 XBZRLECacheStats xbzrle_counters;
81 /* struct contains XBZRLE cache and a static page
82 used by the compression */
83 static struct {
84 /* buffer used for XBZRLE encoding */
85 uint8_t *encoded_buf;
86 /* buffer for storing page content */
87 uint8_t *current_buf;
88 /* Cache for XBZRLE, Protected by lock. */
89 PageCache *cache;
90 QemuMutex lock;
91 /* it will store a page full of zeros */
92 uint8_t *zero_target_page;
93 /* buffer used for XBZRLE decoding */
94 uint8_t *decoded_buf;
95 } XBZRLE;
97 static void XBZRLE_cache_lock(void)
99 if (migrate_use_xbzrle())
100 qemu_mutex_lock(&XBZRLE.lock);
103 static void XBZRLE_cache_unlock(void)
105 if (migrate_use_xbzrle())
106 qemu_mutex_unlock(&XBZRLE.lock);
110 * xbzrle_cache_resize: resize the xbzrle cache
112 * This function is called from qmp_migrate_set_cache_size in main
113 * thread, possibly while a migration is in progress. A running
114 * migration may be using the cache and might finish during this call,
115 * hence changes to the cache are protected by XBZRLE.lock().
117 * Returns 0 for success or -1 for error
119 * @new_size: new cache size
120 * @errp: set *errp if the check failed, with reason
122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
124 PageCache *new_cache;
125 int64_t ret = 0;
127 /* Check for truncation */
128 if (new_size != (size_t)new_size) {
129 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
130 "exceeding address space");
131 return -1;
134 if (new_size == migrate_xbzrle_cache_size()) {
135 /* nothing to do */
136 return 0;
139 XBZRLE_cache_lock();
141 if (XBZRLE.cache != NULL) {
142 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
143 if (!new_cache) {
144 ret = -1;
145 goto out;
148 cache_fini(XBZRLE.cache);
149 XBZRLE.cache = new_cache;
151 out:
152 XBZRLE_cache_unlock();
153 return ret;
156 static void ramblock_recv_map_init(void)
158 RAMBlock *rb;
160 RAMBLOCK_FOREACH(rb) {
161 assert(!rb->receivedmap);
162 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
168 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
169 rb->receivedmap);
172 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
174 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
177 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
179 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
182 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
183 size_t nr)
185 bitmap_set_atomic(rb->receivedmap,
186 ramblock_recv_bitmap_offset(host_addr, rb),
187 nr);
191 * An outstanding page request, on the source, having been received
192 * and queued
194 struct RAMSrcPageRequest {
195 RAMBlock *rb;
196 hwaddr offset;
197 hwaddr len;
199 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
202 /* State of RAM for migration */
203 struct RAMState {
204 /* QEMUFile used for this migration */
205 QEMUFile *f;
206 /* Last block that we have visited searching for dirty pages */
207 RAMBlock *last_seen_block;
208 /* Last block from where we have sent data */
209 RAMBlock *last_sent_block;
210 /* Last dirty target page we have sent */
211 ram_addr_t last_page;
212 /* last ram version we have seen */
213 uint32_t last_version;
214 /* We are in the first round */
215 bool ram_bulk_stage;
216 /* How many times we have dirty too many pages */
217 int dirty_rate_high_cnt;
218 /* these variables are used for bitmap sync */
219 /* last time we did a full bitmap_sync */
220 int64_t time_last_bitmap_sync;
221 /* bytes transferred at start_time */
222 uint64_t bytes_xfer_prev;
223 /* number of dirty pages since start_time */
224 uint64_t num_dirty_pages_period;
225 /* xbzrle misses since the beginning of the period */
226 uint64_t xbzrle_cache_miss_prev;
227 /* number of iterations at the beginning of period */
228 uint64_t iterations_prev;
229 /* Iterations since start */
230 uint64_t iterations;
231 /* number of dirty bits in the bitmap */
232 uint64_t migration_dirty_pages;
233 /* protects modification of the bitmap */
234 QemuMutex bitmap_mutex;
235 /* The RAMBlock used in the last src_page_requests */
236 RAMBlock *last_req_rb;
237 /* Queue of outstanding page requests from the destination */
238 QemuMutex src_page_req_mutex;
239 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
241 typedef struct RAMState RAMState;
243 static RAMState *ram_state;
245 uint64_t ram_bytes_remaining(void)
247 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
251 MigrationStats ram_counters;
253 /* used by the search for pages to send */
254 struct PageSearchStatus {
255 /* Current block being searched */
256 RAMBlock *block;
257 /* Current page to search from */
258 unsigned long page;
259 /* Set once we wrap around */
260 bool complete_round;
262 typedef struct PageSearchStatus PageSearchStatus;
264 struct CompressParam {
265 bool done;
266 bool quit;
267 QEMUFile *file;
268 QemuMutex mutex;
269 QemuCond cond;
270 RAMBlock *block;
271 ram_addr_t offset;
273 typedef struct CompressParam CompressParam;
275 struct DecompressParam {
276 bool done;
277 bool quit;
278 QemuMutex mutex;
279 QemuCond cond;
280 void *des;
281 uint8_t *compbuf;
282 int len;
284 typedef struct DecompressParam DecompressParam;
286 static CompressParam *comp_param;
287 static QemuThread *compress_threads;
288 /* comp_done_cond is used to wake up the migration thread when
289 * one of the compression threads has finished the compression.
290 * comp_done_lock is used to co-work with comp_done_cond.
292 static QemuMutex comp_done_lock;
293 static QemuCond comp_done_cond;
294 /* The empty QEMUFileOps will be used by file in CompressParam */
295 static const QEMUFileOps empty_ops = { };
297 static DecompressParam *decomp_param;
298 static QemuThread *decompress_threads;
299 static QemuMutex decomp_done_lock;
300 static QemuCond decomp_done_cond;
302 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
303 ram_addr_t offset);
305 static void *do_data_compress(void *opaque)
307 CompressParam *param = opaque;
308 RAMBlock *block;
309 ram_addr_t offset;
311 qemu_mutex_lock(&param->mutex);
312 while (!param->quit) {
313 if (param->block) {
314 block = param->block;
315 offset = param->offset;
316 param->block = NULL;
317 qemu_mutex_unlock(&param->mutex);
319 do_compress_ram_page(param->file, block, offset);
321 qemu_mutex_lock(&comp_done_lock);
322 param->done = true;
323 qemu_cond_signal(&comp_done_cond);
324 qemu_mutex_unlock(&comp_done_lock);
326 qemu_mutex_lock(&param->mutex);
327 } else {
328 qemu_cond_wait(&param->cond, &param->mutex);
331 qemu_mutex_unlock(&param->mutex);
333 return NULL;
336 static inline void terminate_compression_threads(void)
338 int idx, thread_count;
340 thread_count = migrate_compress_threads();
342 for (idx = 0; idx < thread_count; idx++) {
343 qemu_mutex_lock(&comp_param[idx].mutex);
344 comp_param[idx].quit = true;
345 qemu_cond_signal(&comp_param[idx].cond);
346 qemu_mutex_unlock(&comp_param[idx].mutex);
350 static void compress_threads_save_cleanup(void)
352 int i, thread_count;
354 if (!migrate_use_compression()) {
355 return;
357 terminate_compression_threads();
358 thread_count = migrate_compress_threads();
359 for (i = 0; i < thread_count; i++) {
360 qemu_thread_join(compress_threads + i);
361 qemu_fclose(comp_param[i].file);
362 qemu_mutex_destroy(&comp_param[i].mutex);
363 qemu_cond_destroy(&comp_param[i].cond);
365 qemu_mutex_destroy(&comp_done_lock);
366 qemu_cond_destroy(&comp_done_cond);
367 g_free(compress_threads);
368 g_free(comp_param);
369 compress_threads = NULL;
370 comp_param = NULL;
373 static void compress_threads_save_setup(void)
375 int i, thread_count;
377 if (!migrate_use_compression()) {
378 return;
380 thread_count = migrate_compress_threads();
381 compress_threads = g_new0(QemuThread, thread_count);
382 comp_param = g_new0(CompressParam, thread_count);
383 qemu_cond_init(&comp_done_cond);
384 qemu_mutex_init(&comp_done_lock);
385 for (i = 0; i < thread_count; i++) {
386 /* comp_param[i].file is just used as a dummy buffer to save data,
387 * set its ops to empty.
389 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
390 comp_param[i].done = true;
391 comp_param[i].quit = false;
392 qemu_mutex_init(&comp_param[i].mutex);
393 qemu_cond_init(&comp_param[i].cond);
394 qemu_thread_create(compress_threads + i, "compress",
395 do_data_compress, comp_param + i,
396 QEMU_THREAD_JOINABLE);
400 /* Multiple fd's */
402 struct MultiFDSendParams {
403 uint8_t id;
404 char *name;
405 QemuThread thread;
406 QemuSemaphore sem;
407 QemuMutex mutex;
408 bool quit;
410 typedef struct MultiFDSendParams MultiFDSendParams;
412 struct {
413 MultiFDSendParams *params;
414 /* number of created threads */
415 int count;
416 } *multifd_send_state;
418 static void terminate_multifd_send_threads(Error *errp)
420 int i;
422 for (i = 0; i < multifd_send_state->count; i++) {
423 MultiFDSendParams *p = &multifd_send_state->params[i];
425 qemu_mutex_lock(&p->mutex);
426 p->quit = true;
427 qemu_sem_post(&p->sem);
428 qemu_mutex_unlock(&p->mutex);
432 int multifd_save_cleanup(Error **errp)
434 int i;
435 int ret = 0;
437 if (!migrate_use_multifd()) {
438 return 0;
440 terminate_multifd_send_threads(NULL);
441 for (i = 0; i < multifd_send_state->count; i++) {
442 MultiFDSendParams *p = &multifd_send_state->params[i];
444 qemu_thread_join(&p->thread);
445 qemu_mutex_destroy(&p->mutex);
446 qemu_sem_destroy(&p->sem);
447 g_free(p->name);
448 p->name = NULL;
450 g_free(multifd_send_state->params);
451 multifd_send_state->params = NULL;
452 g_free(multifd_send_state);
453 multifd_send_state = NULL;
454 return ret;
457 static void *multifd_send_thread(void *opaque)
459 MultiFDSendParams *p = opaque;
461 while (true) {
462 qemu_mutex_lock(&p->mutex);
463 if (p->quit) {
464 qemu_mutex_unlock(&p->mutex);
465 break;
467 qemu_mutex_unlock(&p->mutex);
468 qemu_sem_wait(&p->sem);
471 return NULL;
474 int multifd_save_setup(void)
476 int thread_count;
477 uint8_t i;
479 if (!migrate_use_multifd()) {
480 return 0;
482 thread_count = migrate_multifd_channels();
483 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
484 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
485 multifd_send_state->count = 0;
486 for (i = 0; i < thread_count; i++) {
487 MultiFDSendParams *p = &multifd_send_state->params[i];
489 qemu_mutex_init(&p->mutex);
490 qemu_sem_init(&p->sem, 0);
491 p->quit = false;
492 p->id = i;
493 p->name = g_strdup_printf("multifdsend_%d", i);
494 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
495 QEMU_THREAD_JOINABLE);
497 multifd_send_state->count++;
499 return 0;
502 struct MultiFDRecvParams {
503 uint8_t id;
504 char *name;
505 QemuThread thread;
506 QemuSemaphore sem;
507 QemuMutex mutex;
508 bool quit;
510 typedef struct MultiFDRecvParams MultiFDRecvParams;
512 struct {
513 MultiFDRecvParams *params;
514 /* number of created threads */
515 int count;
516 } *multifd_recv_state;
518 static void terminate_multifd_recv_threads(Error *errp)
520 int i;
522 for (i = 0; i < multifd_recv_state->count; i++) {
523 MultiFDRecvParams *p = &multifd_recv_state->params[i];
525 qemu_mutex_lock(&p->mutex);
526 p->quit = true;
527 qemu_sem_post(&p->sem);
528 qemu_mutex_unlock(&p->mutex);
532 int multifd_load_cleanup(Error **errp)
534 int i;
535 int ret = 0;
537 if (!migrate_use_multifd()) {
538 return 0;
540 terminate_multifd_recv_threads(NULL);
541 for (i = 0; i < multifd_recv_state->count; i++) {
542 MultiFDRecvParams *p = &multifd_recv_state->params[i];
544 qemu_thread_join(&p->thread);
545 qemu_mutex_destroy(&p->mutex);
546 qemu_sem_destroy(&p->sem);
547 g_free(p->name);
548 p->name = NULL;
550 g_free(multifd_recv_state->params);
551 multifd_recv_state->params = NULL;
552 g_free(multifd_recv_state);
553 multifd_recv_state = NULL;
555 return ret;
558 static void *multifd_recv_thread(void *opaque)
560 MultiFDRecvParams *p = opaque;
562 while (true) {
563 qemu_mutex_lock(&p->mutex);
564 if (p->quit) {
565 qemu_mutex_unlock(&p->mutex);
566 break;
568 qemu_mutex_unlock(&p->mutex);
569 qemu_sem_wait(&p->sem);
572 return NULL;
575 int multifd_load_setup(void)
577 int thread_count;
578 uint8_t i;
580 if (!migrate_use_multifd()) {
581 return 0;
583 thread_count = migrate_multifd_channels();
584 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
585 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
586 multifd_recv_state->count = 0;
587 for (i = 0; i < thread_count; i++) {
588 MultiFDRecvParams *p = &multifd_recv_state->params[i];
590 qemu_mutex_init(&p->mutex);
591 qemu_sem_init(&p->sem, 0);
592 p->quit = false;
593 p->id = i;
594 p->name = g_strdup_printf("multifdrecv_%d", i);
595 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
596 QEMU_THREAD_JOINABLE);
597 multifd_recv_state->count++;
599 return 0;
603 * save_page_header: write page header to wire
605 * If this is the 1st block, it also writes the block identification
607 * Returns the number of bytes written
609 * @f: QEMUFile where to send the data
610 * @block: block that contains the page we want to send
611 * @offset: offset inside the block for the page
612 * in the lower bits, it contains flags
614 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
615 ram_addr_t offset)
617 size_t size, len;
619 if (block == rs->last_sent_block) {
620 offset |= RAM_SAVE_FLAG_CONTINUE;
622 qemu_put_be64(f, offset);
623 size = 8;
625 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
626 len = strlen(block->idstr);
627 qemu_put_byte(f, len);
628 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
629 size += 1 + len;
630 rs->last_sent_block = block;
632 return size;
636 * mig_throttle_guest_down: throotle down the guest
638 * Reduce amount of guest cpu execution to hopefully slow down memory
639 * writes. If guest dirty memory rate is reduced below the rate at
640 * which we can transfer pages to the destination then we should be
641 * able to complete migration. Some workloads dirty memory way too
642 * fast and will not effectively converge, even with auto-converge.
644 static void mig_throttle_guest_down(void)
646 MigrationState *s = migrate_get_current();
647 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
648 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
650 /* We have not started throttling yet. Let's start it. */
651 if (!cpu_throttle_active()) {
652 cpu_throttle_set(pct_initial);
653 } else {
654 /* Throttling already on, just increase the rate */
655 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
660 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
662 * @rs: current RAM state
663 * @current_addr: address for the zero page
665 * Update the xbzrle cache to reflect a page that's been sent as all 0.
666 * The important thing is that a stale (not-yet-0'd) page be replaced
667 * by the new data.
668 * As a bonus, if the page wasn't in the cache it gets added so that
669 * when a small write is made into the 0'd page it gets XBZRLE sent.
671 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
673 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
674 return;
677 /* We don't care if this fails to allocate a new cache page
678 * as long as it updated an old one */
679 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
680 ram_counters.dirty_sync_count);
683 #define ENCODING_FLAG_XBZRLE 0x1
686 * save_xbzrle_page: compress and send current page
688 * Returns: 1 means that we wrote the page
689 * 0 means that page is identical to the one already sent
690 * -1 means that xbzrle would be longer than normal
692 * @rs: current RAM state
693 * @current_data: pointer to the address of the page contents
694 * @current_addr: addr of the page
695 * @block: block that contains the page we want to send
696 * @offset: offset inside the block for the page
697 * @last_stage: if we are at the completion stage
699 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
700 ram_addr_t current_addr, RAMBlock *block,
701 ram_addr_t offset, bool last_stage)
703 int encoded_len = 0, bytes_xbzrle;
704 uint8_t *prev_cached_page;
706 if (!cache_is_cached(XBZRLE.cache, current_addr,
707 ram_counters.dirty_sync_count)) {
708 xbzrle_counters.cache_miss++;
709 if (!last_stage) {
710 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
711 ram_counters.dirty_sync_count) == -1) {
712 return -1;
713 } else {
714 /* update *current_data when the page has been
715 inserted into cache */
716 *current_data = get_cached_data(XBZRLE.cache, current_addr);
719 return -1;
722 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
724 /* save current buffer into memory */
725 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
727 /* XBZRLE encoding (if there is no overflow) */
728 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
729 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
730 TARGET_PAGE_SIZE);
731 if (encoded_len == 0) {
732 trace_save_xbzrle_page_skipping();
733 return 0;
734 } else if (encoded_len == -1) {
735 trace_save_xbzrle_page_overflow();
736 xbzrle_counters.overflow++;
737 /* update data in the cache */
738 if (!last_stage) {
739 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
740 *current_data = prev_cached_page;
742 return -1;
745 /* we need to update the data in the cache, in order to get the same data */
746 if (!last_stage) {
747 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
750 /* Send XBZRLE based compressed page */
751 bytes_xbzrle = save_page_header(rs, rs->f, block,
752 offset | RAM_SAVE_FLAG_XBZRLE);
753 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
754 qemu_put_be16(rs->f, encoded_len);
755 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
756 bytes_xbzrle += encoded_len + 1 + 2;
757 xbzrle_counters.pages++;
758 xbzrle_counters.bytes += bytes_xbzrle;
759 ram_counters.transferred += bytes_xbzrle;
761 return 1;
765 * migration_bitmap_find_dirty: find the next dirty page from start
767 * Called with rcu_read_lock() to protect migration_bitmap
769 * Returns the byte offset within memory region of the start of a dirty page
771 * @rs: current RAM state
772 * @rb: RAMBlock where to search for dirty pages
773 * @start: page where we start the search
775 static inline
776 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
777 unsigned long start)
779 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
780 unsigned long *bitmap = rb->bmap;
781 unsigned long next;
783 if (rs->ram_bulk_stage && start > 0) {
784 next = start + 1;
785 } else {
786 next = find_next_bit(bitmap, size, start);
789 return next;
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
796 bool ret;
798 ret = test_and_clear_bit(page, rb->bmap);
800 if (ret) {
801 rs->migration_dirty_pages--;
803 return ret;
806 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
807 ram_addr_t start, ram_addr_t length)
809 rs->migration_dirty_pages +=
810 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
811 &rs->num_dirty_pages_period);
815 * ram_pagesize_summary: calculate all the pagesizes of a VM
817 * Returns a summary bitmap of the page sizes of all RAMBlocks
819 * For VMs with just normal pages this is equivalent to the host page
820 * size. If it's got some huge pages then it's the OR of all the
821 * different page sizes.
823 uint64_t ram_pagesize_summary(void)
825 RAMBlock *block;
826 uint64_t summary = 0;
828 RAMBLOCK_FOREACH(block) {
829 summary |= block->page_size;
832 return summary;
835 static void migration_bitmap_sync(RAMState *rs)
837 RAMBlock *block;
838 int64_t end_time;
839 uint64_t bytes_xfer_now;
841 ram_counters.dirty_sync_count++;
843 if (!rs->time_last_bitmap_sync) {
844 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
847 trace_migration_bitmap_sync_start();
848 memory_global_dirty_log_sync();
850 qemu_mutex_lock(&rs->bitmap_mutex);
851 rcu_read_lock();
852 RAMBLOCK_FOREACH(block) {
853 migration_bitmap_sync_range(rs, block, 0, block->used_length);
855 rcu_read_unlock();
856 qemu_mutex_unlock(&rs->bitmap_mutex);
858 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
860 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
862 /* more than 1 second = 1000 millisecons */
863 if (end_time > rs->time_last_bitmap_sync + 1000) {
864 /* calculate period counters */
865 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866 / (end_time - rs->time_last_bitmap_sync);
867 bytes_xfer_now = ram_counters.transferred;
869 /* During block migration the auto-converge logic incorrectly detects
870 * that ram migration makes no progress. Avoid this by disabling the
871 * throttling logic during the bulk phase of block migration. */
872 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
873 /* The following detection logic can be refined later. For now:
874 Check to see if the dirtied bytes is 50% more than the approx.
875 amount of bytes that just got transferred since the last time we
876 were in this routine. If that happens twice, start or increase
877 throttling */
879 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
880 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
881 (++rs->dirty_rate_high_cnt >= 2)) {
882 trace_migration_throttle();
883 rs->dirty_rate_high_cnt = 0;
884 mig_throttle_guest_down();
888 if (migrate_use_xbzrle()) {
889 if (rs->iterations_prev != rs->iterations) {
890 xbzrle_counters.cache_miss_rate =
891 (double)(xbzrle_counters.cache_miss -
892 rs->xbzrle_cache_miss_prev) /
893 (rs->iterations - rs->iterations_prev);
895 rs->iterations_prev = rs->iterations;
896 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
899 /* reset period counters */
900 rs->time_last_bitmap_sync = end_time;
901 rs->num_dirty_pages_period = 0;
902 rs->bytes_xfer_prev = bytes_xfer_now;
904 if (migrate_use_events()) {
905 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
910 * save_zero_page: send the zero page to the stream
912 * Returns the number of pages written.
914 * @rs: current RAM state
915 * @block: block that contains the page we want to send
916 * @offset: offset inside the block for the page
918 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
920 uint8_t *p = block->host + offset;
921 int pages = -1;
923 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
924 ram_counters.duplicate++;
925 ram_counters.transferred +=
926 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
927 qemu_put_byte(rs->f, 0);
928 ram_counters.transferred += 1;
929 pages = 1;
932 return pages;
935 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
937 if (!migrate_release_ram() || !migration_in_postcopy()) {
938 return;
941 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
945 * ram_save_page: send the given page to the stream
947 * Returns the number of pages written.
948 * < 0 - error
949 * >=0 - Number of pages written - this might legally be 0
950 * if xbzrle noticed the page was the same.
952 * @rs: current RAM state
953 * @block: block that contains the page we want to send
954 * @offset: offset inside the block for the page
955 * @last_stage: if we are at the completion stage
957 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
959 int pages = -1;
960 uint64_t bytes_xmit;
961 ram_addr_t current_addr;
962 uint8_t *p;
963 int ret;
964 bool send_async = true;
965 RAMBlock *block = pss->block;
966 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
968 p = block->host + offset;
969 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
971 /* In doubt sent page as normal */
972 bytes_xmit = 0;
973 ret = ram_control_save_page(rs->f, block->offset,
974 offset, TARGET_PAGE_SIZE, &bytes_xmit);
975 if (bytes_xmit) {
976 ram_counters.transferred += bytes_xmit;
977 pages = 1;
980 XBZRLE_cache_lock();
982 current_addr = block->offset + offset;
984 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
985 if (ret != RAM_SAVE_CONTROL_DELAYED) {
986 if (bytes_xmit > 0) {
987 ram_counters.normal++;
988 } else if (bytes_xmit == 0) {
989 ram_counters.duplicate++;
992 } else {
993 pages = save_zero_page(rs, block, offset);
994 if (pages > 0) {
995 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
996 * page would be stale
998 xbzrle_cache_zero_page(rs, current_addr);
999 ram_release_pages(block->idstr, offset, pages);
1000 } else if (!rs->ram_bulk_stage &&
1001 !migration_in_postcopy() && migrate_use_xbzrle()) {
1002 pages = save_xbzrle_page(rs, &p, current_addr, block,
1003 offset, last_stage);
1004 if (!last_stage) {
1005 /* Can't send this cached data async, since the cache page
1006 * might get updated before it gets to the wire
1008 send_async = false;
1013 /* XBZRLE overflow or normal page */
1014 if (pages == -1) {
1015 ram_counters.transferred +=
1016 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1017 if (send_async) {
1018 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1019 migrate_release_ram() &
1020 migration_in_postcopy());
1021 } else {
1022 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1024 ram_counters.transferred += TARGET_PAGE_SIZE;
1025 pages = 1;
1026 ram_counters.normal++;
1029 XBZRLE_cache_unlock();
1031 return pages;
1034 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1035 ram_addr_t offset)
1037 RAMState *rs = ram_state;
1038 int bytes_sent, blen;
1039 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1041 bytes_sent = save_page_header(rs, f, block, offset |
1042 RAM_SAVE_FLAG_COMPRESS_PAGE);
1043 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1044 migrate_compress_level());
1045 if (blen < 0) {
1046 bytes_sent = 0;
1047 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1048 error_report("compressed data failed!");
1049 } else {
1050 bytes_sent += blen;
1051 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1054 return bytes_sent;
1057 static void flush_compressed_data(RAMState *rs)
1059 int idx, len, thread_count;
1061 if (!migrate_use_compression()) {
1062 return;
1064 thread_count = migrate_compress_threads();
1066 qemu_mutex_lock(&comp_done_lock);
1067 for (idx = 0; idx < thread_count; idx++) {
1068 while (!comp_param[idx].done) {
1069 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1072 qemu_mutex_unlock(&comp_done_lock);
1074 for (idx = 0; idx < thread_count; idx++) {
1075 qemu_mutex_lock(&comp_param[idx].mutex);
1076 if (!comp_param[idx].quit) {
1077 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1078 ram_counters.transferred += len;
1080 qemu_mutex_unlock(&comp_param[idx].mutex);
1084 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1085 ram_addr_t offset)
1087 param->block = block;
1088 param->offset = offset;
1091 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1092 ram_addr_t offset)
1094 int idx, thread_count, bytes_xmit = -1, pages = -1;
1096 thread_count = migrate_compress_threads();
1097 qemu_mutex_lock(&comp_done_lock);
1098 while (true) {
1099 for (idx = 0; idx < thread_count; idx++) {
1100 if (comp_param[idx].done) {
1101 comp_param[idx].done = false;
1102 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1103 qemu_mutex_lock(&comp_param[idx].mutex);
1104 set_compress_params(&comp_param[idx], block, offset);
1105 qemu_cond_signal(&comp_param[idx].cond);
1106 qemu_mutex_unlock(&comp_param[idx].mutex);
1107 pages = 1;
1108 ram_counters.normal++;
1109 ram_counters.transferred += bytes_xmit;
1110 break;
1113 if (pages > 0) {
1114 break;
1115 } else {
1116 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1119 qemu_mutex_unlock(&comp_done_lock);
1121 return pages;
1125 * ram_save_compressed_page: compress the given page and send it to the stream
1127 * Returns the number of pages written.
1129 * @rs: current RAM state
1130 * @block: block that contains the page we want to send
1131 * @offset: offset inside the block for the page
1132 * @last_stage: if we are at the completion stage
1134 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1135 bool last_stage)
1137 int pages = -1;
1138 uint64_t bytes_xmit = 0;
1139 uint8_t *p;
1140 int ret, blen;
1141 RAMBlock *block = pss->block;
1142 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1144 p = block->host + offset;
1146 ret = ram_control_save_page(rs->f, block->offset,
1147 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1148 if (bytes_xmit) {
1149 ram_counters.transferred += bytes_xmit;
1150 pages = 1;
1152 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1153 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1154 if (bytes_xmit > 0) {
1155 ram_counters.normal++;
1156 } else if (bytes_xmit == 0) {
1157 ram_counters.duplicate++;
1160 } else {
1161 /* When starting the process of a new block, the first page of
1162 * the block should be sent out before other pages in the same
1163 * block, and all the pages in last block should have been sent
1164 * out, keeping this order is important, because the 'cont' flag
1165 * is used to avoid resending the block name.
1167 if (block != rs->last_sent_block) {
1168 flush_compressed_data(rs);
1169 pages = save_zero_page(rs, block, offset);
1170 if (pages == -1) {
1171 /* Make sure the first page is sent out before other pages */
1172 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1173 RAM_SAVE_FLAG_COMPRESS_PAGE);
1174 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1175 migrate_compress_level());
1176 if (blen > 0) {
1177 ram_counters.transferred += bytes_xmit + blen;
1178 ram_counters.normal++;
1179 pages = 1;
1180 } else {
1181 qemu_file_set_error(rs->f, blen);
1182 error_report("compressed data failed!");
1185 if (pages > 0) {
1186 ram_release_pages(block->idstr, offset, pages);
1188 } else {
1189 pages = save_zero_page(rs, block, offset);
1190 if (pages == -1) {
1191 pages = compress_page_with_multi_thread(rs, block, offset);
1192 } else {
1193 ram_release_pages(block->idstr, offset, pages);
1198 return pages;
1202 * find_dirty_block: find the next dirty page and update any state
1203 * associated with the search process.
1205 * Returns if a page is found
1207 * @rs: current RAM state
1208 * @pss: data about the state of the current dirty page scan
1209 * @again: set to false if the search has scanned the whole of RAM
1211 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1213 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1214 if (pss->complete_round && pss->block == rs->last_seen_block &&
1215 pss->page >= rs->last_page) {
1217 * We've been once around the RAM and haven't found anything.
1218 * Give up.
1220 *again = false;
1221 return false;
1223 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1224 /* Didn't find anything in this RAM Block */
1225 pss->page = 0;
1226 pss->block = QLIST_NEXT_RCU(pss->block, next);
1227 if (!pss->block) {
1228 /* Hit the end of the list */
1229 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1230 /* Flag that we've looped */
1231 pss->complete_round = true;
1232 rs->ram_bulk_stage = false;
1233 if (migrate_use_xbzrle()) {
1234 /* If xbzrle is on, stop using the data compression at this
1235 * point. In theory, xbzrle can do better than compression.
1237 flush_compressed_data(rs);
1240 /* Didn't find anything this time, but try again on the new block */
1241 *again = true;
1242 return false;
1243 } else {
1244 /* Can go around again, but... */
1245 *again = true;
1246 /* We've found something so probably don't need to */
1247 return true;
1252 * unqueue_page: gets a page of the queue
1254 * Helper for 'get_queued_page' - gets a page off the queue
1256 * Returns the block of the page (or NULL if none available)
1258 * @rs: current RAM state
1259 * @offset: used to return the offset within the RAMBlock
1261 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1263 RAMBlock *block = NULL;
1265 qemu_mutex_lock(&rs->src_page_req_mutex);
1266 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1267 struct RAMSrcPageRequest *entry =
1268 QSIMPLEQ_FIRST(&rs->src_page_requests);
1269 block = entry->rb;
1270 *offset = entry->offset;
1272 if (entry->len > TARGET_PAGE_SIZE) {
1273 entry->len -= TARGET_PAGE_SIZE;
1274 entry->offset += TARGET_PAGE_SIZE;
1275 } else {
1276 memory_region_unref(block->mr);
1277 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1278 g_free(entry);
1281 qemu_mutex_unlock(&rs->src_page_req_mutex);
1283 return block;
1287 * get_queued_page: unqueue a page from the postocpy requests
1289 * Skips pages that are already sent (!dirty)
1291 * Returns if a queued page is found
1293 * @rs: current RAM state
1294 * @pss: data about the state of the current dirty page scan
1296 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1298 RAMBlock *block;
1299 ram_addr_t offset;
1300 bool dirty;
1302 do {
1303 block = unqueue_page(rs, &offset);
1305 * We're sending this page, and since it's postcopy nothing else
1306 * will dirty it, and we must make sure it doesn't get sent again
1307 * even if this queue request was received after the background
1308 * search already sent it.
1310 if (block) {
1311 unsigned long page;
1313 page = offset >> TARGET_PAGE_BITS;
1314 dirty = test_bit(page, block->bmap);
1315 if (!dirty) {
1316 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1317 page, test_bit(page, block->unsentmap));
1318 } else {
1319 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1323 } while (block && !dirty);
1325 if (block) {
1327 * As soon as we start servicing pages out of order, then we have
1328 * to kill the bulk stage, since the bulk stage assumes
1329 * in (migration_bitmap_find_and_reset_dirty) that every page is
1330 * dirty, that's no longer true.
1332 rs->ram_bulk_stage = false;
1335 * We want the background search to continue from the queued page
1336 * since the guest is likely to want other pages near to the page
1337 * it just requested.
1339 pss->block = block;
1340 pss->page = offset >> TARGET_PAGE_BITS;
1343 return !!block;
1347 * migration_page_queue_free: drop any remaining pages in the ram
1348 * request queue
1350 * It should be empty at the end anyway, but in error cases there may
1351 * be some left. in case that there is any page left, we drop it.
1354 static void migration_page_queue_free(RAMState *rs)
1356 struct RAMSrcPageRequest *mspr, *next_mspr;
1357 /* This queue generally should be empty - but in the case of a failed
1358 * migration might have some droppings in.
1360 rcu_read_lock();
1361 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1362 memory_region_unref(mspr->rb->mr);
1363 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1364 g_free(mspr);
1366 rcu_read_unlock();
1370 * ram_save_queue_pages: queue the page for transmission
1372 * A request from postcopy destination for example.
1374 * Returns zero on success or negative on error
1376 * @rbname: Name of the RAMBLock of the request. NULL means the
1377 * same that last one.
1378 * @start: starting address from the start of the RAMBlock
1379 * @len: length (in bytes) to send
1381 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1383 RAMBlock *ramblock;
1384 RAMState *rs = ram_state;
1386 ram_counters.postcopy_requests++;
1387 rcu_read_lock();
1388 if (!rbname) {
1389 /* Reuse last RAMBlock */
1390 ramblock = rs->last_req_rb;
1392 if (!ramblock) {
1394 * Shouldn't happen, we can't reuse the last RAMBlock if
1395 * it's the 1st request.
1397 error_report("ram_save_queue_pages no previous block");
1398 goto err;
1400 } else {
1401 ramblock = qemu_ram_block_by_name(rbname);
1403 if (!ramblock) {
1404 /* We shouldn't be asked for a non-existent RAMBlock */
1405 error_report("ram_save_queue_pages no block '%s'", rbname);
1406 goto err;
1408 rs->last_req_rb = ramblock;
1410 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1411 if (start+len > ramblock->used_length) {
1412 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1413 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1414 __func__, start, len, ramblock->used_length);
1415 goto err;
1418 struct RAMSrcPageRequest *new_entry =
1419 g_malloc0(sizeof(struct RAMSrcPageRequest));
1420 new_entry->rb = ramblock;
1421 new_entry->offset = start;
1422 new_entry->len = len;
1424 memory_region_ref(ramblock->mr);
1425 qemu_mutex_lock(&rs->src_page_req_mutex);
1426 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1427 qemu_mutex_unlock(&rs->src_page_req_mutex);
1428 rcu_read_unlock();
1430 return 0;
1432 err:
1433 rcu_read_unlock();
1434 return -1;
1438 * ram_save_target_page: save one target page
1440 * Returns the number of pages written
1442 * @rs: current RAM state
1443 * @ms: current migration state
1444 * @pss: data about the page we want to send
1445 * @last_stage: if we are at the completion stage
1447 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1448 bool last_stage)
1450 int res = 0;
1452 /* Check the pages is dirty and if it is send it */
1453 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1455 * If xbzrle is on, stop using the data compression after first
1456 * round of migration even if compression is enabled. In theory,
1457 * xbzrle can do better than compression.
1459 if (migrate_use_compression() &&
1460 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1461 res = ram_save_compressed_page(rs, pss, last_stage);
1462 } else {
1463 res = ram_save_page(rs, pss, last_stage);
1466 if (res < 0) {
1467 return res;
1469 if (pss->block->unsentmap) {
1470 clear_bit(pss->page, pss->block->unsentmap);
1474 return res;
1478 * ram_save_host_page: save a whole host page
1480 * Starting at *offset send pages up to the end of the current host
1481 * page. It's valid for the initial offset to point into the middle of
1482 * a host page in which case the remainder of the hostpage is sent.
1483 * Only dirty target pages are sent. Note that the host page size may
1484 * be a huge page for this block.
1485 * The saving stops at the boundary of the used_length of the block
1486 * if the RAMBlock isn't a multiple of the host page size.
1488 * Returns the number of pages written or negative on error
1490 * @rs: current RAM state
1491 * @ms: current migration state
1492 * @pss: data about the page we want to send
1493 * @last_stage: if we are at the completion stage
1495 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1496 bool last_stage)
1498 int tmppages, pages = 0;
1499 size_t pagesize_bits =
1500 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1502 do {
1503 tmppages = ram_save_target_page(rs, pss, last_stage);
1504 if (tmppages < 0) {
1505 return tmppages;
1508 pages += tmppages;
1509 pss->page++;
1510 } while ((pss->page & (pagesize_bits - 1)) &&
1511 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1513 /* The offset we leave with is the last one we looked at */
1514 pss->page--;
1515 return pages;
1519 * ram_find_and_save_block: finds a dirty page and sends it to f
1521 * Called within an RCU critical section.
1523 * Returns the number of pages written where zero means no dirty pages
1525 * @rs: current RAM state
1526 * @last_stage: if we are at the completion stage
1528 * On systems where host-page-size > target-page-size it will send all the
1529 * pages in a host page that are dirty.
1532 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1534 PageSearchStatus pss;
1535 int pages = 0;
1536 bool again, found;
1538 /* No dirty page as there is zero RAM */
1539 if (!ram_bytes_total()) {
1540 return pages;
1543 pss.block = rs->last_seen_block;
1544 pss.page = rs->last_page;
1545 pss.complete_round = false;
1547 if (!pss.block) {
1548 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1551 do {
1552 again = true;
1553 found = get_queued_page(rs, &pss);
1555 if (!found) {
1556 /* priority queue empty, so just search for something dirty */
1557 found = find_dirty_block(rs, &pss, &again);
1560 if (found) {
1561 pages = ram_save_host_page(rs, &pss, last_stage);
1563 } while (!pages && again);
1565 rs->last_seen_block = pss.block;
1566 rs->last_page = pss.page;
1568 return pages;
1571 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1573 uint64_t pages = size / TARGET_PAGE_SIZE;
1575 if (zero) {
1576 ram_counters.duplicate += pages;
1577 } else {
1578 ram_counters.normal += pages;
1579 ram_counters.transferred += size;
1580 qemu_update_position(f, size);
1584 uint64_t ram_bytes_total(void)
1586 RAMBlock *block;
1587 uint64_t total = 0;
1589 rcu_read_lock();
1590 RAMBLOCK_FOREACH(block) {
1591 total += block->used_length;
1593 rcu_read_unlock();
1594 return total;
1597 static void xbzrle_load_setup(void)
1599 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1602 static void xbzrle_load_cleanup(void)
1604 g_free(XBZRLE.decoded_buf);
1605 XBZRLE.decoded_buf = NULL;
1608 static void ram_state_cleanup(RAMState **rsp)
1610 if (*rsp) {
1611 migration_page_queue_free(*rsp);
1612 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1613 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1614 g_free(*rsp);
1615 *rsp = NULL;
1619 static void xbzrle_cleanup(void)
1621 XBZRLE_cache_lock();
1622 if (XBZRLE.cache) {
1623 cache_fini(XBZRLE.cache);
1624 g_free(XBZRLE.encoded_buf);
1625 g_free(XBZRLE.current_buf);
1626 g_free(XBZRLE.zero_target_page);
1627 XBZRLE.cache = NULL;
1628 XBZRLE.encoded_buf = NULL;
1629 XBZRLE.current_buf = NULL;
1630 XBZRLE.zero_target_page = NULL;
1632 XBZRLE_cache_unlock();
1635 static void ram_save_cleanup(void *opaque)
1637 RAMState **rsp = opaque;
1638 RAMBlock *block;
1640 /* caller have hold iothread lock or is in a bh, so there is
1641 * no writing race against this migration_bitmap
1643 memory_global_dirty_log_stop();
1645 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1646 g_free(block->bmap);
1647 block->bmap = NULL;
1648 g_free(block->unsentmap);
1649 block->unsentmap = NULL;
1652 xbzrle_cleanup();
1653 compress_threads_save_cleanup();
1654 ram_state_cleanup(rsp);
1657 static void ram_state_reset(RAMState *rs)
1659 rs->last_seen_block = NULL;
1660 rs->last_sent_block = NULL;
1661 rs->last_page = 0;
1662 rs->last_version = ram_list.version;
1663 rs->ram_bulk_stage = true;
/* Time budget in ms for one iterate pass: half the buffered_file limit. */
#define MAX_WAIT 50
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * Each line shows set bits as '1' and clear bits as '.', prefixed with
 * the index of the first bit on the line.
 *
 * @todump: bitmap to dump. If NULL nothing is dumped (there is no
 *          default bitmap to fall back to).
 * @expected: the value you expect the bitmap mostly to be full of; lines
 *            that are all this value are not printed.
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { LINE_BITS = 128 };
    char linebuf[LINE_BITS + 1];
    unsigned long cur;
    unsigned long linelen;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        linelen = pages - cur < LINE_BITS ? pages - cur : LINE_BITS;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1702 /* **** functions for postcopy ***** */
1704 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1706 struct RAMBlock *block;
1708 RAMBLOCK_FOREACH(block) {
1709 unsigned long *bitmap = block->bmap;
1710 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1711 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1713 while (run_start < range) {
1714 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1715 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1716 (run_end - run_start) << TARGET_PAGE_BITS);
1717 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1723 * postcopy_send_discard_bm_ram: discard a RAMBlock
1725 * Returns zero on success
1727 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1728 * Note: At this point the 'unsentmap' is the processed bitmap combined
1729 * with the dirtymap; so a '1' means it's either dirty or unsent.
1731 * @ms: current migration state
1732 * @pds: state for postcopy
1733 * @start: RAMBlock starting page
1734 * @length: RAMBlock size
1736 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1737 PostcopyDiscardState *pds,
1738 RAMBlock *block)
1740 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1741 unsigned long current;
1742 unsigned long *unsentmap = block->unsentmap;
1744 for (current = 0; current < end; ) {
1745 unsigned long one = find_next_bit(unsentmap, end, current);
1747 if (one <= end) {
1748 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1749 unsigned long discard_length;
1751 if (zero >= end) {
1752 discard_length = end - one;
1753 } else {
1754 discard_length = zero - one;
1756 if (discard_length) {
1757 postcopy_discard_send_range(ms, pds, one, discard_length);
1759 current = one + discard_length;
1760 } else {
1761 current = one;
1765 return 0;
1769 * postcopy_each_ram_send_discard: discard all RAMBlocks
1771 * Returns 0 for success or negative for error
1773 * Utility for the outgoing postcopy code.
1774 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1775 * passing it bitmap indexes and name.
1776 * (qemu_ram_foreach_block ends up passing unscaled lengths
1777 * which would mean postcopy code would have to deal with target page)
1779 * @ms: current migration state
1781 static int postcopy_each_ram_send_discard(MigrationState *ms)
1783 struct RAMBlock *block;
1784 int ret;
1786 RAMBLOCK_FOREACH(block) {
1787 PostcopyDiscardState *pds =
1788 postcopy_discard_send_init(ms, block->idstr);
1791 * Postcopy sends chunks of bitmap over the wire, but it
1792 * just needs indexes at this point, avoids it having
1793 * target page specific code.
1795 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1796 postcopy_discard_send_finish(ms, pds);
1797 if (ret) {
1798 return ret;
1802 return 0;
1806 * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1808 * Helper for postcopy_chunk_hostpages; it's called twice to
1809 * canonicalize the two bitmaps, that are similar, but one is
1810 * inverted.
1812 * Postcopy requires that all target pages in a hostpage are dirty or
1813 * clean, not a mix. This function canonicalizes the bitmaps.
1815 * @ms: current migration state
1816 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1817 * otherwise we need to canonicalize partially dirty host pages
1818 * @block: block that contains the page we want to canonicalize
1819 * @pds: state for postcopy
1821 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1822 RAMBlock *block,
1823 PostcopyDiscardState *pds)
1825 RAMState *rs = ram_state;
1826 unsigned long *bitmap = block->bmap;
1827 unsigned long *unsentmap = block->unsentmap;
1828 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1829 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1830 unsigned long run_start;
1832 if (block->page_size == TARGET_PAGE_SIZE) {
1833 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1834 return;
1837 if (unsent_pass) {
1838 /* Find a sent page */
1839 run_start = find_next_zero_bit(unsentmap, pages, 0);
1840 } else {
1841 /* Find a dirty page */
1842 run_start = find_next_bit(bitmap, pages, 0);
1845 while (run_start < pages) {
1846 bool do_fixup = false;
1847 unsigned long fixup_start_addr;
1848 unsigned long host_offset;
1851 * If the start of this run of pages is in the middle of a host
1852 * page, then we need to fixup this host page.
1854 host_offset = run_start % host_ratio;
1855 if (host_offset) {
1856 do_fixup = true;
1857 run_start -= host_offset;
1858 fixup_start_addr = run_start;
1859 /* For the next pass */
1860 run_start = run_start + host_ratio;
1861 } else {
1862 /* Find the end of this run */
1863 unsigned long run_end;
1864 if (unsent_pass) {
1865 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1866 } else {
1867 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1870 * If the end isn't at the start of a host page, then the
1871 * run doesn't finish at the end of a host page
1872 * and we need to discard.
1874 host_offset = run_end % host_ratio;
1875 if (host_offset) {
1876 do_fixup = true;
1877 fixup_start_addr = run_end - host_offset;
1879 * This host page has gone, the next loop iteration starts
1880 * from after the fixup
1882 run_start = fixup_start_addr + host_ratio;
1883 } else {
1885 * No discards on this iteration, next loop starts from
1886 * next sent/dirty page
1888 run_start = run_end + 1;
1892 if (do_fixup) {
1893 unsigned long page;
1895 /* Tell the destination to discard this page */
1896 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1897 /* For the unsent_pass we:
1898 * discard partially sent pages
1899 * For the !unsent_pass (dirty) we:
1900 * discard partially dirty pages that were sent
1901 * (any partially sent pages were already discarded
1902 * by the previous unsent_pass)
1904 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1905 host_ratio);
1908 /* Clean up the bitmap */
1909 for (page = fixup_start_addr;
1910 page < fixup_start_addr + host_ratio; page++) {
1911 /* All pages in this host page are now not sent */
1912 set_bit(page, unsentmap);
1915 * Remark them as dirty, updating the count for any pages
1916 * that weren't previously dirty.
1918 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1922 if (unsent_pass) {
1923 /* Find the next sent page for the next iteration */
1924 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1925 } else {
1926 /* Find the next dirty page for the next iteration */
1927 run_start = find_next_bit(bitmap, pages, run_start);
1933 * postcopy_chuck_hostpages: discrad any partially sent host page
1935 * Utility for the outgoing postcopy code.
1937 * Discard any partially sent host-page size chunks, mark any partially
1938 * dirty host-page size chunks as all dirty. In this case the host-page
1939 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1941 * Returns zero on success
1943 * @ms: current migration state
1944 * @block: block we want to work with
1946 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1948 PostcopyDiscardState *pds =
1949 postcopy_discard_send_init(ms, block->idstr);
1951 /* First pass: Discard all partially sent host pages */
1952 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1954 * Second pass: Ensure that all partially dirty host pages are made
1955 * fully dirty.
1957 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1959 postcopy_discard_send_finish(ms, pds);
1960 return 0;
1964 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1966 * Returns zero on success
1968 * Transmit the set of pages to be discarded after precopy to the target
1969 * these are pages that:
1970 * a) Have been previously transmitted but are now dirty again
1971 * b) Pages that have never been transmitted, this ensures that
1972 * any pages on the destination that have been mapped by background
1973 * tasks get discarded (transparent huge pages is the specific concern)
1974 * Hopefully this is pretty sparse
1976 * @ms: current migration state
1978 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1980 RAMState *rs = ram_state;
1981 RAMBlock *block;
1982 int ret;
1984 rcu_read_lock();
1986 /* This should be our last sync, the src is now paused */
1987 migration_bitmap_sync(rs);
1989 /* Easiest way to make sure we don't resume in the middle of a host-page */
1990 rs->last_seen_block = NULL;
1991 rs->last_sent_block = NULL;
1992 rs->last_page = 0;
1994 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1995 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1996 unsigned long *bitmap = block->bmap;
1997 unsigned long *unsentmap = block->unsentmap;
1999 if (!unsentmap) {
2000 /* We don't have a safe way to resize the sentmap, so
2001 * if the bitmap was resized it will be NULL at this
2002 * point.
2004 error_report("migration ram resized during precopy phase");
2005 rcu_read_unlock();
2006 return -EINVAL;
2008 /* Deal with TPS != HPS and huge pages */
2009 ret = postcopy_chunk_hostpages(ms, block);
2010 if (ret) {
2011 rcu_read_unlock();
2012 return ret;
2016 * Update the unsentmap to be unsentmap = unsentmap | dirty
2018 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2019 #ifdef DEBUG_POSTCOPY
2020 ram_debug_dump_bitmap(unsentmap, true, pages);
2021 #endif
2023 trace_ram_postcopy_send_discard_bitmap();
2025 ret = postcopy_each_ram_send_discard(ms);
2026 rcu_read_unlock();
2028 return ret;
2032 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2034 * Returns zero on success
2036 * @rbname: name of the RAMBlock of the request. NULL means the
2037 * same that last one.
2038 * @start: RAMBlock starting page
2039 * @length: RAMBlock size
2041 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2043 int ret = -1;
2045 trace_ram_discard_range(rbname, start, length);
2047 rcu_read_lock();
2048 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2050 if (!rb) {
2051 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2052 goto err;
2055 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2056 length >> qemu_target_page_bits());
2057 ret = ram_block_discard_range(rb, start, length);
2059 err:
2060 rcu_read_unlock();
2062 return ret;
2066 * For every allocation, we will try not to crash the VM if the
2067 * allocation failed.
2069 static int xbzrle_init(void)
2071 Error *local_err = NULL;
2073 if (!migrate_use_xbzrle()) {
2074 return 0;
2077 XBZRLE_cache_lock();
2079 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2080 if (!XBZRLE.zero_target_page) {
2081 error_report("%s: Error allocating zero page", __func__);
2082 goto err_out;
2085 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2086 TARGET_PAGE_SIZE, &local_err);
2087 if (!XBZRLE.cache) {
2088 error_report_err(local_err);
2089 goto free_zero_page;
2092 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2093 if (!XBZRLE.encoded_buf) {
2094 error_report("%s: Error allocating encoded_buf", __func__);
2095 goto free_cache;
2098 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2099 if (!XBZRLE.current_buf) {
2100 error_report("%s: Error allocating current_buf", __func__);
2101 goto free_encoded_buf;
2104 /* We are all good */
2105 XBZRLE_cache_unlock();
2106 return 0;
2108 free_encoded_buf:
2109 g_free(XBZRLE.encoded_buf);
2110 XBZRLE.encoded_buf = NULL;
2111 free_cache:
2112 cache_fini(XBZRLE.cache);
2113 XBZRLE.cache = NULL;
2114 free_zero_page:
2115 g_free(XBZRLE.zero_target_page);
2116 XBZRLE.zero_target_page = NULL;
2117 err_out:
2118 XBZRLE_cache_unlock();
2119 return -ENOMEM;
2122 static int ram_state_init(RAMState **rsp)
2124 *rsp = g_try_new0(RAMState, 1);
2126 if (!*rsp) {
2127 error_report("%s: Init ramstate fail", __func__);
2128 return -1;
2131 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2132 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2133 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2136 * Count the total number of pages used by ram blocks not including any
2137 * gaps due to alignment or unplugs.
2139 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2141 ram_state_reset(*rsp);
2143 return 0;
2146 static void ram_list_init_bitmaps(void)
2148 RAMBlock *block;
2149 unsigned long pages;
2151 /* Skip setting bitmap if there is no RAM */
2152 if (ram_bytes_total()) {
2153 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2154 pages = block->max_length >> TARGET_PAGE_BITS;
2155 block->bmap = bitmap_new(pages);
2156 bitmap_set(block->bmap, 0, pages);
2157 if (migrate_postcopy_ram()) {
2158 block->unsentmap = bitmap_new(pages);
2159 bitmap_set(block->unsentmap, 0, pages);
2165 static void ram_init_bitmaps(RAMState *rs)
2167 /* For memory_global_dirty_log_start below. */
2168 qemu_mutex_lock_iothread();
2169 qemu_mutex_lock_ramlist();
2170 rcu_read_lock();
2172 ram_list_init_bitmaps();
2173 memory_global_dirty_log_start();
2174 migration_bitmap_sync(rs);
2176 rcu_read_unlock();
2177 qemu_mutex_unlock_ramlist();
2178 qemu_mutex_unlock_iothread();
2181 static int ram_init_all(RAMState **rsp)
2183 if (ram_state_init(rsp)) {
2184 return -1;
2187 if (xbzrle_init()) {
2188 ram_state_cleanup(rsp);
2189 return -1;
2192 ram_init_bitmaps(*rsp);
2194 return 0;
2198 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2199 * long-running RCU critical section. When rcu-reclaims in the code
2200 * start to become numerous it will be necessary to reduce the
2201 * granularity of these critical sections.
2205 * ram_save_setup: Setup RAM for migration
2207 * Returns zero to indicate success and negative for error
2209 * @f: QEMUFile where to send the data
2210 * @opaque: RAMState pointer
2212 static int ram_save_setup(QEMUFile *f, void *opaque)
2214 RAMState **rsp = opaque;
2215 RAMBlock *block;
2217 /* migration has already setup the bitmap, reuse it. */
2218 if (!migration_in_colo_state()) {
2219 if (ram_init_all(rsp) != 0) {
2220 return -1;
2223 (*rsp)->f = f;
2225 rcu_read_lock();
2227 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2229 RAMBLOCK_FOREACH(block) {
2230 qemu_put_byte(f, strlen(block->idstr));
2231 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2232 qemu_put_be64(f, block->used_length);
2233 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2234 qemu_put_be64(f, block->page_size);
2238 rcu_read_unlock();
2239 compress_threads_save_setup();
2241 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2242 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2244 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2246 return 0;
2250 * ram_save_iterate: iterative stage for migration
2252 * Returns zero to indicate success and negative for error
2254 * @f: QEMUFile where to send the data
2255 * @opaque: RAMState pointer
2257 static int ram_save_iterate(QEMUFile *f, void *opaque)
2259 RAMState **temp = opaque;
2260 RAMState *rs = *temp;
2261 int ret;
2262 int i;
2263 int64_t t0;
2264 int done = 0;
2266 if (blk_mig_bulk_active()) {
2267 /* Avoid transferring ram during bulk phase of block migration as
2268 * the bulk phase will usually take a long time and transferring
2269 * ram updates during that time is pointless. */
2270 goto out;
2273 rcu_read_lock();
2274 if (ram_list.version != rs->last_version) {
2275 ram_state_reset(rs);
2278 /* Read version before ram_list.blocks */
2279 smp_rmb();
2281 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2283 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2284 i = 0;
2285 while ((ret = qemu_file_rate_limit(f)) == 0) {
2286 int pages;
2288 pages = ram_find_and_save_block(rs, false);
2289 /* no more pages to sent */
2290 if (pages == 0) {
2291 done = 1;
2292 break;
2294 rs->iterations++;
2296 /* we want to check in the 1st loop, just in case it was the 1st time
2297 and we had to sync the dirty bitmap.
2298 qemu_get_clock_ns() is a bit expensive, so we only check each some
2299 iterations
2301 if ((i & 63) == 0) {
2302 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2303 if (t1 > MAX_WAIT) {
2304 trace_ram_save_iterate_big_wait(t1, i);
2305 break;
2308 i++;
2310 flush_compressed_data(rs);
2311 rcu_read_unlock();
2314 * Must occur before EOS (or any QEMUFile operation)
2315 * because of RDMA protocol.
2317 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2319 out:
2320 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2321 ram_counters.transferred += 8;
2323 ret = qemu_file_get_error(f);
2324 if (ret < 0) {
2325 return ret;
2328 return done;
2332 * ram_save_complete: function called to send the remaining amount of ram
2334 * Returns zero to indicate success
2336 * Called with iothread lock
2338 * @f: QEMUFile where to send the data
2339 * @opaque: RAMState pointer
2341 static int ram_save_complete(QEMUFile *f, void *opaque)
2343 RAMState **temp = opaque;
2344 RAMState *rs = *temp;
2346 rcu_read_lock();
2348 if (!migration_in_postcopy()) {
2349 migration_bitmap_sync(rs);
2352 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2354 /* try transferring iterative blocks of memory */
2356 /* flush all remaining blocks regardless of rate limiting */
2357 while (true) {
2358 int pages;
2360 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2361 /* no more blocks to sent */
2362 if (pages == 0) {
2363 break;
2367 flush_compressed_data(rs);
2368 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2370 rcu_read_unlock();
2372 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2374 return 0;
2377 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2378 uint64_t *res_precopy_only,
2379 uint64_t *res_compatible,
2380 uint64_t *res_postcopy_only)
2382 RAMState **temp = opaque;
2383 RAMState *rs = *temp;
2384 uint64_t remaining_size;
2386 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2388 if (!migration_in_postcopy() &&
2389 remaining_size < max_size) {
2390 qemu_mutex_lock_iothread();
2391 rcu_read_lock();
2392 migration_bitmap_sync(rs);
2393 rcu_read_unlock();
2394 qemu_mutex_unlock_iothread();
2395 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2398 if (migrate_postcopy_ram()) {
2399 /* We can do postcopy, and all the data is postcopiable */
2400 *res_compatible += remaining_size;
2401 } else {
2402 *res_precopy_only += remaining_size;
2406 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2408 unsigned int xh_len;
2409 int xh_flags;
2410 uint8_t *loaded_data;
2412 /* extract RLE header */
2413 xh_flags = qemu_get_byte(f);
2414 xh_len = qemu_get_be16(f);
2416 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2417 error_report("Failed to load XBZRLE page - wrong compression!");
2418 return -1;
2421 if (xh_len > TARGET_PAGE_SIZE) {
2422 error_report("Failed to load XBZRLE page - len overflow!");
2423 return -1;
2425 loaded_data = XBZRLE.decoded_buf;
2426 /* load data and decode */
2427 /* it can change loaded_data to point to an internal buffer */
2428 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2430 /* decode RLE */
2431 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2432 TARGET_PAGE_SIZE) == -1) {
2433 error_report("Failed to load XBZRLE page - decode error!");
2434 return -1;
2437 return 0;
2441 * ram_block_from_stream: read a RAMBlock id from the migration stream
2443 * Must be called from within a rcu critical section.
2445 * Returns a pointer from within the RCU-protected ram_list.
2447 * @f: QEMUFile where to read the data from
2448 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2450 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2452 static RAMBlock *block = NULL;
2453 char id[256];
2454 uint8_t len;
2456 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2457 if (!block) {
2458 error_report("Ack, bad migration stream!");
2459 return NULL;
2461 return block;
2464 len = qemu_get_byte(f);
2465 qemu_get_buffer(f, (uint8_t *)id, len);
2466 id[len] = 0;
2468 block = qemu_ram_block_by_name(id);
2469 if (!block) {
2470 error_report("Can't find block %s", id);
2471 return NULL;
2474 return block;
2477 static inline void *host_from_ram_block_offset(RAMBlock *block,
2478 ram_addr_t offset)
2480 if (!offset_in_ramblock(block, offset)) {
2481 return NULL;
2484 return block->host + offset;
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the write when a zero fill is requested and the range is zero */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2504 static void *do_data_decompress(void *opaque)
2506 DecompressParam *param = opaque;
2507 unsigned long pagesize;
2508 uint8_t *des;
2509 int len;
2511 qemu_mutex_lock(&param->mutex);
2512 while (!param->quit) {
2513 if (param->des) {
2514 des = param->des;
2515 len = param->len;
2516 param->des = 0;
2517 qemu_mutex_unlock(&param->mutex);
2519 pagesize = TARGET_PAGE_SIZE;
2520 /* uncompress() will return failed in some case, especially
2521 * when the page is dirted when doing the compression, it's
2522 * not a problem because the dirty page will be retransferred
2523 * and uncompress() won't break the data in other pages.
2525 uncompress((Bytef *)des, &pagesize,
2526 (const Bytef *)param->compbuf, len);
2528 qemu_mutex_lock(&decomp_done_lock);
2529 param->done = true;
2530 qemu_cond_signal(&decomp_done_cond);
2531 qemu_mutex_unlock(&decomp_done_lock);
2533 qemu_mutex_lock(&param->mutex);
2534 } else {
2535 qemu_cond_wait(&param->cond, &param->mutex);
2538 qemu_mutex_unlock(&param->mutex);
2540 return NULL;
2543 static void wait_for_decompress_done(void)
2545 int idx, thread_count;
2547 if (!migrate_use_compression()) {
2548 return;
2551 thread_count = migrate_decompress_threads();
2552 qemu_mutex_lock(&decomp_done_lock);
2553 for (idx = 0; idx < thread_count; idx++) {
2554 while (!decomp_param[idx].done) {
2555 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2558 qemu_mutex_unlock(&decomp_done_lock);
2561 static void compress_threads_load_setup(void)
2563 int i, thread_count;
2565 if (!migrate_use_compression()) {
2566 return;
2568 thread_count = migrate_decompress_threads();
2569 decompress_threads = g_new0(QemuThread, thread_count);
2570 decomp_param = g_new0(DecompressParam, thread_count);
2571 qemu_mutex_init(&decomp_done_lock);
2572 qemu_cond_init(&decomp_done_cond);
2573 for (i = 0; i < thread_count; i++) {
2574 qemu_mutex_init(&decomp_param[i].mutex);
2575 qemu_cond_init(&decomp_param[i].cond);
2576 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2577 decomp_param[i].done = true;
2578 decomp_param[i].quit = false;
2579 qemu_thread_create(decompress_threads + i, "decompress",
2580 do_data_decompress, decomp_param + i,
2581 QEMU_THREAD_JOINABLE);
2585 static void compress_threads_load_cleanup(void)
2587 int i, thread_count;
2589 if (!migrate_use_compression()) {
2590 return;
2592 thread_count = migrate_decompress_threads();
2593 for (i = 0; i < thread_count; i++) {
2594 qemu_mutex_lock(&decomp_param[i].mutex);
2595 decomp_param[i].quit = true;
2596 qemu_cond_signal(&decomp_param[i].cond);
2597 qemu_mutex_unlock(&decomp_param[i].mutex);
2599 for (i = 0; i < thread_count; i++) {
2600 qemu_thread_join(decompress_threads + i);
2601 qemu_mutex_destroy(&decomp_param[i].mutex);
2602 qemu_cond_destroy(&decomp_param[i].cond);
2603 g_free(decomp_param[i].compbuf);
2605 g_free(decompress_threads);
2606 g_free(decomp_param);
2607 decompress_threads = NULL;
2608 decomp_param = NULL;
2611 static void decompress_data_with_multi_threads(QEMUFile *f,
2612 void *host, int len)
2614 int idx, thread_count;
2616 thread_count = migrate_decompress_threads();
2617 qemu_mutex_lock(&decomp_done_lock);
2618 while (true) {
2619 for (idx = 0; idx < thread_count; idx++) {
2620 if (decomp_param[idx].done) {
2621 decomp_param[idx].done = false;
2622 qemu_mutex_lock(&decomp_param[idx].mutex);
2623 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2624 decomp_param[idx].des = host;
2625 decomp_param[idx].len = len;
2626 qemu_cond_signal(&decomp_param[idx].cond);
2627 qemu_mutex_unlock(&decomp_param[idx].mutex);
2628 break;
2631 if (idx < thread_count) {
2632 break;
2633 } else {
2634 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2637 qemu_mutex_unlock(&decomp_done_lock);
2641 * ram_load_setup: Setup RAM for migration incoming side
2643 * Returns zero to indicate success and negative for error
2645 * @f: QEMUFile where to receive the data
2646 * @opaque: RAMState pointer
2648 static int ram_load_setup(QEMUFile *f, void *opaque)
2650 xbzrle_load_setup();
2651 compress_threads_load_setup();
2652 ramblock_recv_map_init();
2653 return 0;
2656 static int ram_load_cleanup(void *opaque)
2658 RAMBlock *rb;
2659 xbzrle_load_cleanup();
2660 compress_threads_load_cleanup();
2662 RAMBLOCK_FOREACH(rb) {
2663 g_free(rb->receivedmap);
2664 rb->receivedmap = NULL;
2666 return 0;
2670 * ram_postcopy_incoming_init: allocate postcopy data structures
2672 * Returns 0 for success and negative if there was one error
2674 * @mis: current migration incoming state
2676 * Allocate data structures etc needed by incoming migration with
2677 * postcopy-ram. postcopy-ram's similarly names
2678 * postcopy_ram_incoming_init does the work.
2680 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2682 unsigned long ram_pages = last_ram_page();
2684 return postcopy_ram_incoming_init(mis, ram_pages);
2688 * ram_load_postcopy: load a page in postcopy case
2690 * Returns 0 for success or -errno in case of error
2692 * Called in postcopy mode by ram_load().
2693 * rcu_read_lock is taken prior to this being called.
2695 * @f: QEMUFile where to send the data
2697 static int ram_load_postcopy(QEMUFile *f)
2699 int flags = 0, ret = 0;
2700 bool place_needed = false;
2701 bool matching_page_sizes = false;
2702 MigrationIncomingState *mis = migration_incoming_get_current();
2703 /* Temporary page that is later 'placed' */
2704 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2705 void *last_host = NULL;
2706 bool all_zero = false;
2708 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2709 ram_addr_t addr;
2710 void *host = NULL;
2711 void *page_buffer = NULL;
2712 void *place_source = NULL;
2713 RAMBlock *block = NULL;
2714 uint8_t ch;
2716 addr = qemu_get_be64(f);
2719 * If qemu file error, we should stop here, and then "addr"
2720 * may be invalid
2722 ret = qemu_file_get_error(f);
2723 if (ret) {
2724 break;
2727 flags = addr & ~TARGET_PAGE_MASK;
2728 addr &= TARGET_PAGE_MASK;
2730 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2731 place_needed = false;
2732 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2733 block = ram_block_from_stream(f, flags);
2735 host = host_from_ram_block_offset(block, addr);
2736 if (!host) {
2737 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2738 ret = -EINVAL;
2739 break;
2741 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2743 * Postcopy requires that we place whole host pages atomically;
2744 * these may be huge pages for RAMBlocks that are backed by
2745 * hugetlbfs.
2746 * To make it atomic, the data is read into a temporary page
2747 * that's moved into place later.
2748 * The migration protocol uses, possibly smaller, target-pages
2749 * however the source ensures it always sends all the components
2750 * of a host page in order.
2752 page_buffer = postcopy_host_page +
2753 ((uintptr_t)host & (block->page_size - 1));
2754 /* If all TP are zero then we can optimise the place */
2755 if (!((uintptr_t)host & (block->page_size - 1))) {
2756 all_zero = true;
2757 } else {
2758 /* not the 1st TP within the HP */
2759 if (host != (last_host + TARGET_PAGE_SIZE)) {
2760 error_report("Non-sequential target page %p/%p",
2761 host, last_host);
2762 ret = -EINVAL;
2763 break;
2769 * If it's the last part of a host page then we place the host
2770 * page
2772 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2773 (block->page_size - 1)) == 0;
2774 place_source = postcopy_host_page;
2776 last_host = host;
2778 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2779 case RAM_SAVE_FLAG_ZERO:
2780 ch = qemu_get_byte(f);
2781 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2782 if (ch) {
2783 all_zero = false;
2785 break;
2787 case RAM_SAVE_FLAG_PAGE:
2788 all_zero = false;
2789 if (!place_needed || !matching_page_sizes) {
2790 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2791 } else {
2792 /* Avoids the qemu_file copy during postcopy, which is
2793 * going to do a copy later; can only do it when we
2794 * do this read in one go (matching page sizes)
2796 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2797 TARGET_PAGE_SIZE);
2799 break;
2800 case RAM_SAVE_FLAG_EOS:
2801 /* normal exit */
2802 break;
2803 default:
2804 error_report("Unknown combination of migration flags: %#x"
2805 " (postcopy mode)", flags);
2806 ret = -EINVAL;
2807 break;
2810 /* Detect for any possible file errors */
2811 if (!ret && qemu_file_get_error(f)) {
2812 ret = qemu_file_get_error(f);
2815 if (!ret && place_needed) {
2816 /* This gets called at the last target page in the host page */
2817 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2819 if (all_zero) {
2820 ret = postcopy_place_page_zero(mis, place_dest,
2821 block);
2822 } else {
2823 ret = postcopy_place_page(mis, place_dest,
2824 place_source, block);
2829 return ret;
2832 static bool postcopy_is_advised(void)
2834 PostcopyState ps = postcopy_state_get();
2835 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2838 static bool postcopy_is_running(void)
2840 PostcopyState ps = postcopy_state_get();
2841 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2844 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2846 int flags = 0, ret = 0, invalid_flags = 0;
2847 static uint64_t seq_iter;
2848 int len = 0;
2850 * If system is running in postcopy mode, page inserts to host memory must
2851 * be atomic
2853 bool postcopy_running = postcopy_is_running();
2854 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2855 bool postcopy_advised = postcopy_is_advised();
2857 seq_iter++;
2859 if (version_id != 4) {
2860 ret = -EINVAL;
2863 if (!migrate_use_compression()) {
2864 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2866 /* This RCU critical section can be very long running.
2867 * When RCU reclaims in the code start to become numerous,
2868 * it will be necessary to reduce the granularity of this
2869 * critical section.
2871 rcu_read_lock();
2873 if (postcopy_running) {
2874 ret = ram_load_postcopy(f);
2877 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2878 ram_addr_t addr, total_ram_bytes;
2879 void *host = NULL;
2880 uint8_t ch;
2882 addr = qemu_get_be64(f);
2883 flags = addr & ~TARGET_PAGE_MASK;
2884 addr &= TARGET_PAGE_MASK;
2886 if (flags & invalid_flags) {
2887 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2888 error_report("Received an unexpected compressed page");
2891 ret = -EINVAL;
2892 break;
2895 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2896 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2897 RAMBlock *block = ram_block_from_stream(f, flags);
2899 host = host_from_ram_block_offset(block, addr);
2900 if (!host) {
2901 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2902 ret = -EINVAL;
2903 break;
2905 ramblock_recv_bitmap_set(block, host);
2906 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2909 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2910 case RAM_SAVE_FLAG_MEM_SIZE:
2911 /* Synchronize RAM block list */
2912 total_ram_bytes = addr;
2913 while (!ret && total_ram_bytes) {
2914 RAMBlock *block;
2915 char id[256];
2916 ram_addr_t length;
2918 len = qemu_get_byte(f);
2919 qemu_get_buffer(f, (uint8_t *)id, len);
2920 id[len] = 0;
2921 length = qemu_get_be64(f);
2923 block = qemu_ram_block_by_name(id);
2924 if (block) {
2925 if (length != block->used_length) {
2926 Error *local_err = NULL;
2928 ret = qemu_ram_resize(block, length,
2929 &local_err);
2930 if (local_err) {
2931 error_report_err(local_err);
2934 /* For postcopy we need to check hugepage sizes match */
2935 if (postcopy_advised &&
2936 block->page_size != qemu_host_page_size) {
2937 uint64_t remote_page_size = qemu_get_be64(f);
2938 if (remote_page_size != block->page_size) {
2939 error_report("Mismatched RAM page size %s "
2940 "(local) %zd != %" PRId64,
2941 id, block->page_size,
2942 remote_page_size);
2943 ret = -EINVAL;
2946 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2947 block->idstr);
2948 } else {
2949 error_report("Unknown ramblock \"%s\", cannot "
2950 "accept migration", id);
2951 ret = -EINVAL;
2954 total_ram_bytes -= length;
2956 break;
2958 case RAM_SAVE_FLAG_ZERO:
2959 ch = qemu_get_byte(f);
2960 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2961 break;
2963 case RAM_SAVE_FLAG_PAGE:
2964 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2965 break;
2967 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2968 len = qemu_get_be32(f);
2969 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2970 error_report("Invalid compressed data length: %d", len);
2971 ret = -EINVAL;
2972 break;
2974 decompress_data_with_multi_threads(f, host, len);
2975 break;
2977 case RAM_SAVE_FLAG_XBZRLE:
2978 if (load_xbzrle(f, addr, host) < 0) {
2979 error_report("Failed to decompress XBZRLE page at "
2980 RAM_ADDR_FMT, addr);
2981 ret = -EINVAL;
2982 break;
2984 break;
2985 case RAM_SAVE_FLAG_EOS:
2986 /* normal exit */
2987 break;
2988 default:
2989 if (flags & RAM_SAVE_FLAG_HOOK) {
2990 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2991 } else {
2992 error_report("Unknown combination of migration flags: %#x",
2993 flags);
2994 ret = -EINVAL;
2997 if (!ret) {
2998 ret = qemu_file_get_error(f);
3002 wait_for_decompress_done();
3003 rcu_read_unlock();
3004 trace_ram_load_complete(ret, seq_iter);
3005 return ret;
/*
 * has_postcopy handler for RAM: reports whether postcopy-ram is enabled.
 *
 * @opaque: handler opaque pointer (not used here)
 */
static bool ram_has_postcopy(void *opaque)
{
    bool postcopy_enabled = migrate_postcopy_ram();

    return postcopy_enabled;
}
3013 static SaveVMHandlers savevm_ram_handlers = {
3014 .save_setup = ram_save_setup,
3015 .save_live_iterate = ram_save_iterate,
3016 .save_live_complete_postcopy = ram_save_complete,
3017 .save_live_complete_precopy = ram_save_complete,
3018 .has_postcopy = ram_has_postcopy,
3019 .save_live_pending = ram_save_pending,
3020 .load_state = ram_load,
3021 .save_cleanup = ram_save_cleanup,
3022 .load_setup = ram_load_setup,
3023 .load_cleanup = ram_load_cleanup,
3026 void ram_mig_init(void)
3028 qemu_mutex_init(&XBZRLE.lock);
3029 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);