hw: virtio-pmem: detach the element from the virtqueue when an error occurs
[qemu/ar7.git] / migration / ram.c
blob 433489d6332212384d75e854d6ebed547f086656
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
60 /***********************************************************/
61 /* ram save/restore */
63 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
64 * worked for pages that were filled with the same char. We switched
65 * it to only search for the zero value. And to avoid confusion with
66 * RAM_SAVE_FLAG_COMPRESS_PAGE we just renamed it.
69 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
70 #define RAM_SAVE_FLAG_ZERO 0x02
71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
72 #define RAM_SAVE_FLAG_PAGE 0x08
73 #define RAM_SAVE_FLAG_EOS 0x10
74 #define RAM_SAVE_FLAG_CONTINUE 0x20
75 #define RAM_SAVE_FLAG_XBZRLE 0x40
76 /* 0x80 is reserved in migration.h start with 0x100 next */
77 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
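/*
 * Editor's note: an illustrative sketch, not part of the original file. The
 * flags above are OR'ed into the low bits of the page offset written by
 * save_page_header(), which works because RAM offsets are target-page
 * aligned. For example, a zero page at block offset 0x2000 that continues
 * the previously announced block would be described on the wire as
 *
 *     qemu_put_be64(f, 0x2000 | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE);
 *
 * i.e. the 64-bit value 0x2022.
 */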
79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 return buffer_is_zero(p, size);
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle())
105 qemu_mutex_lock(&XBZRLE.lock);
108 static void XBZRLE_cache_unlock(void)
110 if (migrate_use_xbzrle())
111 qemu_mutex_unlock(&XBZRLE.lock);
115 * xbzrle_cache_resize: resize the xbzrle cache
117 * This function is called from qmp_migrate_set_cache_size in main
118 * thread, possibly while a migration is in progress. A running
119 * migration may be using the cache and might finish during this call,
120 * hence changes to the cache are protected by XBZRLE.lock().
122 * Returns 0 for success or -1 for error
124 * @new_size: new cache size
125 * @errp: set *errp if the check failed, with reason
127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 PageCache *new_cache;
130 int64_t ret = 0;
132 /* Check for truncation */
133 if (new_size != (size_t)new_size) {
134 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135 "exceeding address space");
136 return -1;
139 if (new_size == migrate_xbzrle_cache_size()) {
140 /* nothing to do */
141 return 0;
144 XBZRLE_cache_lock();
146 if (XBZRLE.cache != NULL) {
147 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
148 if (!new_cache) {
149 ret = -1;
150 goto out;
153 cache_fini(XBZRLE.cache);
154 XBZRLE.cache = new_cache;
156 out:
157 XBZRLE_cache_unlock();
158 return ret;
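/*
 * Editor's note: a minimal usage sketch, not part of the original file,
 * loosely modelled on the QMP caller mentioned above; the variable names
 * (new_size, errp) are illustrative.
 *
 *     Error *local_err = NULL;
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */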
161 bool ramblock_is_ignored(RAMBlock *block)
163 return !qemu_ram_is_migratable(block) ||
164 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167 #undef RAMBLOCK_FOREACH
169 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
171 RAMBlock *block;
172 int ret = 0;
174 RCU_READ_LOCK_GUARD();
176 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
177 ret = func(block, opaque);
178 if (ret) {
179 break;
182 return ret;
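/*
 * Editor's note: an illustrative sketch, not part of the original file. A
 * caller passes a RAMBlockIterFunc; returning non-zero from the callback
 * stops the iteration early. The helper below is hypothetical.
 *
 *     static int sum_used_length(RAMBlock *rb, void *opaque)
 *     {
 *         *(uint64_t *)opaque += rb->used_length;
 *         return 0;    (keep iterating)
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(sum_used_length, &total);
 */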
185 static void ramblock_recv_map_init(void)
187 RAMBlock *rb;
189 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
190 assert(!rb->receivedmap);
191 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
195 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
197 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
198 rb->receivedmap);
201 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
203 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
208 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
212 size_t nr)
214 bitmap_set_atomic(rb->receivedmap,
215 ramblock_recv_bitmap_offset(host_addr, rb),
216 nr);
219 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
224 * Returns >0 if success with sent bytes, or <0 if error.
226 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
227 const char *block_name)
229 RAMBlock *block = qemu_ram_block_by_name(block_name);
230 unsigned long *le_bitmap, nbits;
231 uint64_t size;
233 if (!block) {
234 error_report("%s: invalid block name: %s", __func__, block_name);
235 return -1;
238 nbits = block->used_length >> TARGET_PAGE_BITS;
241 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
242 * machines we may need 4 more bytes for padding (see below
243 * comment). So extend it a bit beforehand.
245 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
248 * Always use little endian when sending the bitmap. This is
249 * required when the source and destination VMs are not using the
250 * same endianness. (Note: big endian won't work.)
252 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
254 /* Size of the bitmap, in bytes */
255 size = DIV_ROUND_UP(nbits, 8);
258 * size is always aligned to 8 bytes for 64bit machines, but it
259 * may not be true for 32bit machines. We need this padding to
260 * make sure the migration can survive even between 32bit and
261 * 64bit machines.
263 size = ROUND_UP(size, 8);
265 qemu_put_be64(file, size);
266 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
268 * Mark as an end, in case the middle part is screwed up due to
269 * some "mysterious" reason.
271 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
272 qemu_fflush(file);
274 g_free(le_bitmap);
276 if (qemu_file_get_error(file)) {
277 return qemu_file_get_error(file);
280 return size + sizeof(size);
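/*
 * Editor's note: an illustrative summary, not part of the original file. The
 * stream written above for one RAMBlock is
 *
 *     8 bytes   be64 size (bitmap bytes, rounded up to a multiple of 8)
 *     size      little-endian receivedmap bitmap
 *     8 bytes   be64 RAMBLOCK_RECV_BITMAP_ENDING (sanity marker)
 *
 * and the function returns size + 8 on success.
 */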
284 * An outstanding page request, on the source, having been received
285 * and queued
287 struct RAMSrcPageRequest {
288 RAMBlock *rb;
289 hwaddr offset;
290 hwaddr len;
292 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
295 /* State of RAM for migration */
296 struct RAMState {
297 /* QEMUFile used for this migration */
298 QEMUFile *f;
299 /* Last block that we have visited searching for dirty pages */
300 RAMBlock *last_seen_block;
301 /* Last block from where we have sent data */
302 RAMBlock *last_sent_block;
303 /* Last dirty target page we have sent */
304 ram_addr_t last_page;
305 /* last ram version we have seen */
306 uint32_t last_version;
307 /* We are in the first round */
308 bool ram_bulk_stage;
309 /* The free page optimization is enabled */
310 bool fpo_enabled;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
327 /* compression statistics since the beginning of the period */
329 * number of times there was no free thread to compress data
329 uint64_t compress_thread_busy_prev;
331 * amount of bytes after compression
331 uint64_t compressed_size_prev;
332 /* amount of compressed pages */
333 uint64_t compress_pages_prev;
335 /* total handled target pages at the beginning of period */
336 uint64_t target_page_count_prev;
337 /* total handled target pages since start */
338 uint64_t target_page_count;
339 /* number of dirty bits in the bitmap */
340 uint64_t migration_dirty_pages;
341 /* Protects modification of the bitmap and migration dirty pages */
342 QemuMutex bitmap_mutex;
343 /* The RAMBlock used in the last src_page_requests */
344 RAMBlock *last_req_rb;
345 /* Queue of outstanding page requests from the destination */
346 QemuMutex src_page_req_mutex;
347 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
349 typedef struct RAMState RAMState;
351 static RAMState *ram_state;
353 static NotifierWithReturnList precopy_notifier_list;
355 void precopy_infrastructure_init(void)
357 notifier_with_return_list_init(&precopy_notifier_list);
360 void precopy_add_notifier(NotifierWithReturn *n)
362 notifier_with_return_list_add(&precopy_notifier_list, n);
365 void precopy_remove_notifier(NotifierWithReturn *n)
367 notifier_with_return_remove(n);
370 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
372 PrecopyNotifyData pnd;
373 pnd.reason = reason;
374 pnd.errp = errp;
376 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
379 void precopy_enable_free_page_optimization(void)
381 if (!ram_state) {
382 return;
385 ram_state->fpo_enabled = true;
388 uint64_t ram_bytes_remaining(void)
390 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
394 MigrationStats ram_counters;
396 /* used by the search for pages to send */
397 struct PageSearchStatus {
398 /* Current block being searched */
399 RAMBlock *block;
400 /* Current page to search from */
401 unsigned long page;
402 /* Set once we wrap around */
403 bool complete_round;
405 typedef struct PageSearchStatus PageSearchStatus;
407 CompressionStats compression_counters;
409 struct CompressParam {
410 bool done;
411 bool quit;
412 bool zero_page;
413 QEMUFile *file;
414 QemuMutex mutex;
415 QemuCond cond;
416 RAMBlock *block;
417 ram_addr_t offset;
419 /* internally used fields */
420 z_stream stream;
421 uint8_t *originbuf;
423 typedef struct CompressParam CompressParam;
425 struct DecompressParam {
426 bool done;
427 bool quit;
428 QemuMutex mutex;
429 QemuCond cond;
430 void *des;
431 uint8_t *compbuf;
432 int len;
433 z_stream stream;
435 typedef struct DecompressParam DecompressParam;
437 static CompressParam *comp_param;
438 static QemuThread *compress_threads;
439 /* comp_done_cond is used to wake up the migration thread when
440 * one of the compression threads has finished the compression.
441 * comp_done_lock is used together with comp_done_cond.
443 static QemuMutex comp_done_lock;
444 static QemuCond comp_done_cond;
445 /* The empty QEMUFileOps will be used by file in CompressParam */
446 static const QEMUFileOps empty_ops = { };
448 static QEMUFile *decomp_file;
449 static DecompressParam *decomp_param;
450 static QemuThread *decompress_threads;
451 static QemuMutex decomp_done_lock;
452 static QemuCond decomp_done_cond;
454 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
455 ram_addr_t offset, uint8_t *source_buf);
457 static void *do_data_compress(void *opaque)
459 CompressParam *param = opaque;
460 RAMBlock *block;
461 ram_addr_t offset;
462 bool zero_page;
464 qemu_mutex_lock(&param->mutex);
465 while (!param->quit) {
466 if (param->block) {
467 block = param->block;
468 offset = param->offset;
469 param->block = NULL;
470 qemu_mutex_unlock(&param->mutex);
472 zero_page = do_compress_ram_page(param->file, &param->stream,
473 block, offset, param->originbuf);
475 qemu_mutex_lock(&comp_done_lock);
476 param->done = true;
477 param->zero_page = zero_page;
478 qemu_cond_signal(&comp_done_cond);
479 qemu_mutex_unlock(&comp_done_lock);
481 qemu_mutex_lock(&param->mutex);
482 } else {
483 qemu_cond_wait(&param->cond, &param->mutex);
486 qemu_mutex_unlock(&param->mutex);
488 return NULL;
491 static void compress_threads_save_cleanup(void)
493 int i, thread_count;
495 if (!migrate_use_compression() || !comp_param) {
496 return;
499 thread_count = migrate_compress_threads();
500 for (i = 0; i < thread_count; i++) {
502 * we use it as an indicator which shows if the thread is
503 * properly init'd or not
505 if (!comp_param[i].file) {
506 break;
509 qemu_mutex_lock(&comp_param[i].mutex);
510 comp_param[i].quit = true;
511 qemu_cond_signal(&comp_param[i].cond);
512 qemu_mutex_unlock(&comp_param[i].mutex);
514 qemu_thread_join(compress_threads + i);
515 qemu_mutex_destroy(&comp_param[i].mutex);
516 qemu_cond_destroy(&comp_param[i].cond);
517 deflateEnd(&comp_param[i].stream);
518 g_free(comp_param[i].originbuf);
519 qemu_fclose(comp_param[i].file);
520 comp_param[i].file = NULL;
522 qemu_mutex_destroy(&comp_done_lock);
523 qemu_cond_destroy(&comp_done_cond);
524 g_free(compress_threads);
525 g_free(comp_param);
526 compress_threads = NULL;
527 comp_param = NULL;
530 static int compress_threads_save_setup(void)
532 int i, thread_count;
534 if (!migrate_use_compression()) {
535 return 0;
537 thread_count = migrate_compress_threads();
538 compress_threads = g_new0(QemuThread, thread_count);
539 comp_param = g_new0(CompressParam, thread_count);
540 qemu_cond_init(&comp_done_cond);
541 qemu_mutex_init(&comp_done_lock);
542 for (i = 0; i < thread_count; i++) {
543 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
544 if (!comp_param[i].originbuf) {
545 goto exit;
548 if (deflateInit(&comp_param[i].stream,
549 migrate_compress_level()) != Z_OK) {
550 g_free(comp_param[i].originbuf);
551 goto exit;
554 /* comp_param[i].file is just used as a dummy buffer to save data,
555 * set its ops to empty.
557 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
558 comp_param[i].done = true;
559 comp_param[i].quit = false;
560 qemu_mutex_init(&comp_param[i].mutex);
561 qemu_cond_init(&comp_param[i].cond);
562 qemu_thread_create(compress_threads + i, "compress",
563 do_data_compress, comp_param + i,
564 QEMU_THREAD_JOINABLE);
566 return 0;
568 exit:
569 compress_threads_save_cleanup();
570 return -1;
574 * save_page_header: write page header to wire
576 * If this is the 1st block, it also writes the block identification
578 * Returns the number of bytes written
580 * @f: QEMUFile where to send the data
581 * @block: block that contains the page we want to send
582 * @offset: offset inside the block for the page
583 * in the lower bits, it contains flags
585 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
586 ram_addr_t offset)
588 size_t size, len;
590 if (block == rs->last_sent_block) {
591 offset |= RAM_SAVE_FLAG_CONTINUE;
593 qemu_put_be64(f, offset);
594 size = 8;
596 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
597 len = strlen(block->idstr);
598 qemu_put_byte(f, len);
599 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
600 size += 1 + len;
601 rs->last_sent_block = block;
603 return size;
607 * mig_throttle_guest_down: throttle down the guest
609 * Reduce amount of guest cpu execution to hopefully slow down memory
610 * writes. If guest dirty memory rate is reduced below the rate at
611 * which we can transfer pages to the destination then we should be
612 * able to complete migration. Some workloads dirty memory way too
613 * fast and will not effectively converge, even with auto-converge.
615 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
616 uint64_t bytes_dirty_threshold)
618 MigrationState *s = migrate_get_current();
619 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
620 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
621 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
622 int pct_max = s->parameters.max_cpu_throttle;
624 uint64_t throttle_now = cpu_throttle_get_percentage();
625 uint64_t cpu_now, cpu_ideal, throttle_inc;
627 /* We have not started throttling yet. Let's start it. */
628 if (!cpu_throttle_active()) {
629 cpu_throttle_set(pct_initial);
630 } else {
631 /* Throttling already on, just increase the rate */
632 if (!pct_tailslow) {
633 throttle_inc = pct_increment;
634 } else {
635 /* Compute the ideal CPU percentage used by the guest, which may
636 * make the dirty rate match the dirty rate threshold. */
637 cpu_now = 100 - throttle_now;
638 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
639 bytes_dirty_period);
640 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
642 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
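/*
 * Editor's note: a worked example of the tailslow path above, not part of
 * the original file. Assume throttle_now = 40, pct_increment = 10,
 * bytes_dirty_period = 400MB and bytes_dirty_threshold = 200MB:
 *
 *     cpu_now      = 100 - 40               = 60
 *     cpu_ideal    = 60 * (200MB / 400MB)   = 30
 *     throttle_inc = MIN(60 - 30, 10)       = 10
 *
 * so the throttle is raised to MIN(40 + 10, pct_max) = 50 (assuming a
 * pct_max of 99).
 */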
647 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
649 * @rs: current RAM state
650 * @current_addr: address for the zero page
652 * Update the xbzrle cache to reflect a page that's been sent as all 0.
653 * The important thing is that a stale (not-yet-0'd) page be replaced
654 * by the new data.
655 * As a bonus, if the page wasn't in the cache it gets added so that
656 * when a small write is made into the 0'd page it gets XBZRLE sent.
658 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
660 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
661 return;
664 /* We don't care if this fails to allocate a new cache page
665 * as long as it updated an old one */
666 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
667 ram_counters.dirty_sync_count);
670 #define ENCODING_FLAG_XBZRLE 0x1
673 * save_xbzrle_page: compress and send current page
675 * Returns: 1 means that we wrote the page
676 * 0 means that page is identical to the one already sent
677 * -1 means that xbzrle would be longer than normal
679 * @rs: current RAM state
680 * @current_data: pointer to the address of the page contents
681 * @current_addr: addr of the page
682 * @block: block that contains the page we want to send
683 * @offset: offset inside the block for the page
684 * @last_stage: if we are at the completion stage
686 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
687 ram_addr_t current_addr, RAMBlock *block,
688 ram_addr_t offset, bool last_stage)
690 int encoded_len = 0, bytes_xbzrle;
691 uint8_t *prev_cached_page;
693 if (!cache_is_cached(XBZRLE.cache, current_addr,
694 ram_counters.dirty_sync_count)) {
695 xbzrle_counters.cache_miss++;
696 if (!last_stage) {
697 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
698 ram_counters.dirty_sync_count) == -1) {
699 return -1;
700 } else {
701 /* update *current_data when the page has been
702 inserted into cache */
703 *current_data = get_cached_data(XBZRLE.cache, current_addr);
706 return -1;
710 * Reaching here means the page has hit the xbzrle cache, no matter what
711 * encoding result it is (normal encoding, overflow or skipping the page),
712 * count the page as encoded. This is used to calculate the encoding rate.
714 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
715 * 2nd page turns out to be skipped (i.e. no new bytes written to the
716 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
717 * skipped page included. In this way, the encoding rate can tell if the
718 * guest page is good for xbzrle encoding.
720 xbzrle_counters.pages++;
721 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
723 /* save current buffer into memory */
724 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
726 /* XBZRLE encoding (if there is no overflow) */
727 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
728 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
729 TARGET_PAGE_SIZE);
732 * Update the cache contents, so that it corresponds to the data
733 * sent, in all cases except where we skip the page.
735 if (!last_stage && encoded_len != 0) {
736 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
738 * In the case where we couldn't compress, ensure that the caller
739 * sends the data from the cache, since the guest might have
740 * changed the RAM since we copied it.
742 *current_data = prev_cached_page;
745 if (encoded_len == 0) {
746 trace_save_xbzrle_page_skipping();
747 return 0;
748 } else if (encoded_len == -1) {
749 trace_save_xbzrle_page_overflow();
750 xbzrle_counters.overflow++;
751 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
752 return -1;
755 /* Send XBZRLE based compressed page */
756 bytes_xbzrle = save_page_header(rs, rs->f, block,
757 offset | RAM_SAVE_FLAG_XBZRLE);
758 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
759 qemu_put_be16(rs->f, encoded_len);
760 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
761 bytes_xbzrle += encoded_len + 1 + 2;
763 * Like compressed_size (please see update_compress_thread_counts),
764 * the xbzrle encoded bytes don't count the 8 byte header with
765 * RAM_SAVE_FLAG_CONTINUE.
767 xbzrle_counters.bytes += bytes_xbzrle - 8;
768 ram_counters.transferred += bytes_xbzrle;
770 return 1;
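/*
 * Editor's note: an illustrative summary of the XBZRLE wire layout produced
 * above, not part of the original file:
 *
 *     save_page_header()      8 bytes (+ block id on the first page of a block)
 *     ENCODING_FLAG_XBZRLE    1 byte
 *     encoded_len             2 bytes, big endian
 *     XBZRLE-encoded data     encoded_len bytes
 *
 * which matches the "encoded_len + 1 + 2" accounting added to bytes_xbzrle.
 */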
774 * migration_bitmap_find_dirty: find the next dirty page from start
776 * Returns the page offset within memory region of the start of a dirty page
778 * @rs: current RAM state
779 * @rb: RAMBlock where to search for dirty pages
780 * @start: page where we start the search
782 static inline
783 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
784 unsigned long start)
786 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
787 unsigned long *bitmap = rb->bmap;
788 unsigned long next;
790 if (ramblock_is_ignored(rb)) {
791 return size;
795 * When the free page optimization is enabled, we need to check the bitmap
796 * to send the non-free pages rather than all the pages in the bulk stage.
798 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
799 next = start + 1;
800 } else {
801 next = find_next_bit(bitmap, size, start);
804 return next;
807 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
808 RAMBlock *rb,
809 unsigned long page)
811 bool ret;
813 qemu_mutex_lock(&rs->bitmap_mutex);
816 * Clear dirty bitmap if needed. This _must_ be called before we
817 * send any of the page in the chunk because we need to make sure
818 * we can capture further page content changes when we sync dirty
819 * log the next time. So as long as we are going to send any of
820 * the page in the chunk we clear the remote dirty bitmap for all.
821 * Clearing it earlier won't be a problem, but too late will.
823 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
824 uint8_t shift = rb->clear_bmap_shift;
825 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
826 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
829 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
830 * can make things easier sometimes since then start address
831 * of the small chunk will always be 64 pages aligned so the
832 * bitmap will always be aligned to unsigned long. We should
833 * even be able to remove this restriction but I'm simply
834 * keeping it.
836 assert(shift >= 6);
837 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
838 memory_region_clear_dirty_bitmap(rb->mr, start, size);
841 ret = test_and_clear_bit(page, rb->bmap);
843 if (ret) {
844 rs->migration_dirty_pages--;
846 qemu_mutex_unlock(&rs->bitmap_mutex);
848 return ret;
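/*
 * Editor's note: a worked example, not part of the original file. Assuming
 * a 4KB target page and a clear_bmap_shift of 18 (the usual default), each
 * clear_bmap bit covers 1ULL << (12 + 18) = 1GB of guest memory, so the
 * remote dirty bitmap is cleared in 1GB chunks the first time any page
 * inside the chunk is about to be sent.
 */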
851 /* Called with RCU critical section */
852 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
854 uint64_t new_dirty_pages =
855 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
857 rs->migration_dirty_pages += new_dirty_pages;
858 rs->num_dirty_pages_period += new_dirty_pages;
862 * ram_pagesize_summary: calculate all the pagesizes of a VM
864 * Returns a summary bitmap of the page sizes of all RAMBlocks
866 * For VMs with just normal pages this is equivalent to the host page
867 * size. If it's got some huge pages then it's the OR of all the
868 * different page sizes.
870 uint64_t ram_pagesize_summary(void)
872 RAMBlock *block;
873 uint64_t summary = 0;
875 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
876 summary |= block->page_size;
879 return summary;
882 uint64_t ram_get_total_transferred_pages(void)
884 return ram_counters.normal + ram_counters.duplicate +
885 compression_counters.pages + xbzrle_counters.pages;
888 static void migration_update_rates(RAMState *rs, int64_t end_time)
890 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
891 double compressed_size;
893 /* calculate period counters */
894 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
895 / (end_time - rs->time_last_bitmap_sync);
897 if (!page_count) {
898 return;
901 if (migrate_use_xbzrle()) {
902 double encoded_size, unencoded_size;
904 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
905 rs->xbzrle_cache_miss_prev) / page_count;
906 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
907 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
908 TARGET_PAGE_SIZE;
909 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
910 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
911 xbzrle_counters.encoding_rate = 0;
912 } else {
913 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
915 rs->xbzrle_pages_prev = xbzrle_counters.pages;
916 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
919 if (migrate_use_compression()) {
920 compression_counters.busy_rate = (double)(compression_counters.busy -
921 rs->compress_thread_busy_prev) / page_count;
922 rs->compress_thread_busy_prev = compression_counters.busy;
924 compressed_size = compression_counters.compressed_size -
925 rs->compressed_size_prev;
926 if (compressed_size) {
927 double uncompressed_size = (compression_counters.pages -
928 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
930 /* Compression-Ratio = Uncompressed-size / Compressed-size */
931 compression_counters.compression_rate =
932 uncompressed_size / compressed_size;
934 rs->compress_pages_prev = compression_counters.pages;
935 rs->compressed_size_prev = compression_counters.compressed_size;
940 static void migration_trigger_throttle(RAMState *rs)
942 MigrationState *s = migrate_get_current();
943 uint64_t threshold = s->parameters.throttle_trigger_threshold;
945 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
946 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
947 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
949 /* During block migration the auto-converge logic incorrectly detects
950 * that ram migration makes no progress. Avoid this by disabling the
951 * throttling logic during the bulk phase of block migration. */
952 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
953 /* The following detection logic can be refined later. For now:
954 Check to see if the ratio between dirtied bytes and the approx.
955 amount of bytes that just got transferred since the last time
956 we were in this routine reaches the threshold. If that happens
957 twice, start or increase throttling. */
959 if ((bytes_dirty_period > bytes_dirty_threshold) &&
960 (++rs->dirty_rate_high_cnt >= 2)) {
961 trace_migration_throttle();
962 rs->dirty_rate_high_cnt = 0;
963 mig_throttle_guest_down(bytes_dirty_period,
964 bytes_dirty_threshold);
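/*
 * Editor's note: a worked example, not part of the original file. With a
 * throttle_trigger_threshold of 50 and 1GB transferred during the last
 * period, bytes_dirty_threshold = 1GB * 50 / 100 = 512MB; if the guest
 * dirties more than 512MB in two consecutive periods, auto-converge starts
 * (or increases) CPU throttling via mig_throttle_guest_down().
 */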
969 static void migration_bitmap_sync(RAMState *rs)
971 RAMBlock *block;
972 int64_t end_time;
974 ram_counters.dirty_sync_count++;
976 if (!rs->time_last_bitmap_sync) {
977 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
980 trace_migration_bitmap_sync_start();
981 memory_global_dirty_log_sync();
983 qemu_mutex_lock(&rs->bitmap_mutex);
984 WITH_RCU_READ_LOCK_GUARD() {
985 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
986 ramblock_sync_dirty_bitmap(rs, block);
988 ram_counters.remaining = ram_bytes_remaining();
990 qemu_mutex_unlock(&rs->bitmap_mutex);
992 memory_global_after_dirty_log_sync();
993 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
995 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
997 /* more than 1 second = 1000 milliseconds */
998 if (end_time > rs->time_last_bitmap_sync + 1000) {
999 migration_trigger_throttle(rs);
1001 migration_update_rates(rs, end_time);
1003 rs->target_page_count_prev = rs->target_page_count;
1005 /* reset period counters */
1006 rs->time_last_bitmap_sync = end_time;
1007 rs->num_dirty_pages_period = 0;
1008 rs->bytes_xfer_prev = ram_counters.transferred;
1010 if (migrate_use_events()) {
1011 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1015 static void migration_bitmap_sync_precopy(RAMState *rs)
1017 Error *local_err = NULL;
1020 * The current notifier usage is just an optimization to migration, so we
1021 * don't stop the normal migration process in the error case.
1023 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1024 error_report_err(local_err);
1025 local_err = NULL;
1028 migration_bitmap_sync(rs);
1030 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1031 error_report_err(local_err);
1036 * save_zero_page_to_file: send the zero page to the file
1038 * Returns the size of data written to the file, 0 means the page is not
1039 * a zero page
1041 * @rs: current RAM state
1042 * @file: the file where the data is saved
1043 * @block: block that contains the page we want to send
1044 * @offset: offset inside the block for the page
1046 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1047 RAMBlock *block, ram_addr_t offset)
1049 uint8_t *p = block->host + offset;
1050 int len = 0;
1052 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1053 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1054 qemu_put_byte(file, 0);
1055 len += 1;
1057 return len;
1061 * save_zero_page: send the zero page to the stream
1063 * Returns the number of pages written.
1065 * @rs: current RAM state
1066 * @block: block that contains the page we want to send
1067 * @offset: offset inside the block for the page
1069 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1071 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1073 if (len) {
1074 ram_counters.duplicate++;
1075 ram_counters.transferred += len;
1076 return 1;
1078 return -1;
1081 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1083 if (!migrate_release_ram() || !migration_in_postcopy()) {
1084 return;
1087 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1091 * @pages: the number of pages written by the control path,
1092 * < 0 - error
1093 * > 0 - number of pages written
1095 * Return true if the page has been saved, otherwise false is returned.
1097 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098 int *pages)
1100 uint64_t bytes_xmit = 0;
1101 int ret;
1103 *pages = -1;
1104 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1105 &bytes_xmit);
1106 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1107 return false;
1110 if (bytes_xmit) {
1111 ram_counters.transferred += bytes_xmit;
1112 *pages = 1;
1115 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1116 return true;
1119 if (bytes_xmit > 0) {
1120 ram_counters.normal++;
1121 } else if (bytes_xmit == 0) {
1122 ram_counters.duplicate++;
1125 return true;
1129 * directly send the page to the stream
1131 * Returns the number of pages written.
1133 * @rs: current RAM state
1134 * @block: block that contains the page we want to send
1135 * @offset: offset inside the block for the page
1136 * @buf: the page to be sent
1137 * @async: send the page asynchronously
1139 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1140 uint8_t *buf, bool async)
1142 ram_counters.transferred += save_page_header(rs, rs->f, block,
1143 offset | RAM_SAVE_FLAG_PAGE);
1144 if (async) {
1145 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1146 migrate_release_ram() &
1147 migration_in_postcopy());
1148 } else {
1149 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1151 ram_counters.transferred += TARGET_PAGE_SIZE;
1152 ram_counters.normal++;
1153 return 1;
1157 * ram_save_page: send the given page to the stream
1159 * Returns the number of pages written.
1160 * < 0 - error
1161 * >=0 - Number of pages written - this might legally be 0
1162 * if xbzrle noticed the page was the same.
1164 * @rs: current RAM state
1165 * @block: block that contains the page we want to send
1166 * @offset: offset inside the block for the page
1167 * @last_stage: if we are at the completion stage
1169 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1171 int pages = -1;
1172 uint8_t *p;
1173 bool send_async = true;
1174 RAMBlock *block = pss->block;
1175 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1176 ram_addr_t current_addr = block->offset + offset;
1178 p = block->host + offset;
1179 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1181 XBZRLE_cache_lock();
1182 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1183 migrate_use_xbzrle()) {
1184 pages = save_xbzrle_page(rs, &p, current_addr, block,
1185 offset, last_stage);
1186 if (!last_stage) {
1187 /* Can't send this cached data async, since the cache page
1188 * might get updated before it gets to the wire
1190 send_async = false;
1194 /* XBZRLE overflow or normal page */
1195 if (pages == -1) {
1196 pages = save_normal_page(rs, block, offset, p, send_async);
1199 XBZRLE_cache_unlock();
1201 return pages;
1204 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1205 ram_addr_t offset)
1207 if (multifd_queue_page(rs->f, block, offset) < 0) {
1208 return -1;
1210 ram_counters.normal++;
1212 return 1;
1215 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1216 ram_addr_t offset, uint8_t *source_buf)
1218 RAMState *rs = ram_state;
1219 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1220 bool zero_page = false;
1221 int ret;
1223 if (save_zero_page_to_file(rs, f, block, offset)) {
1224 zero_page = true;
1225 goto exit;
1228 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1231 * copy it to an internal buffer to avoid it being modified by the VM,
1232 * so that we can catch errors during compression and
1233 * decompression
1235 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1236 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1237 if (ret < 0) {
1238 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1239 error_report("compressed data failed!");
1240 return false;
1243 exit:
1244 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1245 return zero_page;
1248 static void
1249 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1251 ram_counters.transferred += bytes_xmit;
1253 if (param->zero_page) {
1254 ram_counters.duplicate++;
1255 return;
1258 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1259 compression_counters.compressed_size += bytes_xmit - 8;
1260 compression_counters.pages++;
1263 static bool save_page_use_compression(RAMState *rs);
1265 static void flush_compressed_data(RAMState *rs)
1267 int idx, len, thread_count;
1269 if (!save_page_use_compression(rs)) {
1270 return;
1272 thread_count = migrate_compress_threads();
1274 qemu_mutex_lock(&comp_done_lock);
1275 for (idx = 0; idx < thread_count; idx++) {
1276 while (!comp_param[idx].done) {
1277 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1280 qemu_mutex_unlock(&comp_done_lock);
1282 for (idx = 0; idx < thread_count; idx++) {
1283 qemu_mutex_lock(&comp_param[idx].mutex);
1284 if (!comp_param[idx].quit) {
1285 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1287 * it's safe to fetch zero_page without holding comp_done_lock
1288 * as there is no further request submitted to the thread,
1289 * i.e., the thread should be waiting for a request at this point.
1291 update_compress_thread_counts(&comp_param[idx], len);
1293 qemu_mutex_unlock(&comp_param[idx].mutex);
1297 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1298 ram_addr_t offset)
1300 param->block = block;
1301 param->offset = offset;
1304 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1305 ram_addr_t offset)
1307 int idx, thread_count, bytes_xmit = -1, pages = -1;
1308 bool wait = migrate_compress_wait_thread();
1310 thread_count = migrate_compress_threads();
1311 qemu_mutex_lock(&comp_done_lock);
1312 retry:
1313 for (idx = 0; idx < thread_count; idx++) {
1314 if (comp_param[idx].done) {
1315 comp_param[idx].done = false;
1316 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1317 qemu_mutex_lock(&comp_param[idx].mutex);
1318 set_compress_params(&comp_param[idx], block, offset);
1319 qemu_cond_signal(&comp_param[idx].cond);
1320 qemu_mutex_unlock(&comp_param[idx].mutex);
1321 pages = 1;
1322 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1323 break;
1328 * wait for the free thread if the user specifies 'compress-wait-thread',
1329 * otherwise we will post the page out in the main thread as a normal page.
1331 if (pages < 0 && wait) {
1332 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1333 goto retry;
1335 qemu_mutex_unlock(&comp_done_lock);
1337 return pages;
1341 * find_dirty_block: find the next dirty page and update any state
1342 * associated with the search process.
1344 * Returns true if a page is found
1346 * @rs: current RAM state
1347 * @pss: data about the state of the current dirty page scan
1348 * @again: set to false if the search has scanned the whole of RAM
1350 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1352 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1353 if (pss->complete_round && pss->block == rs->last_seen_block &&
1354 pss->page >= rs->last_page) {
1356 * We've been once around the RAM and haven't found anything.
1357 * Give up.
1359 *again = false;
1360 return false;
1362 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1363 >= pss->block->used_length) {
1364 /* Didn't find anything in this RAM Block */
1365 pss->page = 0;
1366 pss->block = QLIST_NEXT_RCU(pss->block, next);
1367 if (!pss->block) {
1369 * If memory migration starts over, we will meet a dirtied page
1370 * which may still exist in the compression threads' ring, so we
1371 * should flush the compressed data to make sure the new page
1372 * is not overwritten by the old one in the destination.
1374 * Also, if xbzrle is on, stop using the data compression at this
1375 * point. In theory, xbzrle can do better than compression.
1377 flush_compressed_data(rs);
1379 /* Hit the end of the list */
1380 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1381 /* Flag that we've looped */
1382 pss->complete_round = true;
1383 rs->ram_bulk_stage = false;
1385 /* Didn't find anything this time, but try again on the new block */
1386 *again = true;
1387 return false;
1388 } else {
1389 /* Can go around again, but... */
1390 *again = true;
1391 /* We've found something so probably don't need to */
1392 return true;
1397 * unqueue_page: gets a page of the queue
1399 * Helper for 'get_queued_page' - gets a page off the queue
1401 * Returns the block of the page (or NULL if none available)
1403 * @rs: current RAM state
1404 * @offset: used to return the offset within the RAMBlock
1406 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1408 RAMBlock *block = NULL;
1410 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1411 return NULL;
1414 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1415 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1416 struct RAMSrcPageRequest *entry =
1417 QSIMPLEQ_FIRST(&rs->src_page_requests);
1418 block = entry->rb;
1419 *offset = entry->offset;
1421 if (entry->len > TARGET_PAGE_SIZE) {
1422 entry->len -= TARGET_PAGE_SIZE;
1423 entry->offset += TARGET_PAGE_SIZE;
1424 } else {
1425 memory_region_unref(block->mr);
1426 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1427 g_free(entry);
1428 migration_consume_urgent_request();
1432 return block;
1436 * get_queued_page: unqueue a page from the postcopy requests
1438 * Skips pages that are already sent (!dirty)
1440 * Returns true if a queued page is found
1442 * @rs: current RAM state
1443 * @pss: data about the state of the current dirty page scan
1445 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1447 RAMBlock *block;
1448 ram_addr_t offset;
1449 bool dirty;
1451 do {
1452 block = unqueue_page(rs, &offset);
1454 * We're sending this page, and since it's postcopy nothing else
1455 * will dirty it, and we must make sure it doesn't get sent again
1456 * even if this queue request was received after the background
1457 * search already sent it.
1459 if (block) {
1460 unsigned long page;
1462 page = offset >> TARGET_PAGE_BITS;
1463 dirty = test_bit(page, block->bmap);
1464 if (!dirty) {
1465 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1466 page);
1467 } else {
1468 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1472 } while (block && !dirty);
1474 if (block) {
1476 * As soon as we start servicing pages out of order, then we have
1477 * to kill the bulk stage, since the bulk stage assumes
1478 * in (migration_bitmap_find_and_reset_dirty) that every page is
1479 * dirty, that's no longer true.
1481 rs->ram_bulk_stage = false;
1484 * We want the background search to continue from the queued page
1485 * since the guest is likely to want other pages near to the page
1486 * it just requested.
1488 pss->block = block;
1489 pss->page = offset >> TARGET_PAGE_BITS;
1492 * This unqueued page would break the "one round" check, even if it
1493 * is really rare.
1495 pss->complete_round = false;
1498 return !!block;
1502 * migration_page_queue_free: drop any remaining pages in the ram
1503 * request queue
1505 * It should be empty at the end anyway, but in error cases there may
1506 * be some left. In case any page is left, we drop it.
1509 static void migration_page_queue_free(RAMState *rs)
1511 struct RAMSrcPageRequest *mspr, *next_mspr;
1512 /* This queue generally should be empty - but in the case of a failed
1513 * migration might have some droppings in.
1515 RCU_READ_LOCK_GUARD();
1516 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1517 memory_region_unref(mspr->rb->mr);
1518 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1519 g_free(mspr);
1524 * ram_save_queue_pages: queue the page for transmission
1526 * A request from postcopy destination for example.
1528 * Returns zero on success or negative on error
1530 * @rbname: Name of the RAMBlock of the request. NULL means the
1531 * same as the last one.
1532 * @start: starting address from the start of the RAMBlock
1533 * @len: length (in bytes) to send
1535 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1537 RAMBlock *ramblock;
1538 RAMState *rs = ram_state;
1540 ram_counters.postcopy_requests++;
1541 RCU_READ_LOCK_GUARD();
1543 if (!rbname) {
1544 /* Reuse last RAMBlock */
1545 ramblock = rs->last_req_rb;
1547 if (!ramblock) {
1549 * Shouldn't happen, we can't reuse the last RAMBlock if
1550 * it's the 1st request.
1552 error_report("ram_save_queue_pages no previous block");
1553 return -1;
1555 } else {
1556 ramblock = qemu_ram_block_by_name(rbname);
1558 if (!ramblock) {
1559 /* We shouldn't be asked for a non-existent RAMBlock */
1560 error_report("ram_save_queue_pages no block '%s'", rbname);
1561 return -1;
1563 rs->last_req_rb = ramblock;
1565 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1566 if (start+len > ramblock->used_length) {
1567 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1568 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1569 __func__, start, len, ramblock->used_length);
1570 return -1;
1573 struct RAMSrcPageRequest *new_entry =
1574 g_malloc0(sizeof(struct RAMSrcPageRequest));
1575 new_entry->rb = ramblock;
1576 new_entry->offset = start;
1577 new_entry->len = len;
1579 memory_region_ref(ramblock->mr);
1580 qemu_mutex_lock(&rs->src_page_req_mutex);
1581 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1582 migration_make_urgent_request();
1583 qemu_mutex_unlock(&rs->src_page_req_mutex);
1585 return 0;
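/*
 * Editor's note: a minimal usage sketch, not part of the original file. The
 * return-path code on the source side would queue a page requested by the
 * destination roughly like this (rbname, start and the error handler are
 * illustrative names):
 *
 *     if (ram_save_queue_pages(rbname, start & TARGET_PAGE_MASK,
 *                              TARGET_PAGE_SIZE) < 0) {
 *         handle_return_path_error(ms);
 *     }
 */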
1588 static bool save_page_use_compression(RAMState *rs)
1590 if (!migrate_use_compression()) {
1591 return false;
1595 * If xbzrle is on, stop using the data compression after first
1596 * round of migration even if compression is enabled. In theory,
1597 * xbzrle can do better than compression.
1599 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1600 return true;
1603 return false;
1607 * try to compress the page before posting it out, return true if the page
1608 * has been properly handled by compression, otherwise needs other
1609 * paths to handle it
1611 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1613 if (!save_page_use_compression(rs)) {
1614 return false;
1618 * When starting the process of a new block, the first page of
1619 * the block should be sent out before other pages in the same
1620 * block, and all the pages in the last block should have been sent
1621 * out, keeping this order is important, because the 'cont' flag
1622 * is used to avoid resending the block name.
1624 * We post the first page as a normal page since compression will take
1625 * much CPU resource.
1627 if (block != rs->last_sent_block) {
1628 flush_compressed_data(rs);
1629 return false;
1632 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1633 return true;
1636 compression_counters.busy++;
1637 return false;
1641 * ram_save_target_page: save one target page
1643 * Returns the number of pages written
1645 * @rs: current RAM state
1646 * @pss: data about the page we want to send
1647 * @last_stage: if we are at the completion stage
1649 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1650 bool last_stage)
1652 RAMBlock *block = pss->block;
1653 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1654 int res;
1656 if (control_save_page(rs, block, offset, &res)) {
1657 return res;
1660 if (save_compress_page(rs, block, offset)) {
1661 return 1;
1664 res = save_zero_page(rs, block, offset);
1665 if (res > 0) {
1666 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1667 * page would be stale
1669 if (!save_page_use_compression(rs)) {
1670 XBZRLE_cache_lock();
1671 xbzrle_cache_zero_page(rs, block->offset + offset);
1672 XBZRLE_cache_unlock();
1674 ram_release_pages(block->idstr, offset, res);
1675 return res;
1679 * Do not use multifd for:
1680 * 1. Compression as the first page in the new block should be posted out
1681 * before sending the compressed page
1682 * 2. In postcopy as one whole host page should be placed
1684 if (!save_page_use_compression(rs) && migrate_use_multifd()
1685 && !migration_in_postcopy()) {
1686 return ram_save_multifd_page(rs, block, offset);
1689 return ram_save_page(rs, pss, last_stage);
1693 * ram_save_host_page: save a whole host page
1695 * Starting at *offset send pages up to the end of the current host
1696 * page. It's valid for the initial offset to point into the middle of
1697 * a host page in which case the remainder of the hostpage is sent.
1698 * Only dirty target pages are sent. Note that the host page size may
1699 * be a huge page for this block.
1700 * The saving stops at the boundary of the used_length of the block
1701 * if the RAMBlock isn't a multiple of the host page size.
1703 * Returns the number of pages written or negative on error
1705 * @rs: current RAM state
1706 * @ms: current migration state
1707 * @pss: data about the page we want to send
1708 * @last_stage: if we are at the completion stage
1710 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1711 bool last_stage)
1713 int tmppages, pages = 0;
1714 size_t pagesize_bits =
1715 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1717 if (ramblock_is_ignored(pss->block)) {
1718 error_report("block %s should not be migrated !", pss->block->idstr);
1719 return 0;
1722 do {
1723 /* Check if the page is dirty and if it is, send it */
1724 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1725 pss->page++;
1726 continue;
1729 tmppages = ram_save_target_page(rs, pss, last_stage);
1730 if (tmppages < 0) {
1731 return tmppages;
1734 pages += tmppages;
1735 pss->page++;
1736 /* Allow rate limiting to happen in the middle of huge pages */
1737 migration_rate_limit();
1738 } while ((pss->page & (pagesize_bits - 1)) &&
1739 offset_in_ramblock(pss->block,
1740 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1742 /* The offset we leave with is the last one we looked at */
1743 pss->page--;
1744 return pages;
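/*
 * Editor's note: a worked example, not part of the original file. For a
 * RAMBlock backed by 2MB huge pages with a 4KB target page,
 * pagesize_bits = 2MB / 4KB = 512, so the loop above walks up to 512
 * consecutive target pages and stops when pss->page crosses the huge-page
 * boundary (or the block's used_length), sending only the dirty ones.
 */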
1748 * ram_find_and_save_block: finds a dirty page and sends it to f
1750 * Called within an RCU critical section.
1752 * Returns the number of pages written where zero means no dirty pages,
1753 * or negative on error
1755 * @rs: current RAM state
1756 * @last_stage: if we are at the completion stage
1758 * On systems where host-page-size > target-page-size it will send all the
1759 * pages in a host page that are dirty.
1762 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1764 PageSearchStatus pss;
1765 int pages = 0;
1766 bool again, found;
1768 /* No dirty page as there is zero RAM */
1769 if (!ram_bytes_total()) {
1770 return pages;
1773 pss.block = rs->last_seen_block;
1774 pss.page = rs->last_page;
1775 pss.complete_round = false;
1777 if (!pss.block) {
1778 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1781 do {
1782 again = true;
1783 found = get_queued_page(rs, &pss);
1785 if (!found) {
1786 /* priority queue empty, so just search for something dirty */
1787 found = find_dirty_block(rs, &pss, &again);
1790 if (found) {
1791 pages = ram_save_host_page(rs, &pss, last_stage);
1793 } while (!pages && again);
1795 rs->last_seen_block = pss.block;
1796 rs->last_page = pss.page;
1798 return pages;
1801 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1803 uint64_t pages = size / TARGET_PAGE_SIZE;
1805 if (zero) {
1806 ram_counters.duplicate += pages;
1807 } else {
1808 ram_counters.normal += pages;
1809 ram_counters.transferred += size;
1810 qemu_update_position(f, size);
1814 static uint64_t ram_bytes_total_common(bool count_ignored)
1816 RAMBlock *block;
1817 uint64_t total = 0;
1819 RCU_READ_LOCK_GUARD();
1821 if (count_ignored) {
1822 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1823 total += block->used_length;
1825 } else {
1826 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1827 total += block->used_length;
1830 return total;
1833 uint64_t ram_bytes_total(void)
1835 return ram_bytes_total_common(false);
1838 static void xbzrle_load_setup(void)
1840 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1843 static void xbzrle_load_cleanup(void)
1845 g_free(XBZRLE.decoded_buf);
1846 XBZRLE.decoded_buf = NULL;
1849 static void ram_state_cleanup(RAMState **rsp)
1851 if (*rsp) {
1852 migration_page_queue_free(*rsp);
1853 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1854 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1855 g_free(*rsp);
1856 *rsp = NULL;
1860 static void xbzrle_cleanup(void)
1862 XBZRLE_cache_lock();
1863 if (XBZRLE.cache) {
1864 cache_fini(XBZRLE.cache);
1865 g_free(XBZRLE.encoded_buf);
1866 g_free(XBZRLE.current_buf);
1867 g_free(XBZRLE.zero_target_page);
1868 XBZRLE.cache = NULL;
1869 XBZRLE.encoded_buf = NULL;
1870 XBZRLE.current_buf = NULL;
1871 XBZRLE.zero_target_page = NULL;
1873 XBZRLE_cache_unlock();
1876 static void ram_save_cleanup(void *opaque)
1878 RAMState **rsp = opaque;
1879 RAMBlock *block;
1881 /* the caller should hold the iothread lock or be in a bh, so there is
1882 * no write race against the migration bitmap
1884 memory_global_dirty_log_stop();
1886 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1887 g_free(block->clear_bmap);
1888 block->clear_bmap = NULL;
1889 g_free(block->bmap);
1890 block->bmap = NULL;
1893 xbzrle_cleanup();
1894 compress_threads_save_cleanup();
1895 ram_state_cleanup(rsp);
1898 static void ram_state_reset(RAMState *rs)
1900 rs->last_seen_block = NULL;
1901 rs->last_sent_block = NULL;
1902 rs->last_page = 0;
1903 rs->last_version = ram_list.version;
1904 rs->ram_bulk_stage = true;
1905 rs->fpo_enabled = false;
1908 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1911 * 'expected' is the value you expect the bitmap mostly to be full
1912 * of; it won't bother printing lines that are all this value.
1913 * If 'todump' is null the migration bitmap is dumped.
1915 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1916 unsigned long pages)
1918 int64_t cur;
1919 int64_t linelen = 128;
1920 char linebuf[129];
1922 for (cur = 0; cur < pages; cur += linelen) {
1923 int64_t curb;
1924 bool found = false;
1926 * Last line; catch the case where the line length
1927 * is longer than remaining ram
1929 if (cur + linelen > pages) {
1930 linelen = pages - cur;
1932 for (curb = 0; curb < linelen; curb++) {
1933 bool thisbit = test_bit(cur + curb, todump);
1934 linebuf[curb] = thisbit ? '1' : '.';
1935 found = found || (thisbit != expected);
1937 if (found) {
1938 linebuf[curb] = '\0';
1939 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1944 /* **** functions for postcopy ***** */
1946 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1948 struct RAMBlock *block;
1950 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1951 unsigned long *bitmap = block->bmap;
1952 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1953 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1955 while (run_start < range) {
1956 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1957 ram_discard_range(block->idstr,
1958 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1959 ((ram_addr_t)(run_end - run_start))
1960 << TARGET_PAGE_BITS);
1961 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1967 * postcopy_send_discard_bm_ram: discard a RAMBlock
1969 * Returns zero on success
1971 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1973 * @ms: current migration state
1974 * @block: RAMBlock to discard
1976 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1978 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1979 unsigned long current;
1980 unsigned long *bitmap = block->bmap;
1982 for (current = 0; current < end; ) {
1983 unsigned long one = find_next_bit(bitmap, end, current);
1984 unsigned long zero, discard_length;
1986 if (one >= end) {
1987 break;
1990 zero = find_next_zero_bit(bitmap, end, one + 1);
1992 if (zero >= end) {
1993 discard_length = end - one;
1994 } else {
1995 discard_length = zero - one;
1997 postcopy_discard_send_range(ms, one, discard_length);
1998 current = one + discard_length;
2001 return 0;
2005 * postcopy_each_ram_send_discard: discard all RAMBlocks
2007 * Returns 0 for success or negative for error
2009 * Utility for the outgoing postcopy code.
2010 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2011 * passing it bitmap indexes and name.
2012 * (qemu_ram_foreach_block ends up passing unscaled lengths
2013 * which would mean postcopy code would have to deal with target page)
2015 * @ms: current migration state
2017 static int postcopy_each_ram_send_discard(MigrationState *ms)
2019 struct RAMBlock *block;
2020 int ret;
2022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2023 postcopy_discard_send_init(ms, block->idstr);
2026 * Postcopy sends chunks of bitmap over the wire, but it
2027 * just needs indexes at this point, which avoids it having
2028 * any target-page-specific code.
2030 ret = postcopy_send_discard_bm_ram(ms, block);
2031 postcopy_discard_send_finish(ms);
2032 if (ret) {
2033 return ret;
2037 return 0;
2041 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2043 * Helper for postcopy_chunk_hostpages; it is called once per RAMBlock
2044 * to canonicalize that block's dirty bitmap.
2045 *
2047 * Postcopy requires that all target pages in a host page are dirty or
2048 * clean, not a mix; any partially dirty host page is re-marked fully dirty.
2050 * @ms: current migration state
2051 * @block: block that contains the page we want to canonicalize
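 *
 * Illustrative example: with a 2MB huge page and 4KB target pages
 * (host_ratio == 512), a run of dirty bits that starts or ends in the
 * middle of a huge page causes every target page of that huge page to be
 * re-marked dirty, so the huge page is later handled as a whole.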
2053 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2055 RAMState *rs = ram_state;
2056 unsigned long *bitmap = block->bmap;
2057 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2058 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2059 unsigned long run_start;
2061 if (block->page_size == TARGET_PAGE_SIZE) {
2062 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2063 return;
2066 /* Find a dirty page */
2067 run_start = find_next_bit(bitmap, pages, 0);
2069 while (run_start < pages) {
2072 * If the start of this run of pages is in the middle of a host
2073 * page, then we need to fixup this host page.
2075 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2076 /* Find the end of this run */
2077 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2079 * If the end isn't at the start of a host page, then the
2080 * run doesn't finish at the end of a host page
2081 * and we need to discard.
2085 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2086 unsigned long page;
2087 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2088 host_ratio);
2089 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2091 /* Clean up the bitmap */
2092 for (page = fixup_start_addr;
2093 page < fixup_start_addr + host_ratio; page++) {
2095 * Remark them as dirty, updating the count for any pages
2096 * that weren't previously dirty.
2098 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2102 /* Find the next dirty page for the next iteration */
2103 run_start = find_next_bit(bitmap, pages, run_start);
2108 * postcopy_chunk_hostpages: discard any partially sent host page
2110 * Utility for the outgoing postcopy code.
2112 * Discard any partially sent host-page sized chunks and mark any partially
2113 * dirty host-page sized chunks as fully dirty. Here the host page
2114 * is the host page for the particular RAMBlock, i.e. it might be a huge page.
2116 * Returns zero on success
2118 * @ms: current migration state
2119 * @block: block we want to work with
2121 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2123 postcopy_discard_send_init(ms, block->idstr);
2126 * Ensure that all partially dirty host pages are made fully dirty.
2128 postcopy_chunk_hostpages_pass(ms, block);
2130 postcopy_discard_send_finish(ms);
2131 return 0;
2135 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2137 * Returns zero on success
2139 * Transmit the set of pages to be discarded after precopy to the target;
2140 * these are pages that:
2141 * a) have been previously transmitted but are now dirty again, or
2142 * b) have never been transmitted. This ensures that
2143 * any pages on the destination that have been mapped by background
2144 * tasks get discarded (transparent huge pages are the specific concern).
2145 * Hopefully this set is pretty sparse.
2147 * @ms: current migration state
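 *
 * The sequence below is: do a final bitmap sync while the source is
 * paused, reset the page-search state, canonicalize each block's bitmap
 * to whole host pages, and finally send the per-block discard ranges via
 * postcopy_each_ram_send_discard().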
2149 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2151 RAMState *rs = ram_state;
2152 RAMBlock *block;
2153 int ret;
2155 RCU_READ_LOCK_GUARD();
2157 /* This should be our last sync, the src is now paused */
2158 migration_bitmap_sync(rs);
2160 /* Easiest way to make sure we don't resume in the middle of a host-page */
2161 rs->last_seen_block = NULL;
2162 rs->last_sent_block = NULL;
2163 rs->last_page = 0;
2165 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2166 /* Deal with TPS != HPS and huge pages */
2167 ret = postcopy_chunk_hostpages(ms, block);
2168 if (ret) {
2169 return ret;
2172 #ifdef DEBUG_POSTCOPY
2173 ram_debug_dump_bitmap(block->bmap, true,
2174 block->used_length >> TARGET_PAGE_BITS);
2175 #endif
2177 trace_ram_postcopy_send_discard_bitmap();
2179 return postcopy_each_ram_send_discard(ms);
2183 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2185 * Returns zero on success
2187 * @rbname: name of the RAMBlock of the request. NULL means the
2188 * same as the last one.
2189 * @start: byte offset within the RAMBlock
2190 * @length: length in bytes to discard
2192 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2194 trace_ram_discard_range(rbname, start, length);
2196 RCU_READ_LOCK_GUARD();
2197 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2199 if (!rb) {
2200 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2201 return -1;
2205 * On the source VM, we don't need to update the received bitmap since
2206 * we don't even have one.
2208 if (rb->receivedmap) {
2209 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2210 length >> qemu_target_page_bits());
2213 return ram_block_discard_range(rb, start, length);
2217 * For every allocation, we try not to crash the VM if the
2218 * allocation fails.
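 * The buffers are allocated in order (zero page, cache, encoded_buf,
 * current_buf); on failure the error path below unwinds whatever was
 * already allocated and returns -ENOMEM with the XBZRLE lock released.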
2220 static int xbzrle_init(void)
2222 Error *local_err = NULL;
2224 if (!migrate_use_xbzrle()) {
2225 return 0;
2228 XBZRLE_cache_lock();
2230 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2231 if (!XBZRLE.zero_target_page) {
2232 error_report("%s: Error allocating zero page", __func__);
2233 goto err_out;
2236 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2237 TARGET_PAGE_SIZE, &local_err);
2238 if (!XBZRLE.cache) {
2239 error_report_err(local_err);
2240 goto free_zero_page;
2243 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2244 if (!XBZRLE.encoded_buf) {
2245 error_report("%s: Error allocating encoded_buf", __func__);
2246 goto free_cache;
2249 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2250 if (!XBZRLE.current_buf) {
2251 error_report("%s: Error allocating current_buf", __func__);
2252 goto free_encoded_buf;
2255 /* We are all good */
2256 XBZRLE_cache_unlock();
2257 return 0;
2259 free_encoded_buf:
2260 g_free(XBZRLE.encoded_buf);
2261 XBZRLE.encoded_buf = NULL;
2262 free_cache:
2263 cache_fini(XBZRLE.cache);
2264 XBZRLE.cache = NULL;
2265 free_zero_page:
2266 g_free(XBZRLE.zero_target_page);
2267 XBZRLE.zero_target_page = NULL;
2268 err_out:
2269 XBZRLE_cache_unlock();
2270 return -ENOMEM;
2273 static int ram_state_init(RAMState **rsp)
2275 *rsp = g_try_new0(RAMState, 1);
2277 if (!*rsp) {
2278 error_report("%s: Init ramstate fail", __func__);
2279 return -1;
2282 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2283 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2284 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2287 * Count the total number of pages used by RAM blocks, not including any
2288 * gaps due to alignment or unplugs.
2289 * This must match the initial value of the dirty bitmap.
2291 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2292 ram_state_reset(*rsp);
2294 return 0;
2297 static void ram_list_init_bitmaps(void)
2299 MigrationState *ms = migrate_get_current();
2300 RAMBlock *block;
2301 unsigned long pages;
2302 uint8_t shift;
2304 /* Skip setting bitmap if there is no RAM */
2305 if (ram_bytes_total()) {
2306 shift = ms->clear_bitmap_shift;
2307 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2308 error_report("clear_bitmap_shift (%u) too big, using "
2309 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2310 shift = CLEAR_BITMAP_SHIFT_MAX;
2311 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2312 error_report("clear_bitmap_shift (%u) too small, using "
2313 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2314 shift = CLEAR_BITMAP_SHIFT_MIN;
2317 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2318 pages = block->max_length >> TARGET_PAGE_BITS;
2320 * The initial dirty bitmap for migration must be set to all
2321 * ones to make sure we'll migrate every guest RAM page to the
2322 * destination.
2323 * Here we set RAMBlock.bmap all to 1 because when restarting a
2324 * new migration after a failed one, ram_list.
2325 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2326 * guest memory.
2328 block->bmap = bitmap_new(pages);
2329 bitmap_set(block->bmap, 0, pages);
2330 block->clear_bmap_shift = shift;
2331 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2336 static void ram_init_bitmaps(RAMState *rs)
2338 /* For memory_global_dirty_log_start below. */
2339 qemu_mutex_lock_iothread();
2340 qemu_mutex_lock_ramlist();
2342 WITH_RCU_READ_LOCK_GUARD() {
2343 ram_list_init_bitmaps();
2344 memory_global_dirty_log_start();
2345 migration_bitmap_sync_precopy(rs);
2347 qemu_mutex_unlock_ramlist();
2348 qemu_mutex_unlock_iothread();
2351 static int ram_init_all(RAMState **rsp)
2353 if (ram_state_init(rsp)) {
2354 return -1;
2357 if (xbzrle_init()) {
2358 ram_state_cleanup(rsp);
2359 return -1;
2362 ram_init_bitmaps(*rsp);
2364 return 0;
2367 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2369 RAMBlock *block;
2370 uint64_t pages = 0;
2373 * Postcopy is not using xbzrle/compression, so no need for that.
2374 * Also, since the source is already halted, we don't need to care
2375 * about dirty page logging either.
2378 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2379 pages += bitmap_count_one(block->bmap,
2380 block->used_length >> TARGET_PAGE_BITS);
2383 /* This may not be aligned with current bitmaps. Recalculate. */
2384 rs->migration_dirty_pages = pages;
2386 rs->last_seen_block = NULL;
2387 rs->last_sent_block = NULL;
2388 rs->last_page = 0;
2389 rs->last_version = ram_list.version;
2391 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2392 * matter what we have sent.
2394 rs->ram_bulk_stage = false;
2396 /* Update RAMState cache of output QEMUFile */
2397 rs->f = out;
2399 trace_ram_state_resume_prepare(pages);
2403 * This function clears bits of the free pages reported by the caller from the
2404 * migration dirty bitmap. @addr is the host address corresponding to the
2405 * start of the contiguous guest free pages, and @len is the total size in
2406 * bytes of those pages.
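 *
 * A hint may span more than one RAMBlock; the loop below splits it at
 * RAMBlock boundaries and, for each piece, clears the corresponding bits
 * in block->bmap under bitmap_mutex while adjusting migration_dirty_pages.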
2408 void qemu_guest_free_page_hint(void *addr, size_t len)
2410 RAMBlock *block;
2411 ram_addr_t offset;
2412 size_t used_len, start, npages;
2413 MigrationState *s = migrate_get_current();
2415 /* This function is currently expected to be used during live migration */
2416 if (!migration_is_setup_or_active(s->state)) {
2417 return;
2420 for (; len > 0; len -= used_len, addr += used_len) {
2421 block = qemu_ram_block_from_host(addr, false, &offset);
2422 if (unlikely(!block || offset >= block->used_length)) {
2424 * The implementation might not support RAMBlock resize during
2425 * live migration, but it could happen in theory with future
2426 * updates. So we add a check here to capture that case.
2428 error_report_once("%s unexpected error", __func__);
2429 return;
2432 if (len <= block->used_length - offset) {
2433 used_len = len;
2434 } else {
2435 used_len = block->used_length - offset;
2438 start = offset >> TARGET_PAGE_BITS;
2439 npages = used_len >> TARGET_PAGE_BITS;
2441 qemu_mutex_lock(&ram_state->bitmap_mutex);
2442 ram_state->migration_dirty_pages -=
2443 bitmap_count_one_with_offset(block->bmap, start, npages);
2444 bitmap_clear(block->bmap, start, npages);
2445 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2450 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2451 * long-running RCU critical section. When RCU reclaims in the code
2452 * start to become numerous, it will be necessary to reduce the
2453 * granularity of these critical sections.
2457 * ram_save_setup: Setup RAM for migration
2459 * Returns zero to indicate success and negative for error
2461 * @f: QEMUFile where to send the data
2462 * @opaque: RAMState pointer
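 *
 * The setup section written below is, roughly: a be64 word holding the
 * total RAM size with RAM_SAVE_FLAG_MEM_SIZE set, then for each migratable
 * block its idstr length byte, the idstr itself, its used_length, an
 * optional page_size (postcopy with non-host-sized pages) and, when
 * ignore-shared is enabled, the block's memory region address, all
 * terminated by RAM_SAVE_FLAG_EOS.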
2464 static int ram_save_setup(QEMUFile *f, void *opaque)
2466 RAMState **rsp = opaque;
2467 RAMBlock *block;
2469 if (compress_threads_save_setup()) {
2470 return -1;
2473 /* migration has already setup the bitmap, reuse it. */
2474 if (!migration_in_colo_state()) {
2475 if (ram_init_all(rsp) != 0) {
2476 compress_threads_save_cleanup();
2477 return -1;
2480 (*rsp)->f = f;
2482 WITH_RCU_READ_LOCK_GUARD() {
2483 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2485 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2486 qemu_put_byte(f, strlen(block->idstr));
2487 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2488 qemu_put_be64(f, block->used_length);
2489 if (migrate_postcopy_ram() && block->page_size !=
2490 qemu_host_page_size) {
2491 qemu_put_be64(f, block->page_size);
2493 if (migrate_ignore_shared()) {
2494 qemu_put_be64(f, block->mr->addr);
2499 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2500 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2502 multifd_send_sync_main(f);
2503 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2504 qemu_fflush(f);
2506 return 0;
2510 * ram_save_iterate: iterative stage for migration
2512 * Returns zero to indicate success and negative for error
2514 * @f: QEMUFile where to send the data
2515 * @opaque: RAMState pointer
2517 static int ram_save_iterate(QEMUFile *f, void *opaque)
2519 RAMState **temp = opaque;
2520 RAMState *rs = *temp;
2521 int ret = 0;
2522 int i;
2523 int64_t t0;
2524 int done = 0;
2526 if (blk_mig_bulk_active()) {
2527 /* Avoid transferring ram during bulk phase of block migration as
2528 * the bulk phase will usually take a long time and transferring
2529 * ram updates during that time is pointless. */
2530 goto out;
2533 WITH_RCU_READ_LOCK_GUARD() {
2534 if (ram_list.version != rs->last_version) {
2535 ram_state_reset(rs);
2538 /* Read version before ram_list.blocks */
2539 smp_rmb();
2541 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2543 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2544 i = 0;
2545 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2546 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2547 int pages;
2549 if (qemu_file_get_error(f)) {
2550 break;
2553 pages = ram_find_and_save_block(rs, false);
2554 /* no more pages to send */
2555 if (pages == 0) {
2556 done = 1;
2557 break;
2560 if (pages < 0) {
2561 qemu_file_set_error(f, pages);
2562 break;
2565 rs->target_page_count += pages;
2568 * During postcopy, it is necessary to make sure one whole host
2569 * page is sent in one chunk.
2571 if (migrate_postcopy_ram()) {
2572 flush_compressed_data(rs);
2576 * We want to check in the 1st loop, just in case it was the 1st
2577 * time and we had to sync the dirty bitmap.
2578 * qemu_clock_get_ns() is a bit expensive, so we only check every
2579 * 64 iterations.
2581 if ((i & 63) == 0) {
2582 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2583 1000000;
2584 if (t1 > MAX_WAIT) {
2585 trace_ram_save_iterate_big_wait(t1, i);
2586 break;
2589 i++;
2594 * Must occur before EOS (or any QEMUFile operation)
2595 * because of RDMA protocol.
2597 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2599 out:
2600 if (ret >= 0
2601 && migration_is_setup_or_active(migrate_get_current()->state)) {
2602 multifd_send_sync_main(rs->f);
2603 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2604 qemu_fflush(f);
2605 ram_counters.transferred += 8;
2607 ret = qemu_file_get_error(f);
2609 if (ret < 0) {
2610 return ret;
2613 return done;
2617 * ram_save_complete: function called to send the remaining amount of ram
2619 * Returns zero to indicate success or negative on error
2621 * Called with the iothread lock held
2623 * @f: QEMUFile where to send the data
2624 * @opaque: RAMState pointer
2626 static int ram_save_complete(QEMUFile *f, void *opaque)
2628 RAMState **temp = opaque;
2629 RAMState *rs = *temp;
2630 int ret = 0;
2632 WITH_RCU_READ_LOCK_GUARD() {
2633 if (!migration_in_postcopy()) {
2634 migration_bitmap_sync_precopy(rs);
2637 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2639 /* try transferring iterative blocks of memory */
2641 /* flush all remaining blocks regardless of rate limiting */
2642 while (true) {
2643 int pages;
2645 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2647 /* no more blocks to send */
2647 if (pages == 0) {
2648 break;
2650 if (pages < 0) {
2651 ret = pages;
2652 break;
2656 flush_compressed_data(rs);
2657 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2660 if (ret >= 0) {
2661 multifd_send_sync_main(rs->f);
2662 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2663 qemu_fflush(f);
2666 return ret;
2669 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2670 uint64_t *res_precopy_only,
2671 uint64_t *res_compatible,
2672 uint64_t *res_postcopy_only)
2674 RAMState **temp = opaque;
2675 RAMState *rs = *temp;
2676 uint64_t remaining_size;
2678 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2680 if (!migration_in_postcopy() &&
2681 remaining_size < max_size) {
2682 qemu_mutex_lock_iothread();
2683 WITH_RCU_READ_LOCK_GUARD() {
2684 migration_bitmap_sync_precopy(rs);
2686 qemu_mutex_unlock_iothread();
2687 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2690 if (migrate_postcopy_ram()) {
2691 /* We can do postcopy, and all the data is postcopiable */
2692 *res_compatible += remaining_size;
2693 } else {
2694 *res_precopy_only += remaining_size;
2698 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2700 unsigned int xh_len;
2701 int xh_flags;
2702 uint8_t *loaded_data;
2704 /* extract RLE header */
2705 xh_flags = qemu_get_byte(f);
2706 xh_len = qemu_get_be16(f);
2708 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2709 error_report("Failed to load XBZRLE page - wrong compression!");
2710 return -1;
2713 if (xh_len > TARGET_PAGE_SIZE) {
2714 error_report("Failed to load XBZRLE page - len overflow!");
2715 return -1;
2717 loaded_data = XBZRLE.decoded_buf;
2718 /* load data and decode */
2719 /* it can change loaded_data to point to an internal buffer */
2720 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2722 /* decode RLE */
2723 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2724 TARGET_PAGE_SIZE) == -1) {
2725 error_report("Failed to load XBZRLE page - decode error!");
2726 return -1;
2729 return 0;
2733 * ram_block_from_stream: read a RAMBlock id from the migration stream
2735 * Must be called from within an RCU critical section.
2737 * Returns a pointer from within the RCU-protected ram_list.
2739 * @f: QEMUFile where to read the data from
2740 * @flags: Page flags (mostly to see if it's a continuation of previous block)
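 *
 * When RAM_SAVE_FLAG_CONTINUE is set, no block id is present in the stream
 * and the block cached in the static variable from the previous call is
 * reused; otherwise the id string is read and looked up by name.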
2742 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2744 static RAMBlock *block = NULL;
2745 char id[256];
2746 uint8_t len;
2748 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2749 if (!block) {
2750 error_report("Ack, bad migration stream!");
2751 return NULL;
2753 return block;
2756 len = qemu_get_byte(f);
2757 qemu_get_buffer(f, (uint8_t *)id, len);
2758 id[len] = 0;
2760 block = qemu_ram_block_by_name(id);
2761 if (!block) {
2762 error_report("Can't find block %s", id);
2763 return NULL;
2766 if (ramblock_is_ignored(block)) {
2767 error_report("block %s should not be migrated !", id);
2768 return NULL;
2771 return block;
2774 static inline void *host_from_ram_block_offset(RAMBlock *block,
2775 ram_addr_t offset)
2777 if (!offset_in_ramblock(block, offset)) {
2778 return NULL;
2781 return block->host + offset;
2784 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2785 ram_addr_t offset, bool record_bitmap)
2787 if (!offset_in_ramblock(block, offset)) {
2788 return NULL;
2790 if (!block->colo_cache) {
2791 error_report("%s: colo_cache is NULL in block :%s",
2792 __func__, block->idstr);
2793 return NULL;
2797 * During a COLO checkpoint, we need a bitmap of these migrated pages.
2798 * It helps us decide which pages in the RAM cache should be flushed
2799 * into the VM's RAM later.
2801 if (record_bitmap &&
2802 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2803 ram_state->migration_dirty_pages++;
2805 return block->colo_cache + offset;
2809 * ram_handle_compressed: handle the zero page case
2811 * If a page (or a whole RDMA chunk) has been
2812 * determined to be zero, then zap it.
2814 * @host: host address for the zero page
2815 * @ch: the byte the page is filled with; we only support zero
2816 * @size: size of the zero page
2818 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2820 if (ch != 0 || !is_zero_range(host, size)) {
2821 memset(host, ch, size);
2825 /* return the size after decompression, or negative value on error */
2826 static int
2827 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2828 const uint8_t *source, size_t source_len)
2830 int err;
2832 err = inflateReset(stream);
2833 if (err != Z_OK) {
2834 return -1;
2837 stream->avail_in = source_len;
2838 stream->next_in = (uint8_t *)source;
2839 stream->avail_out = dest_len;
2840 stream->next_out = dest;
2842 err = inflate(stream, Z_NO_FLUSH);
2843 if (err != Z_STREAM_END) {
2844 return -1;
2847 return stream->total_out;
2850 static void *do_data_decompress(void *opaque)
2852 DecompressParam *param = opaque;
2853 unsigned long pagesize;
2854 uint8_t *des;
2855 int len, ret;
2857 qemu_mutex_lock(&param->mutex);
2858 while (!param->quit) {
2859 if (param->des) {
2860 des = param->des;
2861 len = param->len;
2862 param->des = 0;
2863 qemu_mutex_unlock(&param->mutex);
2865 pagesize = TARGET_PAGE_SIZE;
2867 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2868 param->compbuf, len);
2869 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2870 error_report("decompress data failed");
2871 qemu_file_set_error(decomp_file, ret);
2874 qemu_mutex_lock(&decomp_done_lock);
2875 param->done = true;
2876 qemu_cond_signal(&decomp_done_cond);
2877 qemu_mutex_unlock(&decomp_done_lock);
2879 qemu_mutex_lock(&param->mutex);
2880 } else {
2881 qemu_cond_wait(&param->cond, &param->mutex);
2884 qemu_mutex_unlock(&param->mutex);
2886 return NULL;
2889 static int wait_for_decompress_done(void)
2891 int idx, thread_count;
2893 if (!migrate_use_compression()) {
2894 return 0;
2897 thread_count = migrate_decompress_threads();
2898 qemu_mutex_lock(&decomp_done_lock);
2899 for (idx = 0; idx < thread_count; idx++) {
2900 while (!decomp_param[idx].done) {
2901 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2904 qemu_mutex_unlock(&decomp_done_lock);
2905 return qemu_file_get_error(decomp_file);
2908 static void compress_threads_load_cleanup(void)
2910 int i, thread_count;
2912 if (!migrate_use_compression()) {
2913 return;
2915 thread_count = migrate_decompress_threads();
2916 for (i = 0; i < thread_count; i++) {
2918 * we use it as an indicator of whether the thread is
2919 * properly initialized or not
2921 if (!decomp_param[i].compbuf) {
2922 break;
2925 qemu_mutex_lock(&decomp_param[i].mutex);
2926 decomp_param[i].quit = true;
2927 qemu_cond_signal(&decomp_param[i].cond);
2928 qemu_mutex_unlock(&decomp_param[i].mutex);
2930 for (i = 0; i < thread_count; i++) {
2931 if (!decomp_param[i].compbuf) {
2932 break;
2935 qemu_thread_join(decompress_threads + i);
2936 qemu_mutex_destroy(&decomp_param[i].mutex);
2937 qemu_cond_destroy(&decomp_param[i].cond);
2938 inflateEnd(&decomp_param[i].stream);
2939 g_free(decomp_param[i].compbuf);
2940 decomp_param[i].compbuf = NULL;
2942 g_free(decompress_threads);
2943 g_free(decomp_param);
2944 decompress_threads = NULL;
2945 decomp_param = NULL;
2946 decomp_file = NULL;
2949 static int compress_threads_load_setup(QEMUFile *f)
2951 int i, thread_count;
2953 if (!migrate_use_compression()) {
2954 return 0;
2957 thread_count = migrate_decompress_threads();
2958 decompress_threads = g_new0(QemuThread, thread_count);
2959 decomp_param = g_new0(DecompressParam, thread_count);
2960 qemu_mutex_init(&decomp_done_lock);
2961 qemu_cond_init(&decomp_done_cond);
2962 decomp_file = f;
2963 for (i = 0; i < thread_count; i++) {
2964 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2965 goto exit;
2968 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2969 qemu_mutex_init(&decomp_param[i].mutex);
2970 qemu_cond_init(&decomp_param[i].cond);
2971 decomp_param[i].done = true;
2972 decomp_param[i].quit = false;
2973 qemu_thread_create(decompress_threads + i, "decompress",
2974 do_data_decompress, decomp_param + i,
2975 QEMU_THREAD_JOINABLE);
2977 return 0;
2978 exit:
2979 compress_threads_load_cleanup();
2980 return -1;
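/*
 * Hand one compressed page to an idle decompression thread: pick the first
 * worker whose "done" flag is set, copy the compressed bytes into its
 * compbuf and wake it; if every worker is busy, wait on decomp_done_cond
 * until one finishes.
 */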
2983 static void decompress_data_with_multi_threads(QEMUFile *f,
2984 void *host, int len)
2986 int idx, thread_count;
2988 thread_count = migrate_decompress_threads();
2989 qemu_mutex_lock(&decomp_done_lock);
2990 while (true) {
2991 for (idx = 0; idx < thread_count; idx++) {
2992 if (decomp_param[idx].done) {
2993 decomp_param[idx].done = false;
2994 qemu_mutex_lock(&decomp_param[idx].mutex);
2995 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2996 decomp_param[idx].des = host;
2997 decomp_param[idx].len = len;
2998 qemu_cond_signal(&decomp_param[idx].cond);
2999 qemu_mutex_unlock(&decomp_param[idx].mutex);
3000 break;
3003 if (idx < thread_count) {
3004 break;
3005 } else {
3006 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3009 qemu_mutex_unlock(&decomp_done_lock);
3013 * COLO cache: this is for the secondary VM; we cache the whole
3014 * memory of the secondary VM. The global lock must be held
3015 * when calling this helper.
3017 int colo_init_ram_cache(void)
3019 RAMBlock *block;
3021 WITH_RCU_READ_LOCK_GUARD() {
3022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3023 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3024 NULL,
3025 false);
3026 if (!block->colo_cache) {
3027 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3028 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3029 block->used_length);
3030 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031 if (block->colo_cache) {
3032 qemu_anon_ram_free(block->colo_cache, block->used_length);
3033 block->colo_cache = NULL;
3036 return -errno;
3042 * Record the dirty pages sent by the PVM; this dirty bitmap is used
3043 * to decide which pages in the cache should be flushed into the SVM's RAM.
3044 * Here we use the same name 'ram_bitmap' as for migration.
3046 if (ram_bytes_total()) {
3047 RAMBlock *block;
3049 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3050 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3051 block->bmap = bitmap_new(pages);
3055 ram_state_init(&ram_state);
3056 return 0;
3059 /* TODO: duplicated with ram_init_bitmaps */
3060 void colo_incoming_start_dirty_log(void)
3062 RAMBlock *block = NULL;
3063 /* For memory_global_dirty_log_start below. */
3064 qemu_mutex_lock_iothread();
3065 qemu_mutex_lock_ramlist();
3067 memory_global_dirty_log_sync();
3068 WITH_RCU_READ_LOCK_GUARD() {
3069 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3070 ramblock_sync_dirty_bitmap(ram_state, block);
3071 /* Discard this dirty bitmap record */
3072 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3074 memory_global_dirty_log_start();
3076 ram_state->migration_dirty_pages = 0;
3077 qemu_mutex_unlock_ramlist();
3078 qemu_mutex_unlock_iothread();
3082 /* The global lock must be held when calling this helper */
3082 void colo_release_ram_cache(void)
3084 RAMBlock *block;
3086 memory_global_dirty_log_stop();
3087 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3088 g_free(block->bmap);
3089 block->bmap = NULL;
3092 WITH_RCU_READ_LOCK_GUARD() {
3093 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3094 if (block->colo_cache) {
3095 qemu_anon_ram_free(block->colo_cache, block->used_length);
3096 block->colo_cache = NULL;
3100 ram_state_cleanup(&ram_state);
3104 * ram_load_setup: Setup RAM for migration incoming side
3106 * Returns zero to indicate success and negative for error
3108 * @f: QEMUFile where to receive the data
3109 * @opaque: RAMState pointer
3111 static int ram_load_setup(QEMUFile *f, void *opaque)
3113 if (compress_threads_load_setup(f)) {
3114 return -1;
3117 xbzrle_load_setup();
3118 ramblock_recv_map_init();
3120 return 0;
3123 static int ram_load_cleanup(void *opaque)
3125 RAMBlock *rb;
3127 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3128 qemu_ram_block_writeback(rb);
3131 xbzrle_load_cleanup();
3132 compress_threads_load_cleanup();
3134 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3135 g_free(rb->receivedmap);
3136 rb->receivedmap = NULL;
3139 return 0;
3143 * ram_postcopy_incoming_init: allocate postcopy data structures
3145 * Returns 0 for success and negative if there was one error
3147 * @mis: current migration incoming state
3149 * Allocate the data structures etc. needed by incoming migration with
3150 * postcopy-ram. postcopy-ram's similarly named
3151 * postcopy_ram_incoming_init does the work.
3153 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3155 return postcopy_ram_incoming_init(mis);
3159 * ram_load_postcopy: load a page in postcopy case
3161 * Returns 0 for success or -errno in case of error
3163 * Called in postcopy mode by ram_load().
3164 * rcu_read_lock is taken prior to this being called.
3166 * @f: QEMUFile to receive the data from
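 *
 * Target pages are accumulated in mis->postcopy_tmp_page until a whole
 * host page has been received; only then is the page placed atomically
 * into the guest via postcopy_place_page() (or postcopy_place_page_zero()
 * when every byte was zero).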
3168 static int ram_load_postcopy(QEMUFile *f)
3170 int flags = 0, ret = 0;
3171 bool place_needed = false;
3172 bool matches_target_page_size = false;
3173 MigrationIncomingState *mis = migration_incoming_get_current();
3174 /* Temporary page that is later 'placed' */
3175 void *postcopy_host_page = mis->postcopy_tmp_page;
3176 void *this_host = NULL;
3177 bool all_zero = true;
3178 int target_pages = 0;
3180 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3181 ram_addr_t addr;
3182 void *host = NULL;
3183 void *page_buffer = NULL;
3184 void *place_source = NULL;
3185 RAMBlock *block = NULL;
3186 uint8_t ch;
3187 int len;
3189 addr = qemu_get_be64(f);
3192 * If there is a QEMU file error, we should stop here; "addr"
3193 * may be invalid.
3195 ret = qemu_file_get_error(f);
3196 if (ret) {
3197 break;
3200 flags = addr & ~TARGET_PAGE_MASK;
3201 addr &= TARGET_PAGE_MASK;
3203 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3204 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3205 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3206 block = ram_block_from_stream(f, flags);
3208 host = host_from_ram_block_offset(block, addr);
3209 if (!host) {
3210 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3211 ret = -EINVAL;
3212 break;
3214 target_pages++;
3215 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3217 * Postcopy requires that we place whole host pages atomically;
3218 * these may be huge pages for RAMBlocks that are backed by
3219 * hugetlbfs.
3220 * To make it atomic, the data is read into a temporary page
3221 * that's moved into place later.
3222 * The migration protocol uses, possibly smaller, target pages;
3223 * however, the source ensures it always sends all the components
3224 * of a host page in one chunk.
3226 page_buffer = postcopy_host_page +
3227 ((uintptr_t)host & (block->page_size - 1));
3228 if (target_pages == 1) {
3229 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3230 block->page_size);
3231 } else {
3232 /* not the 1st TP within the HP */
3233 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3234 (uintptr_t)this_host) {
3235 error_report("Non-same host page %p/%p",
3236 host, this_host);
3237 ret = -EINVAL;
3238 break;
3243 * If it's the last part of a host page then we place the host
3244 * page
3246 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3247 place_needed = true;
3249 place_source = postcopy_host_page;
3252 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3253 case RAM_SAVE_FLAG_ZERO:
3254 ch = qemu_get_byte(f);
3256 * We can skip setting page_buffer when
3257 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3259 if (ch || !matches_target_page_size) {
3260 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3262 if (ch) {
3263 all_zero = false;
3265 break;
3267 case RAM_SAVE_FLAG_PAGE:
3268 all_zero = false;
3269 if (!matches_target_page_size) {
3270 /* For huge pages, we always use temporary buffer */
3271 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3272 } else {
3274 * For small pages that match the target page size, we
3275 * avoid the qemu_file copy. Instead we directly use
3276 * the buffer of QEMUFile to place the page. Note: we
3277 * cannot do any QEMUFile operation before using that
3278 * buffer to make sure the buffer is valid when
3279 * placing the page.
3281 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3282 TARGET_PAGE_SIZE);
3284 break;
3285 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3286 all_zero = false;
3287 len = qemu_get_be32(f);
3288 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3289 error_report("Invalid compressed data length: %d", len);
3290 ret = -EINVAL;
3291 break;
3293 decompress_data_with_multi_threads(f, page_buffer, len);
3294 break;
3296 case RAM_SAVE_FLAG_EOS:
3297 /* normal exit */
3298 multifd_recv_sync_main();
3299 break;
3300 default:
3301 error_report("Unknown combination of migration flags: %#x"
3302 " (postcopy mode)", flags);
3303 ret = -EINVAL;
3304 break;
3307 /* Got the whole host page, wait for decompress before placing. */
3308 if (place_needed) {
3309 ret |= wait_for_decompress_done();
3312 /* Detect for any possible file errors */
3313 if (!ret && qemu_file_get_error(f)) {
3314 ret = qemu_file_get_error(f);
3317 if (!ret && place_needed) {
3318 /* This gets called at the last target page in the host page */
3319 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3320 block->page_size);
3322 if (all_zero) {
3323 ret = postcopy_place_page_zero(mis, place_dest,
3324 block);
3325 } else {
3326 ret = postcopy_place_page(mis, place_dest,
3327 place_source, block);
3329 place_needed = false;
3330 target_pages = 0;
3331 /* Assume we have a zero page until we detect something different */
3332 all_zero = true;
3336 return ret;
3339 static bool postcopy_is_advised(void)
3341 PostcopyState ps = postcopy_state_get();
3342 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3345 static bool postcopy_is_running(void)
3347 PostcopyState ps = postcopy_state_get();
3348 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3352 * Flush the contents of the RAM cache into the SVM's memory.
3353 * Only flush the pages that have been dirtied by the PVM or SVM or both.
3355 void colo_flush_ram_cache(void)
3357 RAMBlock *block = NULL;
3358 void *dst_host;
3359 void *src_host;
3360 unsigned long offset = 0;
3362 memory_global_dirty_log_sync();
3363 WITH_RCU_READ_LOCK_GUARD() {
3364 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3365 ramblock_sync_dirty_bitmap(ram_state, block);
3369 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3370 WITH_RCU_READ_LOCK_GUARD() {
3371 block = QLIST_FIRST_RCU(&ram_list.blocks);
3373 while (block) {
3374 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3376 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3377 >= block->used_length) {
3378 offset = 0;
3379 block = QLIST_NEXT_RCU(block, next);
3380 } else {
3381 migration_bitmap_clear_dirty(ram_state, block, offset);
3382 dst_host = block->host
3383 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3384 src_host = block->colo_cache
3385 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3386 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3390 trace_colo_flush_ram_cache_end();
3394 * ram_load_precopy: load pages in precopy case
3396 * Returns 0 for success or -errno in case of error
3398 * Called in precopy mode by ram_load().
3399 * rcu_read_lock is taken prior to this being called.
3401 * @f: QEMUFile to receive the data from
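 *
 * Each record in the stream starts with a be64 word whose low bits are the
 * RAM_SAVE_FLAG_* flags and whose remaining bits are the page address; the
 * flags then select zero-page, raw page, compressed page, XBZRLE or
 * block-list handling, until RAM_SAVE_FLAG_EOS ends the section.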
3403 static int ram_load_precopy(QEMUFile *f)
3405 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3406 /* ADVISE is earlier; it shows that the source has the postcopy capability enabled */
3407 bool postcopy_advised = postcopy_is_advised();
3408 if (!migrate_use_compression()) {
3409 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3412 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3413 ram_addr_t addr, total_ram_bytes;
3414 void *host = NULL, *host_bak = NULL;
3415 uint8_t ch;
3418 * Yield periodically to let the main loop run, but an iteration of
3419 * the main loop is expensive, so only do it once every 32768 iterations.
3421 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3422 aio_co_schedule(qemu_get_current_aio_context(),
3423 qemu_coroutine_self());
3424 qemu_coroutine_yield();
3426 i++;
3428 addr = qemu_get_be64(f);
3429 flags = addr & ~TARGET_PAGE_MASK;
3430 addr &= TARGET_PAGE_MASK;
3432 if (flags & invalid_flags) {
3433 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3434 error_report("Received an unexpected compressed page");
3437 ret = -EINVAL;
3438 break;
3441 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3442 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3443 RAMBlock *block = ram_block_from_stream(f, flags);
3445 host = host_from_ram_block_offset(block, addr);
3447 * After entering the COLO stage, we should not load pages
3448 * into the SVM's memory directly; we put them into colo_cache first.
3449 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3450 * Previously, we copied all this memory in the COLO preparing stage
3451 * while the VM had to be stopped, which is a time-consuming process.
3452 * Here we optimize it with a trick: back up every page during the
3453 * migration process while COLO is enabled. Although this affects the
3454 * speed of the migration, it clearly reduces the downtime of
3455 * backing up all the SVM's memory in the COLO preparing stage.
3457 if (migration_incoming_colo_enabled()) {
3458 if (migration_incoming_in_colo_state()) {
3459 /* In COLO stage, put all pages into cache temporarily */
3460 host = colo_cache_from_block_offset(block, addr, true);
3461 } else {
3463 * In the migration stage but before the COLO stage,
3464 * put all pages into both the cache and the SVM's memory.
3466 host_bak = colo_cache_from_block_offset(block, addr, false);
3469 if (!host) {
3470 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3471 ret = -EINVAL;
3472 break;
3474 if (!migration_incoming_in_colo_state()) {
3475 ramblock_recv_bitmap_set(block, host);
3478 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3481 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3482 case RAM_SAVE_FLAG_MEM_SIZE:
3483 /* Synchronize RAM block list */
3484 total_ram_bytes = addr;
3485 while (!ret && total_ram_bytes) {
3486 RAMBlock *block;
3487 char id[256];
3488 ram_addr_t length;
3490 len = qemu_get_byte(f);
3491 qemu_get_buffer(f, (uint8_t *)id, len);
3492 id[len] = 0;
3493 length = qemu_get_be64(f);
3495 block = qemu_ram_block_by_name(id);
3496 if (block && !qemu_ram_is_migratable(block)) {
3497 error_report("block %s should not be migrated !", id);
3498 ret = -EINVAL;
3499 } else if (block) {
3500 if (length != block->used_length) {
3501 Error *local_err = NULL;
3503 ret = qemu_ram_resize(block, length,
3504 &local_err);
3505 if (local_err) {
3506 error_report_err(local_err);
3509 /* For postcopy we need to check hugepage sizes match */
3510 if (postcopy_advised &&
3511 block->page_size != qemu_host_page_size) {
3512 uint64_t remote_page_size = qemu_get_be64(f);
3513 if (remote_page_size != block->page_size) {
3514 error_report("Mismatched RAM page size %s "
3515 "(local) %zd != %" PRId64,
3516 id, block->page_size,
3517 remote_page_size);
3518 ret = -EINVAL;
3521 if (migrate_ignore_shared()) {
3522 hwaddr addr = qemu_get_be64(f);
3523 if (ramblock_is_ignored(block) &&
3524 block->mr->addr != addr) {
3525 error_report("Mismatched GPAs for block %s "
3526 "%" PRId64 "!= %" PRId64,
3527 id, (uint64_t)addr,
3528 (uint64_t)block->mr->addr);
3529 ret = -EINVAL;
3532 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3533 block->idstr);
3534 } else {
3535 error_report("Unknown ramblock \"%s\", cannot "
3536 "accept migration", id);
3537 ret = -EINVAL;
3540 total_ram_bytes -= length;
3542 break;
3544 case RAM_SAVE_FLAG_ZERO:
3545 ch = qemu_get_byte(f);
3546 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3547 break;
3549 case RAM_SAVE_FLAG_PAGE:
3550 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3551 break;
3553 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3554 len = qemu_get_be32(f);
3555 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3556 error_report("Invalid compressed data length: %d", len);
3557 ret = -EINVAL;
3558 break;
3560 decompress_data_with_multi_threads(f, host, len);
3561 break;
3563 case RAM_SAVE_FLAG_XBZRLE:
3564 if (load_xbzrle(f, addr, host) < 0) {
3565 error_report("Failed to decompress XBZRLE page at "
3566 RAM_ADDR_FMT, addr);
3567 ret = -EINVAL;
3568 break;
3570 break;
3571 case RAM_SAVE_FLAG_EOS:
3572 /* normal exit */
3573 multifd_recv_sync_main();
3574 break;
3575 default:
3576 if (flags & RAM_SAVE_FLAG_HOOK) {
3577 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3578 } else {
3579 error_report("Unknown combination of migration flags: %#x",
3580 flags);
3581 ret = -EINVAL;
3584 if (!ret) {
3585 ret = qemu_file_get_error(f);
3587 if (!ret && host_bak) {
3588 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3592 ret |= wait_for_decompress_done();
3593 return ret;
3596 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3598 int ret = 0;
3599 static uint64_t seq_iter;
3601 * If the system is running in postcopy mode, page inserts into host memory
3602 * must be atomic.
3604 bool postcopy_running = postcopy_is_running();
3606 seq_iter++;
3608 if (version_id != 4) {
3609 return -EINVAL;
3613 * This RCU critical section can be very long running.
3614 * When RCU reclaims in the code start to become numerous,
3615 * it will be necessary to reduce the granularity of this
3616 * critical section.
3618 WITH_RCU_READ_LOCK_GUARD() {
3619 if (postcopy_running) {
3620 ret = ram_load_postcopy(f);
3621 } else {
3622 ret = ram_load_precopy(f);
3625 trace_ram_load_complete(ret, seq_iter);
3627 return ret;
3630 static bool ram_has_postcopy(void *opaque)
3632 RAMBlock *rb;
3633 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3634 if (ramblock_is_pmem(rb)) {
3635 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3636 "is not supported now!", rb->idstr, rb->host);
3637 return false;
3641 return migrate_postcopy_ram();
3644 /* Sync all the dirty bitmaps with the destination VM. */
3645 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3647 RAMBlock *block;
3648 QEMUFile *file = s->to_dst_file;
3649 int ramblock_count = 0;
3651 trace_ram_dirty_bitmap_sync_start();
3653 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3654 qemu_savevm_send_recv_bitmap(file, block->idstr);
3655 trace_ram_dirty_bitmap_request(block->idstr);
3656 ramblock_count++;
3659 trace_ram_dirty_bitmap_sync_wait();
3661 /* Wait until all the ramblocks' dirty bitmaps are synced */
3662 while (ramblock_count--) {
3663 qemu_sem_wait(&s->rp_state.rp_sem);
3666 trace_ram_dirty_bitmap_sync_complete();
3668 return 0;
3671 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3673 qemu_sem_post(&s->rp_state.rp_sem);
3677 * Read the received bitmap and invert it to use as the initial dirty bitmap.
3678 * This is only used when a postcopy migration is paused and then
3679 * resumed from an intermediate point.
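 *
 * The expected on-wire layout (see ramblock_recv_bitmap_send()) is: a be64
 * bitmap size, the little-endian bitmap itself padded to 8 bytes, and a
 * be64 end mark that must equal RAMBLOCK_RECV_BITMAP_ENDING.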
3681 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3683 int ret = -EINVAL;
3684 QEMUFile *file = s->rp_state.from_dst_file;
3685 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3686 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3687 uint64_t size, end_mark;
3689 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3691 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3692 error_report("%s: incorrect state %s", __func__,
3693 MigrationStatus_str(s->state));
3694 return -EINVAL;
3698 * Note: see comments in ramblock_recv_bitmap_send() on why we
3699 * need the endianness conversion, and the paddings.
3701 local_size = ROUND_UP(local_size, 8);
3703 /* Add paddings */
3704 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3706 size = qemu_get_be64(file);
3708 /* The size of the bitmap should match with our ramblock */
3709 if (size != local_size) {
3710 error_report("%s: ramblock '%s' bitmap size mismatch "
3711 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3712 block->idstr, size, local_size);
3713 ret = -EINVAL;
3714 goto out;
3717 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3718 end_mark = qemu_get_be64(file);
3720 ret = qemu_file_get_error(file);
3721 if (ret || size != local_size) {
3722 error_report("%s: read bitmap failed for ramblock '%s': %d"
3723 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3724 __func__, block->idstr, ret, local_size, size);
3725 ret = -EIO;
3726 goto out;
3729 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3730 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3731 __func__, block->idstr, end_mark);
3732 ret = -EINVAL;
3733 goto out;
3737 * Endianness conversion. We are during postcopy (though paused).
3738 * The dirty bitmap won't change. We can directly modify it.
3740 bitmap_from_le(block->bmap, le_bitmap, nbits);
3743 * What we received is the "received bitmap". Invert it to form the initial
3744 * dirty bitmap for this ramblock.
3746 bitmap_complement(block->bmap, block->bmap, nbits);
3748 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3751 * We succeeded in syncing the bitmap for the current ramblock. If this is
3752 * the last one to sync, we need to notify the main send thread.
3754 ram_dirty_bitmap_reload_notify(s);
3756 ret = 0;
3757 out:
3758 g_free(le_bitmap);
3759 return ret;
3762 static int ram_resume_prepare(MigrationState *s, void *opaque)
3764 RAMState *rs = *(RAMState **)opaque;
3765 int ret;
3767 ret = ram_dirty_bitmap_sync_all(s, rs);
3768 if (ret) {
3769 return ret;
3772 ram_state_resume_prepare(rs, s->to_dst_file);
3774 return 0;
3777 static SaveVMHandlers savevm_ram_handlers = {
3778 .save_setup = ram_save_setup,
3779 .save_live_iterate = ram_save_iterate,
3780 .save_live_complete_postcopy = ram_save_complete,
3781 .save_live_complete_precopy = ram_save_complete,
3782 .has_postcopy = ram_has_postcopy,
3783 .save_live_pending = ram_save_pending,
3784 .load_state = ram_load,
3785 .save_cleanup = ram_save_cleanup,
3786 .load_setup = ram_load_setup,
3787 .load_cleanup = ram_load_cleanup,
3788 .resume_prepare = ram_resume_prepare,
3791 void ram_mig_init(void)
3793 qemu_mutex_init(&XBZRLE.lock);
3794 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);