4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
53 #include "sysemu/cpu-throttle.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
84 XBZRLECacheStats xbzrle_counters
;
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
89 /* buffer used for XBZRLE encoding */
91 /* buffer for storing page content */
93 /* Cache for XBZRLE, Protected by lock. */
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page
;
98 /* buffer used for XBZRLE decoding */
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE
.lock
);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE
.lock
);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from migrate_params_apply in main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
129 int xbzrle_cache_resize(uint64_t new_size
, Error
**errp
)
131 PageCache
*new_cache
;
134 /* Check for truncation */
135 if (new_size
!= (size_t)new_size
) {
136 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cache size",
137 "exceeding address space");
141 if (new_size
== migrate_xbzrle_cache_size()) {
148 if (XBZRLE
.cache
!= NULL
) {
149 new_cache
= cache_init(new_size
, TARGET_PAGE_SIZE
, errp
);
155 cache_fini(XBZRLE
.cache
);
156 XBZRLE
.cache
= new_cache
;
159 XBZRLE_cache_unlock();
163 bool ramblock_is_ignored(RAMBlock
*block
)
165 return !qemu_ram_is_migratable(block
) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block
));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func
, void *opaque
)
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
179 ret
= func(block
, opaque
);
187 static void ramblock_recv_map_init(void)
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
192 assert(!rb
->receivedmap
);
193 rb
->receivedmap
= bitmap_new(rb
->max_length
>> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock
*rb
, void *host_addr
)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr
, rb
),
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock
*rb
, uint64_t byte_offset
)
205 return test_bit(byte_offset
>> TARGET_PAGE_BITS
, rb
->receivedmap
);
208 void ramblock_recv_bitmap_set(RAMBlock
*rb
, void *host_addr
)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr
, rb
), rb
->receivedmap
);
213 void ramblock_recv_bitmap_set_range(RAMBlock
*rb
, void *host_addr
,
216 bitmap_set_atomic(rb
->receivedmap
,
217 ramblock_recv_bitmap_offset(host_addr
, rb
),
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 if success with sent bytes, or <0 if error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile
*file
,
229 const char *block_name
)
231 RAMBlock
*block
= qemu_ram_block_by_name(block_name
);
232 unsigned long *le_bitmap
, nbits
;
236 error_report("%s: invalid block name: %s", __func__
, block_name
);
240 nbits
= block
->postcopy_length
>> TARGET_PAGE_BITS
;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit before hand.
247 le_bitmap
= bitmap_new(nbits
+ BITS_PER_LONG
);
250 * Always use little endian when sending the bitmap. This is
251 * required that when source and destination VMs are not using the
252 * same endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap
, block
->receivedmap
, nbits
);
256 /* Size of the bitmap, in bytes */
257 size
= DIV_ROUND_UP(nbits
, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
265 size
= ROUND_UP(size
, 8);
267 qemu_put_be64(file
, size
);
268 qemu_put_buffer(file
, (const uint8_t *)le_bitmap
, size
);
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
273 qemu_put_be64(file
, RAMBLOCK_RECV_BITMAP_ENDING
);
278 if (qemu_file_get_error(file
)) {
279 return qemu_file_get_error(file
);
282 return size
+ sizeof(size
);
286 * An outstanding page request, on the source, having been received
289 struct RAMSrcPageRequest
{
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest
) next_req
;
297 /* State of RAM for migration */
299 /* QEMUFile used for this migration */
301 /* UFFD file descriptor, used in 'write-tracking' migration */
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock
*last_seen_block
;
305 /* Last block from where we have sent data */
306 RAMBlock
*last_sent_block
;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page
;
309 /* last ram version we have seen */
310 uint32_t last_version
;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt
;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync
;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev
;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period
;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev
;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev
;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev
;
326 /* Start using XBZRLE (e.g., after the first round). */
329 /* compression statistics since the beginning of the period */
330 /* amount of count that no free thread to compress data */
331 uint64_t compress_thread_busy_prev
;
332 /* amount bytes after compression */
333 uint64_t compressed_size_prev
;
334 /* amount of compressed pages */
335 uint64_t compress_pages_prev
;
337 /* total handled target pages at the beginning of period */
338 uint64_t target_page_count_prev
;
339 /* total handled target pages since start */
340 uint64_t target_page_count
;
341 /* number of dirty bits in the bitmap */
342 uint64_t migration_dirty_pages
;
343 /* Protects modification of the bitmap and migration dirty pages */
344 QemuMutex bitmap_mutex
;
345 /* The RAMBlock used in the last src_page_requests */
346 RAMBlock
*last_req_rb
;
347 /* Queue of outstanding page requests from the destination */
348 QemuMutex src_page_req_mutex
;
349 QSIMPLEQ_HEAD(, RAMSrcPageRequest
) src_page_requests
;
351 typedef struct RAMState RAMState
;
353 static RAMState
*ram_state
;
355 static NotifierWithReturnList precopy_notifier_list
;
357 void precopy_infrastructure_init(void)
359 notifier_with_return_list_init(&precopy_notifier_list
);
362 void precopy_add_notifier(NotifierWithReturn
*n
)
364 notifier_with_return_list_add(&precopy_notifier_list
, n
);
367 void precopy_remove_notifier(NotifierWithReturn
*n
)
369 notifier_with_return_remove(n
);
372 int precopy_notify(PrecopyNotifyReason reason
, Error
**errp
)
374 PrecopyNotifyData pnd
;
378 return notifier_with_return_list_notify(&precopy_notifier_list
, &pnd
);
381 uint64_t ram_bytes_remaining(void)
383 return ram_state
? (ram_state
->migration_dirty_pages
* TARGET_PAGE_SIZE
) :
387 MigrationStats ram_counters
;
389 /* used by the search for pages to send */
390 struct PageSearchStatus
{
391 /* Current block being searched */
393 /* Current page to search from */
395 /* Set once we wrap around */
398 typedef struct PageSearchStatus PageSearchStatus
;
400 CompressionStats compression_counters
;
402 struct CompressParam
{
412 /* internally used fields */
416 typedef struct CompressParam CompressParam
;
418 struct DecompressParam
{
428 typedef struct DecompressParam DecompressParam
;
430 static CompressParam
*comp_param
;
431 static QemuThread
*compress_threads
;
432 /* comp_done_cond is used to wake up the migration thread when
433 * one of the compression threads has finished the compression.
434 * comp_done_lock is used to co-work with comp_done_cond.
436 static QemuMutex comp_done_lock
;
437 static QemuCond comp_done_cond
;
438 /* The empty QEMUFileOps will be used by file in CompressParam */
439 static const QEMUFileOps empty_ops
= { };
441 static QEMUFile
*decomp_file
;
442 static DecompressParam
*decomp_param
;
443 static QemuThread
*decompress_threads
;
444 static QemuMutex decomp_done_lock
;
445 static QemuCond decomp_done_cond
;
447 static bool do_compress_ram_page(QEMUFile
*f
, z_stream
*stream
, RAMBlock
*block
,
448 ram_addr_t offset
, uint8_t *source_buf
);
450 static void *do_data_compress(void *opaque
)
452 CompressParam
*param
= opaque
;
457 qemu_mutex_lock(¶m
->mutex
);
458 while (!param
->quit
) {
460 block
= param
->block
;
461 offset
= param
->offset
;
463 qemu_mutex_unlock(¶m
->mutex
);
465 zero_page
= do_compress_ram_page(param
->file
, ¶m
->stream
,
466 block
, offset
, param
->originbuf
);
468 qemu_mutex_lock(&comp_done_lock
);
470 param
->zero_page
= zero_page
;
471 qemu_cond_signal(&comp_done_cond
);
472 qemu_mutex_unlock(&comp_done_lock
);
474 qemu_mutex_lock(¶m
->mutex
);
476 qemu_cond_wait(¶m
->cond
, ¶m
->mutex
);
479 qemu_mutex_unlock(¶m
->mutex
);
484 static void compress_threads_save_cleanup(void)
488 if (!migrate_use_compression() || !comp_param
) {
492 thread_count
= migrate_compress_threads();
493 for (i
= 0; i
< thread_count
; i
++) {
495 * we use it as a indicator which shows if the thread is
496 * properly init'd or not
498 if (!comp_param
[i
].file
) {
502 qemu_mutex_lock(&comp_param
[i
].mutex
);
503 comp_param
[i
].quit
= true;
504 qemu_cond_signal(&comp_param
[i
].cond
);
505 qemu_mutex_unlock(&comp_param
[i
].mutex
);
507 qemu_thread_join(compress_threads
+ i
);
508 qemu_mutex_destroy(&comp_param
[i
].mutex
);
509 qemu_cond_destroy(&comp_param
[i
].cond
);
510 deflateEnd(&comp_param
[i
].stream
);
511 g_free(comp_param
[i
].originbuf
);
512 qemu_fclose(comp_param
[i
].file
);
513 comp_param
[i
].file
= NULL
;
515 qemu_mutex_destroy(&comp_done_lock
);
516 qemu_cond_destroy(&comp_done_cond
);
517 g_free(compress_threads
);
519 compress_threads
= NULL
;
523 static int compress_threads_save_setup(void)
527 if (!migrate_use_compression()) {
530 thread_count
= migrate_compress_threads();
531 compress_threads
= g_new0(QemuThread
, thread_count
);
532 comp_param
= g_new0(CompressParam
, thread_count
);
533 qemu_cond_init(&comp_done_cond
);
534 qemu_mutex_init(&comp_done_lock
);
535 for (i
= 0; i
< thread_count
; i
++) {
536 comp_param
[i
].originbuf
= g_try_malloc(TARGET_PAGE_SIZE
);
537 if (!comp_param
[i
].originbuf
) {
541 if (deflateInit(&comp_param
[i
].stream
,
542 migrate_compress_level()) != Z_OK
) {
543 g_free(comp_param
[i
].originbuf
);
547 /* comp_param[i].file is just used as a dummy buffer to save data,
548 * set its ops to empty.
550 comp_param
[i
].file
= qemu_fopen_ops(NULL
, &empty_ops
, false);
551 comp_param
[i
].done
= true;
552 comp_param
[i
].quit
= false;
553 qemu_mutex_init(&comp_param
[i
].mutex
);
554 qemu_cond_init(&comp_param
[i
].cond
);
555 qemu_thread_create(compress_threads
+ i
, "compress",
556 do_data_compress
, comp_param
+ i
,
557 QEMU_THREAD_JOINABLE
);
562 compress_threads_save_cleanup();
567 * save_page_header: write page header to wire
569 * If this is the 1st block, it also writes the block identification
571 * Returns the number of bytes written
573 * @f: QEMUFile where to send the data
574 * @block: block that contains the page we want to send
575 * @offset: offset inside the block for the page
576 * in the lower bits, it contains flags
578 static size_t save_page_header(RAMState
*rs
, QEMUFile
*f
, RAMBlock
*block
,
583 if (block
== rs
->last_sent_block
) {
584 offset
|= RAM_SAVE_FLAG_CONTINUE
;
586 qemu_put_be64(f
, offset
);
589 if (!(offset
& RAM_SAVE_FLAG_CONTINUE
)) {
590 len
= strlen(block
->idstr
);
591 qemu_put_byte(f
, len
);
592 qemu_put_buffer(f
, (uint8_t *)block
->idstr
, len
);
594 rs
->last_sent_block
= block
;
600 * mig_throttle_guest_down: throttle down the guest
602 * Reduce amount of guest cpu execution to hopefully slow down memory
603 * writes. If guest dirty memory rate is reduced below the rate at
604 * which we can transfer pages to the destination then we should be
605 * able to complete migration. Some workloads dirty memory way too
606 * fast and will not effectively converge, even with auto-converge.
608 static void mig_throttle_guest_down(uint64_t bytes_dirty_period
,
609 uint64_t bytes_dirty_threshold
)
611 MigrationState
*s
= migrate_get_current();
612 uint64_t pct_initial
= s
->parameters
.cpu_throttle_initial
;
613 uint64_t pct_increment
= s
->parameters
.cpu_throttle_increment
;
614 bool pct_tailslow
= s
->parameters
.cpu_throttle_tailslow
;
615 int pct_max
= s
->parameters
.max_cpu_throttle
;
617 uint64_t throttle_now
= cpu_throttle_get_percentage();
618 uint64_t cpu_now
, cpu_ideal
, throttle_inc
;
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial
);
624 /* Throttling already on, just increase the rate */
626 throttle_inc
= pct_increment
;
628 /* Compute the ideal CPU percentage used by Guest, which may
629 * make the dirty rate match the dirty rate threshold. */
630 cpu_now
= 100 - throttle_now
;
631 cpu_ideal
= cpu_now
* (bytes_dirty_threshold
* 1.0 /
633 throttle_inc
= MIN(cpu_now
- cpu_ideal
, pct_increment
);
635 cpu_throttle_set(MIN(throttle_now
+ throttle_inc
, pct_max
));
639 void mig_throttle_counter_reset(void)
641 RAMState
*rs
= ram_state
;
643 rs
->time_last_bitmap_sync
= qemu_clock_get_ms(QEMU_CLOCK_REALTIME
);
644 rs
->num_dirty_pages_period
= 0;
645 rs
->bytes_xfer_prev
= ram_counters
.transferred
;
649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
651 * @rs: current RAM state
652 * @current_addr: address for the zero page
654 * Update the xbzrle cache to reflect a page that's been sent as all 0.
655 * The important thing is that a stale (not-yet-0'd) page be replaced
657 * As a bonus, if the page wasn't in the cache it gets added so that
658 * when a small write is made into the 0'd page it gets XBZRLE sent.
660 static void xbzrle_cache_zero_page(RAMState
*rs
, ram_addr_t current_addr
)
662 if (!rs
->xbzrle_enabled
) {
666 /* We don't care if this fails to allocate a new cache page
667 * as long as it updated an old one */
668 cache_insert(XBZRLE
.cache
, current_addr
, XBZRLE
.zero_target_page
,
669 ram_counters
.dirty_sync_count
);
672 #define ENCODING_FLAG_XBZRLE 0x1
675 * save_xbzrle_page: compress and send current page
677 * Returns: 1 means that we wrote the page
678 * 0 means that page is identical to the one already sent
679 * -1 means that xbzrle would be longer than normal
681 * @rs: current RAM state
682 * @current_data: pointer to the address of the page contents
683 * @current_addr: addr of the page
684 * @block: block that contains the page we want to send
685 * @offset: offset inside the block for the page
686 * @last_stage: if we are at the completion stage
688 static int save_xbzrle_page(RAMState
*rs
, uint8_t **current_data
,
689 ram_addr_t current_addr
, RAMBlock
*block
,
690 ram_addr_t offset
, bool last_stage
)
692 int encoded_len
= 0, bytes_xbzrle
;
693 uint8_t *prev_cached_page
;
695 if (!cache_is_cached(XBZRLE
.cache
, current_addr
,
696 ram_counters
.dirty_sync_count
)) {
697 xbzrle_counters
.cache_miss
++;
699 if (cache_insert(XBZRLE
.cache
, current_addr
, *current_data
,
700 ram_counters
.dirty_sync_count
) == -1) {
703 /* update *current_data when the page has been
704 inserted into cache */
705 *current_data
= get_cached_data(XBZRLE
.cache
, current_addr
);
712 * Reaching here means the page has hit the xbzrle cache, no matter what
713 * encoding result it is (normal encoding, overflow or skipping the page),
714 * count the page as encoded. This is used to calculate the encoding rate.
716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
717 * 2nd page turns out to be skipped (i.e. no new bytes written to the
718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
719 * skipped page included. In this way, the encoding rate can tell if the
720 * guest page is good for xbzrle encoding.
722 xbzrle_counters
.pages
++;
723 prev_cached_page
= get_cached_data(XBZRLE
.cache
, current_addr
);
725 /* save current buffer into memory */
726 memcpy(XBZRLE
.current_buf
, *current_data
, TARGET_PAGE_SIZE
);
728 /* XBZRLE encoding (if there is no overflow) */
729 encoded_len
= xbzrle_encode_buffer(prev_cached_page
, XBZRLE
.current_buf
,
730 TARGET_PAGE_SIZE
, XBZRLE
.encoded_buf
,
734 * Update the cache contents, so that it corresponds to the data
735 * sent, in all cases except where we skip the page.
737 if (!last_stage
&& encoded_len
!= 0) {
738 memcpy(prev_cached_page
, XBZRLE
.current_buf
, TARGET_PAGE_SIZE
);
740 * In the case where we couldn't compress, ensure that the caller
741 * sends the data from the cache, since the guest might have
742 * changed the RAM since we copied it.
744 *current_data
= prev_cached_page
;
747 if (encoded_len
== 0) {
748 trace_save_xbzrle_page_skipping();
750 } else if (encoded_len
== -1) {
751 trace_save_xbzrle_page_overflow();
752 xbzrle_counters
.overflow
++;
753 xbzrle_counters
.bytes
+= TARGET_PAGE_SIZE
;
757 /* Send XBZRLE based compressed page */
758 bytes_xbzrle
= save_page_header(rs
, rs
->f
, block
,
759 offset
| RAM_SAVE_FLAG_XBZRLE
);
760 qemu_put_byte(rs
->f
, ENCODING_FLAG_XBZRLE
);
761 qemu_put_be16(rs
->f
, encoded_len
);
762 qemu_put_buffer(rs
->f
, XBZRLE
.encoded_buf
, encoded_len
);
763 bytes_xbzrle
+= encoded_len
+ 1 + 2;
765 * Like compressed_size (please see update_compress_thread_counts),
766 * the xbzrle encoded bytes don't count the 8 byte header with
767 * RAM_SAVE_FLAG_CONTINUE.
769 xbzrle_counters
.bytes
+= bytes_xbzrle
- 8;
770 ram_counters
.transferred
+= bytes_xbzrle
;
776 * migration_bitmap_find_dirty: find the next dirty page from start
778 * Returns the page offset within memory region of the start of a dirty page
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
785 unsigned long migration_bitmap_find_dirty(RAMState
*rs
, RAMBlock
*rb
,
788 unsigned long size
= rb
->used_length
>> TARGET_PAGE_BITS
;
789 unsigned long *bitmap
= rb
->bmap
;
791 if (ramblock_is_ignored(rb
)) {
795 return find_next_bit(bitmap
, size
, start
);
798 static void migration_clear_memory_region_dirty_bitmap(RAMBlock
*rb
,
804 if (!rb
->clear_bmap
|| !clear_bmap_test_and_clear(rb
, page
)) {
808 shift
= rb
->clear_bmap_shift
;
810 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
811 * can make things easier sometimes since then start address
812 * of the small chunk will always be 64 pages aligned so the
813 * bitmap will always be aligned to unsigned long. We should
814 * even be able to remove this restriction but I'm simply
819 size
= 1ULL << (TARGET_PAGE_BITS
+ shift
);
820 start
= QEMU_ALIGN_DOWN((ram_addr_t
)page
<< TARGET_PAGE_BITS
, size
);
821 trace_migration_bitmap_clear_dirty(rb
->idstr
, start
, size
, page
);
822 memory_region_clear_dirty_bitmap(rb
->mr
, start
, size
);
826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock
*rb
,
828 unsigned long npages
)
830 unsigned long i
, chunk_pages
= 1UL << rb
->clear_bmap_shift
;
831 unsigned long chunk_start
= QEMU_ALIGN_DOWN(start
, chunk_pages
);
832 unsigned long chunk_end
= QEMU_ALIGN_UP(start
+ npages
, chunk_pages
);
835 * Clear pages from start to start + npages - 1, so the end boundary is
838 for (i
= chunk_start
; i
< chunk_end
; i
+= chunk_pages
) {
839 migration_clear_memory_region_dirty_bitmap(rb
, i
);
844 * colo_bitmap_find_diry:find contiguous dirty pages from start
846 * Returns the page offset within memory region of the start of the contiguout
849 * @rs: current RAM state
850 * @rb: RAMBlock where to search for dirty pages
851 * @start: page where we start the search
852 * @num: the number of contiguous dirty pages
855 unsigned long colo_bitmap_find_dirty(RAMState
*rs
, RAMBlock
*rb
,
856 unsigned long start
, unsigned long *num
)
858 unsigned long size
= rb
->used_length
>> TARGET_PAGE_BITS
;
859 unsigned long *bitmap
= rb
->bmap
;
860 unsigned long first
, next
;
864 if (ramblock_is_ignored(rb
)) {
868 first
= find_next_bit(bitmap
, size
, start
);
872 next
= find_next_zero_bit(bitmap
, size
, first
+ 1);
873 assert(next
>= first
);
878 static inline bool migration_bitmap_clear_dirty(RAMState
*rs
,
885 * Clear dirty bitmap if needed. This _must_ be called before we
886 * send any of the page in the chunk because we need to make sure
887 * we can capture further page content changes when we sync dirty
888 * log the next time. So as long as we are going to send any of
889 * the page in the chunk we clear the remote dirty bitmap for all.
890 * Clearing it earlier won't be a problem, but too late will.
892 migration_clear_memory_region_dirty_bitmap(rb
, page
);
894 ret
= test_and_clear_bit(page
, rb
->bmap
);
896 rs
->migration_dirty_pages
--;
902 static void dirty_bitmap_clear_section(MemoryRegionSection
*section
,
905 const hwaddr offset
= section
->offset_within_region
;
906 const hwaddr size
= int128_get64(section
->size
);
907 const unsigned long start
= offset
>> TARGET_PAGE_BITS
;
908 const unsigned long npages
= size
>> TARGET_PAGE_BITS
;
909 RAMBlock
*rb
= section
->mr
->ram_block
;
910 uint64_t *cleared_bits
= opaque
;
913 * We don't grab ram_state->bitmap_mutex because we expect to run
914 * only when starting migration or during postcopy recovery where
915 * we don't have concurrent access.
917 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
918 migration_clear_memory_region_dirty_bitmap_range(rb
, start
, npages
);
920 *cleared_bits
+= bitmap_count_one_with_offset(rb
->bmap
, start
, npages
);
921 bitmap_clear(rb
->bmap
, start
, npages
);
925 * Exclude all dirty pages from migration that fall into a discarded range as
926 * managed by a RamDiscardManager responsible for the mapped memory region of
927 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
929 * Discarded pages ("logically unplugged") have undefined content and must
930 * not get migrated, because even reading these pages for migration might
931 * result in undesired behavior.
933 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
935 * Note: The result is only stable while migrating (precopy/postcopy).
937 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock
*rb
)
939 uint64_t cleared_bits
= 0;
941 if (rb
->mr
&& rb
->bmap
&& memory_region_has_ram_discard_manager(rb
->mr
)) {
942 RamDiscardManager
*rdm
= memory_region_get_ram_discard_manager(rb
->mr
);
943 MemoryRegionSection section
= {
945 .offset_within_region
= 0,
946 .size
= int128_make64(qemu_ram_get_used_length(rb
)),
949 ram_discard_manager_replay_discarded(rdm
, §ion
,
950 dirty_bitmap_clear_section
,
957 * Check if a host-page aligned page falls into a discarded range as managed by
958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
960 * Note: The result is only stable while migrating (precopy/postcopy).
962 bool ramblock_page_is_discarded(RAMBlock
*rb
, ram_addr_t start
)
964 if (rb
->mr
&& memory_region_has_ram_discard_manager(rb
->mr
)) {
965 RamDiscardManager
*rdm
= memory_region_get_ram_discard_manager(rb
->mr
);
966 MemoryRegionSection section
= {
968 .offset_within_region
= start
,
969 .size
= int128_make64(qemu_ram_pagesize(rb
)),
972 return !ram_discard_manager_is_populated(rdm
, §ion
);
977 /* Called with RCU critical section */
978 static void ramblock_sync_dirty_bitmap(RAMState
*rs
, RAMBlock
*rb
)
980 uint64_t new_dirty_pages
=
981 cpu_physical_memory_sync_dirty_bitmap(rb
, 0, rb
->used_length
);
983 rs
->migration_dirty_pages
+= new_dirty_pages
;
984 rs
->num_dirty_pages_period
+= new_dirty_pages
;
988 * ram_pagesize_summary: calculate all the pagesizes of a VM
990 * Returns a summary bitmap of the page sizes of all RAMBlocks
992 * For VMs with just normal pages this is equivalent to the host page
993 * size. If it's got some huge pages then it's the OR of all the
994 * different page sizes.
996 uint64_t ram_pagesize_summary(void)
999 uint64_t summary
= 0;
1001 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1002 summary
|= block
->page_size
;
1008 uint64_t ram_get_total_transferred_pages(void)
1010 return ram_counters
.normal
+ ram_counters
.duplicate
+
1011 compression_counters
.pages
+ xbzrle_counters
.pages
;
1014 static void migration_update_rates(RAMState
*rs
, int64_t end_time
)
1016 uint64_t page_count
= rs
->target_page_count
- rs
->target_page_count_prev
;
1017 double compressed_size
;
1019 /* calculate period counters */
1020 ram_counters
.dirty_pages_rate
= rs
->num_dirty_pages_period
* 1000
1021 / (end_time
- rs
->time_last_bitmap_sync
);
1027 if (migrate_use_xbzrle()) {
1028 double encoded_size
, unencoded_size
;
1030 xbzrle_counters
.cache_miss_rate
= (double)(xbzrle_counters
.cache_miss
-
1031 rs
->xbzrle_cache_miss_prev
) / page_count
;
1032 rs
->xbzrle_cache_miss_prev
= xbzrle_counters
.cache_miss
;
1033 unencoded_size
= (xbzrle_counters
.pages
- rs
->xbzrle_pages_prev
) *
1035 encoded_size
= xbzrle_counters
.bytes
- rs
->xbzrle_bytes_prev
;
1036 if (xbzrle_counters
.pages
== rs
->xbzrle_pages_prev
|| !encoded_size
) {
1037 xbzrle_counters
.encoding_rate
= 0;
1039 xbzrle_counters
.encoding_rate
= unencoded_size
/ encoded_size
;
1041 rs
->xbzrle_pages_prev
= xbzrle_counters
.pages
;
1042 rs
->xbzrle_bytes_prev
= xbzrle_counters
.bytes
;
1045 if (migrate_use_compression()) {
1046 compression_counters
.busy_rate
= (double)(compression_counters
.busy
-
1047 rs
->compress_thread_busy_prev
) / page_count
;
1048 rs
->compress_thread_busy_prev
= compression_counters
.busy
;
1050 compressed_size
= compression_counters
.compressed_size
-
1051 rs
->compressed_size_prev
;
1052 if (compressed_size
) {
1053 double uncompressed_size
= (compression_counters
.pages
-
1054 rs
->compress_pages_prev
) * TARGET_PAGE_SIZE
;
1056 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1057 compression_counters
.compression_rate
=
1058 uncompressed_size
/ compressed_size
;
1060 rs
->compress_pages_prev
= compression_counters
.pages
;
1061 rs
->compressed_size_prev
= compression_counters
.compressed_size
;
1066 static void migration_trigger_throttle(RAMState
*rs
)
1068 MigrationState
*s
= migrate_get_current();
1069 uint64_t threshold
= s
->parameters
.throttle_trigger_threshold
;
1071 uint64_t bytes_xfer_period
= ram_counters
.transferred
- rs
->bytes_xfer_prev
;
1072 uint64_t bytes_dirty_period
= rs
->num_dirty_pages_period
* TARGET_PAGE_SIZE
;
1073 uint64_t bytes_dirty_threshold
= bytes_xfer_period
* threshold
/ 100;
1075 /* During block migration the auto-converge logic incorrectly detects
1076 * that ram migration makes no progress. Avoid this by disabling the
1077 * throttling logic during the bulk phase of block migration. */
1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1079 /* The following detection logic can be refined later. For now:
1080 Check to see if the ratio between dirtied bytes and the approx.
1081 amount of bytes that just got transferred since the last time
1082 we were in this routine reaches the threshold. If that happens
1083 twice, start or increase throttling. */
1085 if ((bytes_dirty_period
> bytes_dirty_threshold
) &&
1086 (++rs
->dirty_rate_high_cnt
>= 2)) {
1087 trace_migration_throttle();
1088 rs
->dirty_rate_high_cnt
= 0;
1089 mig_throttle_guest_down(bytes_dirty_period
,
1090 bytes_dirty_threshold
);
1095 static void migration_bitmap_sync(RAMState
*rs
)
1100 ram_counters
.dirty_sync_count
++;
1102 if (!rs
->time_last_bitmap_sync
) {
1103 rs
->time_last_bitmap_sync
= qemu_clock_get_ms(QEMU_CLOCK_REALTIME
);
1106 trace_migration_bitmap_sync_start();
1107 memory_global_dirty_log_sync();
1109 qemu_mutex_lock(&rs
->bitmap_mutex
);
1110 WITH_RCU_READ_LOCK_GUARD() {
1111 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1112 ramblock_sync_dirty_bitmap(rs
, block
);
1114 ram_counters
.remaining
= ram_bytes_remaining();
1116 qemu_mutex_unlock(&rs
->bitmap_mutex
);
1118 memory_global_after_dirty_log_sync();
1119 trace_migration_bitmap_sync_end(rs
->num_dirty_pages_period
);
1121 end_time
= qemu_clock_get_ms(QEMU_CLOCK_REALTIME
);
1123 /* more than 1 second = 1000 millisecons */
1124 if (end_time
> rs
->time_last_bitmap_sync
+ 1000) {
1125 migration_trigger_throttle(rs
);
1127 migration_update_rates(rs
, end_time
);
1129 rs
->target_page_count_prev
= rs
->target_page_count
;
1131 /* reset period counters */
1132 rs
->time_last_bitmap_sync
= end_time
;
1133 rs
->num_dirty_pages_period
= 0;
1134 rs
->bytes_xfer_prev
= ram_counters
.transferred
;
1136 if (migrate_use_events()) {
1137 qapi_event_send_migration_pass(ram_counters
.dirty_sync_count
);
/*
 * migration_bitmap_sync_precopy: bitmap sync wrapped in the precopy
 * before/after notifiers.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        /* error_report_err() freed it; clear so the second call can reuse it */
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}
/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        /* Header plus a single zero byte stands in for the whole page */
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
    return len;
}
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written (1), or -1 if the page was not zero.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        /* Zero pages are accounted as "duplicate" in the stats */
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}
/*
 * ram_release_pages: discard already-sent pages from the source so the
 * guest RAM can be reclaimed; only active with 'release-ram' during
 * postcopy.
 *
 * @rbname: name of the RAMBlock
 * @offset: byte offset of the first page inside the block
 * @pages: number of target pages to discard
 */
static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}
/*
 * control_save_page: try to let the transport (e.g. RDMA) send the page.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @pages: the number of pages written by the control path,
 *         < 0 - error
 *         > 0 - number of pages written
 *
 * Return true if the pages has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        /* Transport has no control path; caller falls back to the stream */
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        /* Transport will account the page later; nothing more to do here */
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}
1255 * directly send the page to the stream
1257 * Returns the number of pages written.
1259 * @rs: current RAM state
1260 * @block: block that contains the page we want to send
1261 * @offset: offset inside the block for the page
1262 * @buf: the page to be sent
1263 * @async: send to page asyncly
1265 static int save_normal_page(RAMState
*rs
, RAMBlock
*block
, ram_addr_t offset
,
1266 uint8_t *buf
, bool async
)
1268 ram_counters
.transferred
+= save_page_header(rs
, rs
->f
, block
,
1269 offset
| RAM_SAVE_FLAG_PAGE
);
1271 qemu_put_buffer_async(rs
->f
, buf
, TARGET_PAGE_SIZE
,
1272 migrate_release_ram() &
1273 migration_in_postcopy());
1275 qemu_put_buffer(rs
->f
, buf
, TARGET_PAGE_SIZE
);
1277 ram_counters
.transferred
+= TARGET_PAGE_SIZE
;
1278 ram_counters
.normal
++;
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}
/*
 * ram_save_multifd_page: queue the page onto a multifd channel.
 *
 * Returns 1 (pages written) on success, -1 on queueing failure.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}
/*
 * do_compress_ram_page: compress one page into @f from a compression thread.
 *
 * Returns true if the page turned out to be a zero page (sent uncompressed
 * as a zero marker), false otherwise.
 *
 * @f: per-thread buffer file the compressed data is written to
 * @stream: per-thread zlib stream
 * @block: block that contains the page
 * @offset: offset inside the block for the page
 * @source_buf: per-thread staging buffer for a stable copy of the page
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to a internal buffer to avoid it being modified by VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        /* Propagate the error to the main migration stream */
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}
/*
 * update_compress_thread_counts: account data produced by one compression
 * thread after its buffer has been flushed to the stream.
 *
 * @param: the finished thread's parameters (zero_page tells which counter)
 * @bytes_xmit: bytes flushed to the migration stream for this page
 */
static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}
static bool save_page_use_compression(RAMState *rs);

/*
 * flush_compressed_data: wait for all compression threads to finish, then
 * flush their per-thread buffers into the migration stream in order.
 *
 * Must be called before data that could supersede in-flight compressed
 * pages is sent (e.g. at end of a bitmap round).
 *
 * @rs: current RAM state
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* First wait until every thread has finished its current page */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
/*
 * set_compress_params: hand a (block, offset) work item to a compression
 * thread; caller holds the thread's mutex and signals its condvar.
 */
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
/*
 * compress_page_with_multi_thread: dispatch one page to an idle compression
 * thread, flushing that thread's previous output first.
 *
 * Returns 1 if the page was handed to a thread, -1 if no thread was free
 * (and 'compress-wait-thread' is off, so the caller sends it as a normal
 * page).
 *
 * @rs: current RAM state
 * @block: block that contains the page
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* Flush this thread's previous page before reusing its buffer */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exists in compression threads's ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also If xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    /* Cheap lock-free emptiness probe before taking the mutex */
    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            /* Multi-page request: consume one page, keep the entry queued */
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            /* Last page of the request: drop the entry and its block ref */
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }

    return block;
}
#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    /* Only the background-snapshot path registers UFFD-WP faults */
    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    /* A WP fault must come from a block we registered for write protection */
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}
/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                false, false);
    }

    return res;
}
/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supports, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    /* Need the kernel to support write-protect pagefault notification */
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}
/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        /* The region must support the write-protect ioctl */
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}
/*
 * populate_read_range: touch every page in [offset, offset+size) of @block
 * read-only, forcing page-table population before UFFD-WP registration.
 */
static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaption when
     * supporting other mappings, like shmem.
     */
    for (; offset < size; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}
/*
 * populate_read_section: RamDiscardManager replay callback — populate the
 * populated (plugged) part of a section.  Always returns 0 (continue).
 */
static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}
/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically. We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}
/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }

        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, block->host,
                block->max_length, true, false)) {
            goto fail;
        }

        /* Mark the block so tracking can be undone later, keep MR alive */
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected
         * remove protection and unregister all succeeded RAM blocks
         */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}
/*
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /* Remove protection and unregister all affected RAM blocks */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}
#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    /* Unreachable: callers must check ram_write_tracking_available() first */
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vcpus got blocked by the write protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even is
         * really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}
/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left. in case that there is any page left, we drop it.
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}
/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same that last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    /* Keep the block alive while the request is queued */
    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}
/*
 * save_page_use_compression: whether pages should currently go through the
 * compression threads.
 *
 * @rs: current RAM state
 */
static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is enabled (e.g., after first round of migration), stop
     * using the data compression. In theory, xbzrle can do better than
     * compression.
     */
    if (rs->xbzrle_enabled) {
        return false;
    }

    return true;
}
/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the fist page as normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    /* No idle compression thread; page goes out via the normal path */
    compression_counters.busy++;
    return false;
}
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    /* Transport-level (e.g. RDMA) fast path first */
    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long hostpage_boundary =
        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            tmppages = ram_save_target_page(rs, pss, last_stage);
            if (tmppages < 0) {
                return tmppages;
            }

            pages += tmppages;
            /*
             * Allow rate limiting to happen in the middle of huge pages if
             * something is sent in the current iteration.
             */
            if (pagesize_bits > 1 && tmppages > 0) {
                migration_rate_limit();
            }
        }
        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    } while ((pss->page < hostpage_boundary) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the min boundary of host page and block */
    pss->page = MIN(pss->page, hostpage_boundary) - 1;

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    /* Resume the scan from where the previous call left off */
    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        /* Postcopy/snapshot requests take priority over the linear scan */
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
/*
 * acct_update_position: account pages sent outside the normal path (e.g. by
 * the transport's control channel) in the global RAM counters.
 *
 * @f: migration stream to advance
 * @size: number of bytes covered (a multiple of TARGET_PAGE_SIZE expected)
 * @zero: true if the pages were zero pages
 */
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}
/*
 * ram_bytes_total_common: total used size of all RAM blocks.
 *
 * @count_ignored: also include blocks that are excluded from migration
 */
static uint64_t ram_bytes_total_common(bool count_ignored)
{
    RAMBlock *block;
    uint64_t total = 0;

    RCU_READ_LOCK_GUARD();

    if (count_ignored) {
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            total += block->used_length;
        }
    } else {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            total += block->used_length;
        }
    }
    return total;
}
/* Total used size of all RAM blocks that take part in migration */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
/* Allocate the incoming-side XBZRLE decode buffer */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
/* Free the incoming-side XBZRLE decode buffer */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
/*
 * ram_state_cleanup: free the RAMState and its queues/locks; *rsp is set
 * to NULL afterwards.
 */
static void ram_state_cleanup(RAMState **rsp)
{
    if (*rsp) {
        migration_page_queue_free(*rsp);
        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
        g_free(*rsp);
        *rsp = NULL;
    }
}
/* Tear down all outgoing-side XBZRLE state (cache and work buffers) */
static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}
/*
 * ram_save_cleanup: SaveVMHandlers .save_cleanup hook — stop dirty logging,
 * free per-block bitmaps and all outgoing RAM state.
 *
 * @opaque: RAMState** registered at setup time
 */
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* We don't use dirty log with background snapshots */
    if (!migrate_background_snapshot()) {
        /* caller have hold iothread lock or is in a bh, so there is
         * no writing race against the migration bitmap
         */
        if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
            /*
             * do not stop dirty log without starting it, since
             * memory_global_dirty_log_stop will assert that
             * memory_global_dirty_log_start/stop used in pairs
             */
            memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
        }
    }

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
/* Reset the scan position and per-run flags for a fresh migration pass */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /* XBZRLE only kicks in after the first full round of RAM */
    rs->xbzrle_enabled = false;
}
2409 #define MAX_WAIT 50 /* ms, half buffered_file limit */
/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2445 /* **** functions for postcopy ***** */
/*
 * ram_postcopy_migrated_memory_release: on the source, discard every run of
 * pages already sent (clear in the bitmap) once postcopy has taken over,
 * releasing the memory back to the host.
 *
 * @ms: current migration state
 */
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
2468 * postcopy_send_discard_bm_ram: discard a RAMBlock
2470 * Returns zero on success
2472 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2474 * @ms: current migration state
2475 * @block: RAMBlock to discard
2477 static int postcopy_send_discard_bm_ram(MigrationState
*ms
, RAMBlock
*block
)
2479 unsigned long end
= block
->used_length
>> TARGET_PAGE_BITS
;
2480 unsigned long current
;
2481 unsigned long *bitmap
= block
->bmap
;
2483 for (current
= 0; current
< end
; ) {
2484 unsigned long one
= find_next_bit(bitmap
, end
, current
);
2485 unsigned long zero
, discard_length
;
2491 zero
= find_next_zero_bit(bitmap
, end
, one
+ 1);
2494 discard_length
= end
- one
;
2496 discard_length
= zero
- one
;
2498 postcopy_discard_send_range(ms
, one
, discard_length
);
2499 current
= one
+ discard_length
;
2506 * postcopy_each_ram_send_discard: discard all RAMBlocks
2508 * Returns 0 for success or negative for error
2510 * Utility for the outgoing postcopy code.
2511 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2512 * passing it bitmap indexes and name.
2513 * (qemu_ram_foreach_block ends up passing unscaled lengths
2514 * which would mean postcopy code would have to deal with target page)
2516 * @ms: current migration state
2518 static int postcopy_each_ram_send_discard(MigrationState
*ms
)
2520 struct RAMBlock
*block
;
2523 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2524 postcopy_discard_send_init(ms
, block
->idstr
);
2527 * Postcopy sends chunks of bitmap over the wire, but it
2528 * just needs indexes at this point, avoids it having
2529 * target page specific code.
2531 ret
= postcopy_send_discard_bm_ram(ms
, block
);
2532 postcopy_discard_send_finish(ms
);
2542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2544 * Helper for postcopy_chunk_hostpages; it's called twice to
2545 * canonicalize the two bitmaps, that are similar, but one is
2548 * Postcopy requires that all target pages in a hostpage are dirty or
2549 * clean, not a mix. This function canonicalizes the bitmaps.
2551 * @ms: current migration state
2552 * @block: block that contains the page we want to canonicalize
2554 static void postcopy_chunk_hostpages_pass(MigrationState
*ms
, RAMBlock
*block
)
2556 RAMState
*rs
= ram_state
;
2557 unsigned long *bitmap
= block
->bmap
;
2558 unsigned int host_ratio
= block
->page_size
/ TARGET_PAGE_SIZE
;
2559 unsigned long pages
= block
->used_length
>> TARGET_PAGE_BITS
;
2560 unsigned long run_start
;
2562 if (block
->page_size
== TARGET_PAGE_SIZE
) {
2563 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2567 /* Find a dirty page */
2568 run_start
= find_next_bit(bitmap
, pages
, 0);
2570 while (run_start
< pages
) {
2573 * If the start of this run of pages is in the middle of a host
2574 * page, then we need to fixup this host page.
2576 if (QEMU_IS_ALIGNED(run_start
, host_ratio
)) {
2577 /* Find the end of this run */
2578 run_start
= find_next_zero_bit(bitmap
, pages
, run_start
+ 1);
2580 * If the end isn't at the start of a host page, then the
2581 * run doesn't finish at the end of a host page
2582 * and we need to discard.
2586 if (!QEMU_IS_ALIGNED(run_start
, host_ratio
)) {
2588 unsigned long fixup_start_addr
= QEMU_ALIGN_DOWN(run_start
,
2590 run_start
= QEMU_ALIGN_UP(run_start
, host_ratio
);
2592 /* Clean up the bitmap */
2593 for (page
= fixup_start_addr
;
2594 page
< fixup_start_addr
+ host_ratio
; page
++) {
2596 * Remark them as dirty, updating the count for any pages
2597 * that weren't previously dirty.
2599 rs
->migration_dirty_pages
+= !test_and_set_bit(page
, bitmap
);
2603 /* Find the next dirty page for the next iteration */
2604 run_start
= find_next_bit(bitmap
, pages
, run_start
);
2609 * postcopy_chunk_hostpages: discard any partially sent host page
2611 * Utility for the outgoing postcopy code.
2613 * Discard any partially sent host-page size chunks, mark any partially
2614 * dirty host-page size chunks as all dirty. In this case the host-page
2615 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2617 * Returns zero on success
2619 * @ms: current migration state
2620 * @block: block we want to work with
2622 static int postcopy_chunk_hostpages(MigrationState
*ms
, RAMBlock
*block
)
2624 postcopy_discard_send_init(ms
, block
->idstr
);
2627 * Ensure that all partially dirty host pages are made fully dirty.
2629 postcopy_chunk_hostpages_pass(ms
, block
);
2631 postcopy_discard_send_finish(ms
);
2636 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2638 * Returns zero on success
2640 * Transmit the set of pages to be discarded after precopy to the target
2641 * these are pages that:
2642 * a) Have been previously transmitted but are now dirty again
2643 * b) Pages that have never been transmitted, this ensures that
2644 * any pages on the destination that have been mapped by background
2645 * tasks get discarded (transparent huge pages is the specific concern)
2646 * Hopefully this is pretty sparse
2648 * @ms: current migration state
2650 int ram_postcopy_send_discard_bitmap(MigrationState
*ms
)
2652 RAMState
*rs
= ram_state
;
2656 RCU_READ_LOCK_GUARD();
2658 /* This should be our last sync, the src is now paused */
2659 migration_bitmap_sync(rs
);
2661 /* Easiest way to make sure we don't resume in the middle of a host-page */
2662 rs
->last_seen_block
= NULL
;
2663 rs
->last_sent_block
= NULL
;
2666 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2667 /* Deal with TPS != HPS and huge pages */
2668 ret
= postcopy_chunk_hostpages(ms
, block
);
2673 #ifdef DEBUG_POSTCOPY
2674 ram_debug_dump_bitmap(block
->bmap
, true,
2675 block
->used_length
>> TARGET_PAGE_BITS
);
2678 trace_ram_postcopy_send_discard_bitmap();
2680 return postcopy_each_ram_send_discard(ms
);
2684 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2686 * Returns zero on success
2688 * @rbname: name of the RAMBlock of the request. NULL means the
2689 * same that last one.
2690 * @start: RAMBlock starting page
2691 * @length: RAMBlock size
2693 int ram_discard_range(const char *rbname
, uint64_t start
, size_t length
)
2695 trace_ram_discard_range(rbname
, start
, length
);
2697 RCU_READ_LOCK_GUARD();
2698 RAMBlock
*rb
= qemu_ram_block_by_name(rbname
);
2701 error_report("ram_discard_range: Failed to find block '%s'", rbname
);
2706 * On source VM, we don't need to update the received bitmap since
2707 * we don't even have one.
2709 if (rb
->receivedmap
) {
2710 bitmap_clear(rb
->receivedmap
, start
>> qemu_target_page_bits(),
2711 length
>> qemu_target_page_bits());
2714 return ram_block_discard_range(rb
, start
, length
);
2718 * For every allocation, we will try not to crash the VM if the
2719 * allocation failed.
2721 static int xbzrle_init(void)
2723 Error
*local_err
= NULL
;
2725 if (!migrate_use_xbzrle()) {
2729 XBZRLE_cache_lock();
2731 XBZRLE
.zero_target_page
= g_try_malloc0(TARGET_PAGE_SIZE
);
2732 if (!XBZRLE
.zero_target_page
) {
2733 error_report("%s: Error allocating zero page", __func__
);
2737 XBZRLE
.cache
= cache_init(migrate_xbzrle_cache_size(),
2738 TARGET_PAGE_SIZE
, &local_err
);
2739 if (!XBZRLE
.cache
) {
2740 error_report_err(local_err
);
2741 goto free_zero_page
;
2744 XBZRLE
.encoded_buf
= g_try_malloc0(TARGET_PAGE_SIZE
);
2745 if (!XBZRLE
.encoded_buf
) {
2746 error_report("%s: Error allocating encoded_buf", __func__
);
2750 XBZRLE
.current_buf
= g_try_malloc(TARGET_PAGE_SIZE
);
2751 if (!XBZRLE
.current_buf
) {
2752 error_report("%s: Error allocating current_buf", __func__
);
2753 goto free_encoded_buf
;
2756 /* We are all good */
2757 XBZRLE_cache_unlock();
2761 g_free(XBZRLE
.encoded_buf
);
2762 XBZRLE
.encoded_buf
= NULL
;
2764 cache_fini(XBZRLE
.cache
);
2765 XBZRLE
.cache
= NULL
;
2767 g_free(XBZRLE
.zero_target_page
);
2768 XBZRLE
.zero_target_page
= NULL
;
2770 XBZRLE_cache_unlock();
2774 static int ram_state_init(RAMState
**rsp
)
2776 *rsp
= g_try_new0(RAMState
, 1);
2779 error_report("%s: Init ramstate fail", __func__
);
2783 qemu_mutex_init(&(*rsp
)->bitmap_mutex
);
2784 qemu_mutex_init(&(*rsp
)->src_page_req_mutex
);
2785 QSIMPLEQ_INIT(&(*rsp
)->src_page_requests
);
2788 * Count the total number of pages used by ram blocks not including any
2789 * gaps due to alignment or unplugs.
2790 * This must match with the initial values of dirty bitmap.
2792 (*rsp
)->migration_dirty_pages
= ram_bytes_total() >> TARGET_PAGE_BITS
;
2793 ram_state_reset(*rsp
);
2798 static void ram_list_init_bitmaps(void)
2800 MigrationState
*ms
= migrate_get_current();
2802 unsigned long pages
;
2805 /* Skip setting bitmap if there is no RAM */
2806 if (ram_bytes_total()) {
2807 shift
= ms
->clear_bitmap_shift
;
2808 if (shift
> CLEAR_BITMAP_SHIFT_MAX
) {
2809 error_report("clear_bitmap_shift (%u) too big, using "
2810 "max value (%u)", shift
, CLEAR_BITMAP_SHIFT_MAX
);
2811 shift
= CLEAR_BITMAP_SHIFT_MAX
;
2812 } else if (shift
< CLEAR_BITMAP_SHIFT_MIN
) {
2813 error_report("clear_bitmap_shift (%u) too small, using "
2814 "min value (%u)", shift
, CLEAR_BITMAP_SHIFT_MIN
);
2815 shift
= CLEAR_BITMAP_SHIFT_MIN
;
2818 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2819 pages
= block
->max_length
>> TARGET_PAGE_BITS
;
2821 * The initial dirty bitmap for migration must be set with all
2822 * ones to make sure we'll migrate every guest RAM page to
2824 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2825 * new migration after a failed migration, ram_list.
2826 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2829 block
->bmap
= bitmap_new(pages
);
2830 bitmap_set(block
->bmap
, 0, pages
);
2831 block
->clear_bmap_shift
= shift
;
2832 block
->clear_bmap
= bitmap_new(clear_bmap_size(pages
, shift
));
2837 static void migration_bitmap_clear_discarded_pages(RAMState
*rs
)
2839 unsigned long pages
;
2842 RCU_READ_LOCK_GUARD();
2844 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
2845 pages
= ramblock_dirty_bitmap_clear_discarded_pages(rb
);
2846 rs
->migration_dirty_pages
-= pages
;
2850 static void ram_init_bitmaps(RAMState
*rs
)
2852 /* For memory_global_dirty_log_start below. */
2853 qemu_mutex_lock_iothread();
2854 qemu_mutex_lock_ramlist();
2856 WITH_RCU_READ_LOCK_GUARD() {
2857 ram_list_init_bitmaps();
2858 /* We don't use dirty log with background snapshots */
2859 if (!migrate_background_snapshot()) {
2860 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION
);
2861 migration_bitmap_sync_precopy(rs
);
2864 qemu_mutex_unlock_ramlist();
2865 qemu_mutex_unlock_iothread();
2868 * After an eventual first bitmap sync, fixup the initial bitmap
2869 * containing all 1s to exclude any discarded pages from migration.
2871 migration_bitmap_clear_discarded_pages(rs
);
2874 static int ram_init_all(RAMState
**rsp
)
2876 if (ram_state_init(rsp
)) {
2880 if (xbzrle_init()) {
2881 ram_state_cleanup(rsp
);
2885 ram_init_bitmaps(*rsp
);
2890 static void ram_state_resume_prepare(RAMState
*rs
, QEMUFile
*out
)
2896 * Postcopy is not using xbzrle/compression, so no need for that.
2897 * Also, since source are already halted, we don't need to care
2898 * about dirty page logging as well.
2901 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2902 pages
+= bitmap_count_one(block
->bmap
,
2903 block
->used_length
>> TARGET_PAGE_BITS
);
2906 /* This may not be aligned with current bitmaps. Recalculate. */
2907 rs
->migration_dirty_pages
= pages
;
2909 ram_state_reset(rs
);
2911 /* Update RAMState cache of output QEMUFile */
2914 trace_ram_state_resume_prepare(pages
);
2918 * This function clears bits of the free pages reported by the caller from the
2919 * migration dirty bitmap. @addr is the host address corresponding to the
2920 * start of the continuous guest free pages, and @len is the total bytes of
2923 void qemu_guest_free_page_hint(void *addr
, size_t len
)
2927 size_t used_len
, start
, npages
;
2928 MigrationState
*s
= migrate_get_current();
2930 /* This function is currently expected to be used during live migration */
2931 if (!migration_is_setup_or_active(s
->state
)) {
2935 for (; len
> 0; len
-= used_len
, addr
+= used_len
) {
2936 block
= qemu_ram_block_from_host(addr
, false, &offset
);
2937 if (unlikely(!block
|| offset
>= block
->used_length
)) {
2939 * The implementation might not support RAMBlock resize during
2940 * live migration, but it could happen in theory with future
2941 * updates. So we add a check here to capture that case.
2943 error_report_once("%s unexpected error", __func__
);
2947 if (len
<= block
->used_length
- offset
) {
2950 used_len
= block
->used_length
- offset
;
2953 start
= offset
>> TARGET_PAGE_BITS
;
2954 npages
= used_len
>> TARGET_PAGE_BITS
;
2956 qemu_mutex_lock(&ram_state
->bitmap_mutex
);
2958 * The skipped free pages are equavalent to be sent from clear_bmap's
2959 * perspective, so clear the bits from the memory region bitmap which
2960 * are initially set. Otherwise those skipped pages will be sent in
2961 * the next round after syncing from the memory region bitmap.
2963 migration_clear_memory_region_dirty_bitmap_range(block
, start
, npages
);
2964 ram_state
->migration_dirty_pages
-=
2965 bitmap_count_one_with_offset(block
->bmap
, start
, npages
);
2966 bitmap_clear(block
->bmap
, start
, npages
);
2967 qemu_mutex_unlock(&ram_state
->bitmap_mutex
);
2972 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2973 * long-running RCU critical section. When rcu-reclaims in the code
2974 * start to become numerous it will be necessary to reduce the
2975 * granularity of these critical sections.
2979 * ram_save_setup: Setup RAM for migration
2981 * Returns zero to indicate success and negative for error
2983 * @f: QEMUFile where to send the data
2984 * @opaque: RAMState pointer
2986 static int ram_save_setup(QEMUFile
*f
, void *opaque
)
2988 RAMState
**rsp
= opaque
;
2991 if (compress_threads_save_setup()) {
2995 /* migration has already setup the bitmap, reuse it. */
2996 if (!migration_in_colo_state()) {
2997 if (ram_init_all(rsp
) != 0) {
2998 compress_threads_save_cleanup();
3004 WITH_RCU_READ_LOCK_GUARD() {
3005 qemu_put_be64(f
, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE
);
3007 RAMBLOCK_FOREACH_MIGRATABLE(block
) {
3008 qemu_put_byte(f
, strlen(block
->idstr
));
3009 qemu_put_buffer(f
, (uint8_t *)block
->idstr
, strlen(block
->idstr
));
3010 qemu_put_be64(f
, block
->used_length
);
3011 if (migrate_postcopy_ram() && block
->page_size
!=
3012 qemu_host_page_size
) {
3013 qemu_put_be64(f
, block
->page_size
);
3015 if (migrate_ignore_shared()) {
3016 qemu_put_be64(f
, block
->mr
->addr
);
3021 ram_control_before_iterate(f
, RAM_CONTROL_SETUP
);
3022 ram_control_after_iterate(f
, RAM_CONTROL_SETUP
);
3024 multifd_send_sync_main(f
);
3025 qemu_put_be64(f
, RAM_SAVE_FLAG_EOS
);
3032 * ram_save_iterate: iterative stage for migration
3034 * Returns zero to indicate success and negative for error
3036 * @f: QEMUFile where to send the data
3037 * @opaque: RAMState pointer
3039 static int ram_save_iterate(QEMUFile
*f
, void *opaque
)
3041 RAMState
**temp
= opaque
;
3042 RAMState
*rs
= *temp
;
3048 if (blk_mig_bulk_active()) {
3049 /* Avoid transferring ram during bulk phase of block migration as
3050 * the bulk phase will usually take a long time and transferring
3051 * ram updates during that time is pointless. */
3056 * We'll take this lock a little bit long, but it's okay for two reasons.
3057 * Firstly, the only possible other thread to take it is who calls
3058 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3059 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3060 * guarantees that we'll at least released it in a regular basis.
3062 qemu_mutex_lock(&rs
->bitmap_mutex
);
3063 WITH_RCU_READ_LOCK_GUARD() {
3064 if (ram_list
.version
!= rs
->last_version
) {
3065 ram_state_reset(rs
);
3068 /* Read version before ram_list.blocks */
3071 ram_control_before_iterate(f
, RAM_CONTROL_ROUND
);
3073 t0
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
3075 while ((ret
= qemu_file_rate_limit(f
)) == 0 ||
3076 !QSIMPLEQ_EMPTY(&rs
->src_page_requests
)) {
3079 if (qemu_file_get_error(f
)) {
3083 pages
= ram_find_and_save_block(rs
, false);
3084 /* no more pages to sent */
3091 qemu_file_set_error(f
, pages
);
3095 rs
->target_page_count
+= pages
;
3098 * During postcopy, it is necessary to make sure one whole host
3099 * page is sent in one chunk.
3101 if (migrate_postcopy_ram()) {
3102 flush_compressed_data(rs
);
3106 * we want to check in the 1st loop, just in case it was the 1st
3107 * time and we had to sync the dirty bitmap.
3108 * qemu_clock_get_ns() is a bit expensive, so we only check each
3111 if ((i
& 63) == 0) {
3112 uint64_t t1
= (qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) - t0
) /
3114 if (t1
> MAX_WAIT
) {
3115 trace_ram_save_iterate_big_wait(t1
, i
);
3122 qemu_mutex_unlock(&rs
->bitmap_mutex
);
3125 * Must occur before EOS (or any QEMUFile operation)
3126 * because of RDMA protocol.
3128 ram_control_after_iterate(f
, RAM_CONTROL_ROUND
);
3132 && migration_is_setup_or_active(migrate_get_current()->state
)) {
3133 multifd_send_sync_main(rs
->f
);
3134 qemu_put_be64(f
, RAM_SAVE_FLAG_EOS
);
3136 ram_counters
.transferred
+= 8;
3138 ret
= qemu_file_get_error(f
);
3148 * ram_save_complete: function called to send the remaining amount of ram
3150 * Returns zero to indicate success or negative on error
3152 * Called with iothread lock
3154 * @f: QEMUFile where to send the data
3155 * @opaque: RAMState pointer
3157 static int ram_save_complete(QEMUFile
*f
, void *opaque
)
3159 RAMState
**temp
= opaque
;
3160 RAMState
*rs
= *temp
;
3163 WITH_RCU_READ_LOCK_GUARD() {
3164 if (!migration_in_postcopy()) {
3165 migration_bitmap_sync_precopy(rs
);
3168 ram_control_before_iterate(f
, RAM_CONTROL_FINISH
);
3170 /* try transferring iterative blocks of memory */
3172 /* flush all remaining blocks regardless of rate limiting */
3176 pages
= ram_find_and_save_block(rs
, !migration_in_colo_state());
3177 /* no more blocks to sent */
3187 flush_compressed_data(rs
);
3188 ram_control_after_iterate(f
, RAM_CONTROL_FINISH
);
3192 multifd_send_sync_main(rs
->f
);
3193 qemu_put_be64(f
, RAM_SAVE_FLAG_EOS
);
3200 static void ram_save_pending(QEMUFile
*f
, void *opaque
, uint64_t max_size
,
3201 uint64_t *res_precopy_only
,
3202 uint64_t *res_compatible
,
3203 uint64_t *res_postcopy_only
)
3205 RAMState
**temp
= opaque
;
3206 RAMState
*rs
= *temp
;
3207 uint64_t remaining_size
;
3209 remaining_size
= rs
->migration_dirty_pages
* TARGET_PAGE_SIZE
;
3211 if (!migration_in_postcopy() &&
3212 remaining_size
< max_size
) {
3213 qemu_mutex_lock_iothread();
3214 WITH_RCU_READ_LOCK_GUARD() {
3215 migration_bitmap_sync_precopy(rs
);
3217 qemu_mutex_unlock_iothread();
3218 remaining_size
= rs
->migration_dirty_pages
* TARGET_PAGE_SIZE
;
3221 if (migrate_postcopy_ram()) {
3222 /* We can do postcopy, and all the data is postcopiable */
3223 *res_compatible
+= remaining_size
;
3225 *res_precopy_only
+= remaining_size
;
3229 static int load_xbzrle(QEMUFile
*f
, ram_addr_t addr
, void *host
)
3231 unsigned int xh_len
;
3233 uint8_t *loaded_data
;
3235 /* extract RLE header */
3236 xh_flags
= qemu_get_byte(f
);
3237 xh_len
= qemu_get_be16(f
);
3239 if (xh_flags
!= ENCODING_FLAG_XBZRLE
) {
3240 error_report("Failed to load XBZRLE page - wrong compression!");
3244 if (xh_len
> TARGET_PAGE_SIZE
) {
3245 error_report("Failed to load XBZRLE page - len overflow!");
3248 loaded_data
= XBZRLE
.decoded_buf
;
3249 /* load data and decode */
3250 /* it can change loaded_data to point to an internal buffer */
3251 qemu_get_buffer_in_place(f
, &loaded_data
, xh_len
);
3254 if (xbzrle_decode_buffer(loaded_data
, xh_len
, host
,
3255 TARGET_PAGE_SIZE
) == -1) {
3256 error_report("Failed to load XBZRLE page - decode error!");
3264 * ram_block_from_stream: read a RAMBlock id from the migration stream
3266 * Must be called from within a rcu critical section.
3268 * Returns a pointer from within the RCU-protected ram_list.
3270 * @f: QEMUFile where to read the data from
3271 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3273 static inline RAMBlock
*ram_block_from_stream(QEMUFile
*f
, int flags
)
3275 static RAMBlock
*block
;
3279 if (flags
& RAM_SAVE_FLAG_CONTINUE
) {
3281 error_report("Ack, bad migration stream!");
3287 len
= qemu_get_byte(f
);
3288 qemu_get_buffer(f
, (uint8_t *)id
, len
);
3291 block
= qemu_ram_block_by_name(id
);
3293 error_report("Can't find block %s", id
);
3297 if (ramblock_is_ignored(block
)) {
3298 error_report("block %s should not be migrated !", id
);
3305 static inline void *host_from_ram_block_offset(RAMBlock
*block
,
3308 if (!offset_in_ramblock(block
, offset
)) {
3312 return block
->host
+ offset
;
3315 static void *host_page_from_ram_block_offset(RAMBlock
*block
,
3318 /* Note: Explicitly no check against offset_in_ramblock(). */
3319 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block
->host
+ offset
),
3323 static ram_addr_t
host_page_offset_from_ram_block_offset(RAMBlock
*block
,
3326 return ((uintptr_t)block
->host
+ offset
) & (block
->page_size
- 1);
3329 static inline void *colo_cache_from_block_offset(RAMBlock
*block
,
3330 ram_addr_t offset
, bool record_bitmap
)
3332 if (!offset_in_ramblock(block
, offset
)) {
3335 if (!block
->colo_cache
) {
3336 error_report("%s: colo_cache is NULL in block :%s",
3337 __func__
, block
->idstr
);
3342 * During colo checkpoint, we need bitmap of these migrated pages.
3343 * It help us to decide which pages in ram cache should be flushed
3344 * into VM's RAM later.
3346 if (record_bitmap
&&
3347 !test_and_set_bit(offset
>> TARGET_PAGE_BITS
, block
->bmap
)) {
3348 ram_state
->migration_dirty_pages
++;
3350 return block
->colo_cache
+ offset
;
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the memset when the destination is already all-zero. */
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}
3370 /* return the size after decompression, or negative value on error */
3372 qemu_uncompress_data(z_stream
*stream
, uint8_t *dest
, size_t dest_len
,
3373 const uint8_t *source
, size_t source_len
)
3377 err
= inflateReset(stream
);
3382 stream
->avail_in
= source_len
;
3383 stream
->next_in
= (uint8_t *)source
;
3384 stream
->avail_out
= dest_len
;
3385 stream
->next_out
= dest
;
3387 err
= inflate(stream
, Z_NO_FLUSH
);
3388 if (err
!= Z_STREAM_END
) {
3392 return stream
->total_out
;
3395 static void *do_data_decompress(void *opaque
)
3397 DecompressParam
*param
= opaque
;
3398 unsigned long pagesize
;
3402 qemu_mutex_lock(¶m
->mutex
);
3403 while (!param
->quit
) {
3408 qemu_mutex_unlock(¶m
->mutex
);
3410 pagesize
= TARGET_PAGE_SIZE
;
3412 ret
= qemu_uncompress_data(¶m
->stream
, des
, pagesize
,
3413 param
->compbuf
, len
);
3414 if (ret
< 0 && migrate_get_current()->decompress_error_check
) {
3415 error_report("decompress data failed");
3416 qemu_file_set_error(decomp_file
, ret
);
3419 qemu_mutex_lock(&decomp_done_lock
);
3421 qemu_cond_signal(&decomp_done_cond
);
3422 qemu_mutex_unlock(&decomp_done_lock
);
3424 qemu_mutex_lock(¶m
->mutex
);
3426 qemu_cond_wait(¶m
->cond
, ¶m
->mutex
);
3429 qemu_mutex_unlock(¶m
->mutex
);
3434 static int wait_for_decompress_done(void)
3436 int idx
, thread_count
;
3438 if (!migrate_use_compression()) {
3442 thread_count
= migrate_decompress_threads();
3443 qemu_mutex_lock(&decomp_done_lock
);
3444 for (idx
= 0; idx
< thread_count
; idx
++) {
3445 while (!decomp_param
[idx
].done
) {
3446 qemu_cond_wait(&decomp_done_cond
, &decomp_done_lock
);
3449 qemu_mutex_unlock(&decomp_done_lock
);
3450 return qemu_file_get_error(decomp_file
);
3453 static void compress_threads_load_cleanup(void)
3455 int i
, thread_count
;
3457 if (!migrate_use_compression()) {
3460 thread_count
= migrate_decompress_threads();
3461 for (i
= 0; i
< thread_count
; i
++) {
3463 * we use it as a indicator which shows if the thread is
3464 * properly init'd or not
3466 if (!decomp_param
[i
].compbuf
) {
3470 qemu_mutex_lock(&decomp_param
[i
].mutex
);
3471 decomp_param
[i
].quit
= true;
3472 qemu_cond_signal(&decomp_param
[i
].cond
);
3473 qemu_mutex_unlock(&decomp_param
[i
].mutex
);
3475 for (i
= 0; i
< thread_count
; i
++) {
3476 if (!decomp_param
[i
].compbuf
) {
3480 qemu_thread_join(decompress_threads
+ i
);
3481 qemu_mutex_destroy(&decomp_param
[i
].mutex
);
3482 qemu_cond_destroy(&decomp_param
[i
].cond
);
3483 inflateEnd(&decomp_param
[i
].stream
);
3484 g_free(decomp_param
[i
].compbuf
);
3485 decomp_param
[i
].compbuf
= NULL
;
3487 g_free(decompress_threads
);
3488 g_free(decomp_param
);
3489 decompress_threads
= NULL
;
3490 decomp_param
= NULL
;
3494 static int compress_threads_load_setup(QEMUFile
*f
)
3496 int i
, thread_count
;
3498 if (!migrate_use_compression()) {
3502 thread_count
= migrate_decompress_threads();
3503 decompress_threads
= g_new0(QemuThread
, thread_count
);
3504 decomp_param
= g_new0(DecompressParam
, thread_count
);
3505 qemu_mutex_init(&decomp_done_lock
);
3506 qemu_cond_init(&decomp_done_cond
);
3508 for (i
= 0; i
< thread_count
; i
++) {
3509 if (inflateInit(&decomp_param
[i
].stream
) != Z_OK
) {
3513 decomp_param
[i
].compbuf
= g_malloc0(compressBound(TARGET_PAGE_SIZE
));
3514 qemu_mutex_init(&decomp_param
[i
].mutex
);
3515 qemu_cond_init(&decomp_param
[i
].cond
);
3516 decomp_param
[i
].done
= true;
3517 decomp_param
[i
].quit
= false;
3518 qemu_thread_create(decompress_threads
+ i
, "decompress",
3519 do_data_decompress
, decomp_param
+ i
,
3520 QEMU_THREAD_JOINABLE
);
3524 compress_threads_load_cleanup();
3528 static void decompress_data_with_multi_threads(QEMUFile
*f
,
3529 void *host
, int len
)
3531 int idx
, thread_count
;
3533 thread_count
= migrate_decompress_threads();
3534 QEMU_LOCK_GUARD(&decomp_done_lock
);
3536 for (idx
= 0; idx
< thread_count
; idx
++) {
3537 if (decomp_param
[idx
].done
) {
3538 decomp_param
[idx
].done
= false;
3539 qemu_mutex_lock(&decomp_param
[idx
].mutex
);
3540 qemu_get_buffer(f
, decomp_param
[idx
].compbuf
, len
);
3541 decomp_param
[idx
].des
= host
;
3542 decomp_param
[idx
].len
= len
;
3543 qemu_cond_signal(&decomp_param
[idx
].cond
);
3544 qemu_mutex_unlock(&decomp_param
[idx
].mutex
);
3548 if (idx
< thread_count
) {
3551 qemu_cond_wait(&decomp_done_cond
, &decomp_done_lock
);
3556 static void colo_init_ram_state(void)
3558 ram_state_init(&ram_state
);
3562 * colo cache: this is for secondary VM, we cache the whole
3563 * memory of the secondary VM, it is need to hold the global lock
3564 * to call this helper.
3566 int colo_init_ram_cache(void)
3570 WITH_RCU_READ_LOCK_GUARD() {
3571 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3572 block
->colo_cache
= qemu_anon_ram_alloc(block
->used_length
,
3573 NULL
, false, false);
3574 if (!block
->colo_cache
) {
3575 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3576 "size 0x" RAM_ADDR_FMT
, __func__
, block
->idstr
,
3577 block
->used_length
);
3578 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3579 if (block
->colo_cache
) {
3580 qemu_anon_ram_free(block
->colo_cache
, block
->used_length
);
3581 block
->colo_cache
= NULL
;
3586 if (!machine_dump_guest_core(current_machine
)) {
3587 qemu_madvise(block
->colo_cache
, block
->used_length
,
3588 QEMU_MADV_DONTDUMP
);
3594 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3595 * with to decide which page in cache should be flushed into SVM's RAM. Here
3596 * we use the same name 'ram_bitmap' as for migration.
3598 if (ram_bytes_total()) {
3601 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3602 unsigned long pages
= block
->max_length
>> TARGET_PAGE_BITS
;
3603 block
->bmap
= bitmap_new(pages
);
3607 colo_init_ram_state();
3611 /* TODO: duplicated with ram_init_bitmaps */
3612 void colo_incoming_start_dirty_log(void)
3614 RAMBlock
*block
= NULL
;
3615 /* For memory_global_dirty_log_start below. */
3616 qemu_mutex_lock_iothread();
3617 qemu_mutex_lock_ramlist();
3619 memory_global_dirty_log_sync();
3620 WITH_RCU_READ_LOCK_GUARD() {
3621 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3622 ramblock_sync_dirty_bitmap(ram_state
, block
);
3623 /* Discard this dirty bitmap record */
3624 bitmap_zero(block
->bmap
, block
->max_length
>> TARGET_PAGE_BITS
);
3626 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION
);
3628 ram_state
->migration_dirty_pages
= 0;
3629 qemu_mutex_unlock_ramlist();
3630 qemu_mutex_unlock_iothread();
3633 /* It is need to hold the global lock to call this helper */
3634 void colo_release_ram_cache(void)
3638 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION
);
3639 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3640 g_free(block
->bmap
);
3644 WITH_RCU_READ_LOCK_GUARD() {
3645 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3646 if (block
->colo_cache
) {
3647 qemu_anon_ram_free(block
->colo_cache
, block
->used_length
);
3648 block
->colo_cache
= NULL
;
3652 ram_state_cleanup(&ram_state
);
3656 * ram_load_setup: Setup RAM for migration incoming side
3658 * Returns zero to indicate success and negative for error
3660 * @f: QEMUFile where to receive the data
3661 * @opaque: RAMState pointer
3663 static int ram_load_setup(QEMUFile
*f
, void *opaque
)
3665 if (compress_threads_load_setup(f
)) {
3669 xbzrle_load_setup();
3670 ramblock_recv_map_init();
3675 static int ram_load_cleanup(void *opaque
)
3679 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
3680 qemu_ram_block_writeback(rb
);
3683 xbzrle_load_cleanup();
3684 compress_threads_load_cleanup();
3686 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
3687 g_free(rb
->receivedmap
);
3688 rb
->receivedmap
= NULL
;
3695 * ram_postcopy_incoming_init: allocate postcopy data structures
3697 * Returns 0 for success and negative if there was one error
3699 * @mis: current migration incoming state
3701 * Allocate data structures etc needed by incoming migration with
3702 * postcopy-ram. postcopy-ram's similarly names
3703 * postcopy_ram_incoming_init does the work.
3705 int ram_postcopy_incoming_init(MigrationIncomingState
*mis
)
3707 return postcopy_ram_incoming_init(mis
);
3711 * ram_load_postcopy: load a page in postcopy case
3713 * Returns 0 for success or -errno in case of error
3715 * Called in postcopy mode by ram_load().
3716 * rcu_read_lock is taken prior to this being called.
3718 * @f: QEMUFile where to send the data
3720 static int ram_load_postcopy(QEMUFile
*f
)
3722 int flags
= 0, ret
= 0;
3723 bool place_needed
= false;
3724 bool matches_target_page_size
= false;
3725 MigrationIncomingState
*mis
= migration_incoming_get_current();
3726 /* Temporary page that is later 'placed' */
3727 void *postcopy_host_page
= mis
->postcopy_tmp_page
;
3728 void *host_page
= NULL
;
3729 bool all_zero
= true;
3730 int target_pages
= 0;
3732 while (!ret
&& !(flags
& RAM_SAVE_FLAG_EOS
)) {
3734 void *page_buffer
= NULL
;
3735 void *place_source
= NULL
;
3736 RAMBlock
*block
= NULL
;
3740 addr
= qemu_get_be64(f
);
3743 * If qemu file error, we should stop here, and then "addr"
3746 ret
= qemu_file_get_error(f
);
3751 flags
= addr
& ~TARGET_PAGE_MASK
;
3752 addr
&= TARGET_PAGE_MASK
;
3754 trace_ram_load_postcopy_loop((uint64_t)addr
, flags
);
3755 if (flags
& (RAM_SAVE_FLAG_ZERO
| RAM_SAVE_FLAG_PAGE
|
3756 RAM_SAVE_FLAG_COMPRESS_PAGE
)) {
3757 block
= ram_block_from_stream(f
, flags
);
3764 * Relying on used_length is racy and can result in false positives.
3765 * We might place pages beyond used_length in case RAM was shrunk
3766 * while in postcopy, which is fine - trying to place via
3767 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3769 if (!block
->host
|| addr
>= block
->postcopy_length
) {
3770 error_report("Illegal RAM offset " RAM_ADDR_FMT
, addr
);
3775 matches_target_page_size
= block
->page_size
== TARGET_PAGE_SIZE
;
3777 * Postcopy requires that we place whole host pages atomically;
3778 * these may be huge pages for RAMBlocks that are backed by
3780 * To make it atomic, the data is read into a temporary page
3781 * that's moved into place later.
3782 * The migration protocol uses, possibly smaller, target-pages
3783 * however the source ensures it always sends all the components
3784 * of a host page in one chunk.
3786 page_buffer
= postcopy_host_page
+
3787 host_page_offset_from_ram_block_offset(block
, addr
);
3788 /* If all TP are zero then we can optimise the place */
3789 if (target_pages
== 1) {
3790 host_page
= host_page_from_ram_block_offset(block
, addr
);
3791 } else if (host_page
!= host_page_from_ram_block_offset(block
,
3793 /* not the 1st TP within the HP */
3794 error_report("Non-same host page %p/%p", host_page
,
3795 host_page_from_ram_block_offset(block
, addr
));
3801 * If it's the last part of a host page then we place the host
3804 if (target_pages
== (block
->page_size
/ TARGET_PAGE_SIZE
)) {
3805 place_needed
= true;
3807 place_source
= postcopy_host_page
;
3810 switch (flags
& ~RAM_SAVE_FLAG_CONTINUE
) {
3811 case RAM_SAVE_FLAG_ZERO
:
3812 ch
= qemu_get_byte(f
);
3814 * Can skip to set page_buffer when
3815 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3817 if (ch
|| !matches_target_page_size
) {
3818 memset(page_buffer
, ch
, TARGET_PAGE_SIZE
);
3825 case RAM_SAVE_FLAG_PAGE
:
3827 if (!matches_target_page_size
) {
3828 /* For huge pages, we always use temporary buffer */
3829 qemu_get_buffer(f
, page_buffer
, TARGET_PAGE_SIZE
);
3832 * For small pages that matches target page size, we
3833 * avoid the qemu_file copy. Instead we directly use
3834 * the buffer of QEMUFile to place the page. Note: we
3835 * cannot do any QEMUFile operation before using that
3836 * buffer to make sure the buffer is valid when
3839 qemu_get_buffer_in_place(f
, (uint8_t **)&place_source
,
3843 case RAM_SAVE_FLAG_COMPRESS_PAGE
:
3845 len
= qemu_get_be32(f
);
3846 if (len
< 0 || len
> compressBound(TARGET_PAGE_SIZE
)) {
3847 error_report("Invalid compressed data length: %d", len
);
3851 decompress_data_with_multi_threads(f
, page_buffer
, len
);
3854 case RAM_SAVE_FLAG_EOS
:
3856 multifd_recv_sync_main();
3859 error_report("Unknown combination of migration flags: 0x%x"
3860 " (postcopy mode)", flags
);
3865 /* Got the whole host page, wait for decompress before placing. */
3867 ret
|= wait_for_decompress_done();
3870 /* Detect for any possible file errors */
3871 if (!ret
&& qemu_file_get_error(f
)) {
3872 ret
= qemu_file_get_error(f
);
3875 if (!ret
&& place_needed
) {
3877 ret
= postcopy_place_page_zero(mis
, host_page
, block
);
3879 ret
= postcopy_place_page(mis
, host_page
, place_source
,
3882 place_needed
= false;
3884 /* Assume we have a zero page until we detect something different */
3892 static bool postcopy_is_advised(void)
3894 PostcopyState ps
= postcopy_state_get();
3895 return ps
>= POSTCOPY_INCOMING_ADVISE
&& ps
< POSTCOPY_INCOMING_END
;
3898 static bool postcopy_is_running(void)
3900 PostcopyState ps
= postcopy_state_get();
3901 return ps
>= POSTCOPY_INCOMING_LISTENING
&& ps
< POSTCOPY_INCOMING_END
;
3905 * Flush content of RAM cache into SVM's memory.
3906 * Only flush the pages that be dirtied by PVM or SVM or both.
3908 void colo_flush_ram_cache(void)
3910 RAMBlock
*block
= NULL
;
3913 unsigned long offset
= 0;
3915 memory_global_dirty_log_sync();
3916 WITH_RCU_READ_LOCK_GUARD() {
3917 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3918 ramblock_sync_dirty_bitmap(ram_state
, block
);
3922 trace_colo_flush_ram_cache_begin(ram_state
->migration_dirty_pages
);
3923 WITH_RCU_READ_LOCK_GUARD() {
3924 block
= QLIST_FIRST_RCU(&ram_list
.blocks
);
3927 unsigned long num
= 0;
3929 offset
= colo_bitmap_find_dirty(ram_state
, block
, offset
, &num
);
3930 if (!offset_in_ramblock(block
,
3931 ((ram_addr_t
)offset
) << TARGET_PAGE_BITS
)) {
3934 block
= QLIST_NEXT_RCU(block
, next
);
3936 unsigned long i
= 0;
3938 for (i
= 0; i
< num
; i
++) {
3939 migration_bitmap_clear_dirty(ram_state
, block
, offset
+ i
);
3941 dst_host
= block
->host
3942 + (((ram_addr_t
)offset
) << TARGET_PAGE_BITS
);
3943 src_host
= block
->colo_cache
3944 + (((ram_addr_t
)offset
) << TARGET_PAGE_BITS
);
3945 memcpy(dst_host
, src_host
, TARGET_PAGE_SIZE
* num
);
3950 trace_colo_flush_ram_cache_end();
3954 * ram_load_precopy: load pages in precopy case
3956 * Returns 0 for success or -errno in case of error
3958 * Called in precopy mode by ram_load().
3959 * rcu_read_lock is taken prior to this being called.
3961 * @f: QEMUFile where to send the data
3963 static int ram_load_precopy(QEMUFile
*f
)
3965 int flags
= 0, ret
= 0, invalid_flags
= 0, len
= 0, i
= 0;
3966 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3967 bool postcopy_advised
= postcopy_is_advised();
3968 if (!migrate_use_compression()) {
3969 invalid_flags
|= RAM_SAVE_FLAG_COMPRESS_PAGE
;
3972 while (!ret
&& !(flags
& RAM_SAVE_FLAG_EOS
)) {
3973 ram_addr_t addr
, total_ram_bytes
;
3974 void *host
= NULL
, *host_bak
= NULL
;
3978 * Yield periodically to let main loop run, but an iteration of
3979 * the main loop is expensive, so do it each some iterations
3981 if ((i
& 32767) == 0 && qemu_in_coroutine()) {
3982 aio_co_schedule(qemu_get_current_aio_context(),
3983 qemu_coroutine_self());
3984 qemu_coroutine_yield();
3988 addr
= qemu_get_be64(f
);
3989 flags
= addr
& ~TARGET_PAGE_MASK
;
3990 addr
&= TARGET_PAGE_MASK
;
3992 if (flags
& invalid_flags
) {
3993 if (flags
& invalid_flags
& RAM_SAVE_FLAG_COMPRESS_PAGE
) {
3994 error_report("Received an unexpected compressed page");
4001 if (flags
& (RAM_SAVE_FLAG_ZERO
| RAM_SAVE_FLAG_PAGE
|
4002 RAM_SAVE_FLAG_COMPRESS_PAGE
| RAM_SAVE_FLAG_XBZRLE
)) {
4003 RAMBlock
*block
= ram_block_from_stream(f
, flags
);
4005 host
= host_from_ram_block_offset(block
, addr
);
4007 * After going into COLO stage, we should not load the page
4008 * into SVM's memory directly, we put them into colo_cache firstly.
4009 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4010 * Previously, we copied all these memory in preparing stage of COLO
4011 * while we need to stop VM, which is a time-consuming process.
4012 * Here we optimize it by a trick, back-up every page while in
4013 * migration process while COLO is enabled, though it affects the
4014 * speed of the migration, but it obviously reduce the downtime of
4015 * back-up all SVM'S memory in COLO preparing stage.
4017 if (migration_incoming_colo_enabled()) {
4018 if (migration_incoming_in_colo_state()) {
4019 /* In COLO stage, put all pages into cache temporarily */
4020 host
= colo_cache_from_block_offset(block
, addr
, true);
4023 * In migration stage but before COLO stage,
4024 * Put all pages into both cache and SVM's memory.
4026 host_bak
= colo_cache_from_block_offset(block
, addr
, false);
4030 error_report("Illegal RAM offset " RAM_ADDR_FMT
, addr
);
4034 if (!migration_incoming_in_colo_state()) {
4035 ramblock_recv_bitmap_set(block
, host
);
4038 trace_ram_load_loop(block
->idstr
, (uint64_t)addr
, flags
, host
);
4041 switch (flags
& ~RAM_SAVE_FLAG_CONTINUE
) {
4042 case RAM_SAVE_FLAG_MEM_SIZE
:
4043 /* Synchronize RAM block list */
4044 total_ram_bytes
= addr
;
4045 while (!ret
&& total_ram_bytes
) {
4050 len
= qemu_get_byte(f
);
4051 qemu_get_buffer(f
, (uint8_t *)id
, len
);
4053 length
= qemu_get_be64(f
);
4055 block
= qemu_ram_block_by_name(id
);
4056 if (block
&& !qemu_ram_is_migratable(block
)) {
4057 error_report("block %s should not be migrated !", id
);
4060 if (length
!= block
->used_length
) {
4061 Error
*local_err
= NULL
;
4063 ret
= qemu_ram_resize(block
, length
,
4066 error_report_err(local_err
);
4069 /* For postcopy we need to check hugepage sizes match */
4070 if (postcopy_advised
&& migrate_postcopy_ram() &&
4071 block
->page_size
!= qemu_host_page_size
) {
4072 uint64_t remote_page_size
= qemu_get_be64(f
);
4073 if (remote_page_size
!= block
->page_size
) {
4074 error_report("Mismatched RAM page size %s "
4075 "(local) %zd != %" PRId64
,
4076 id
, block
->page_size
,
4081 if (migrate_ignore_shared()) {
4082 hwaddr addr
= qemu_get_be64(f
);
4083 if (ramblock_is_ignored(block
) &&
4084 block
->mr
->addr
!= addr
) {
4085 error_report("Mismatched GPAs for block %s "
4086 "%" PRId64
"!= %" PRId64
,
4088 (uint64_t)block
->mr
->addr
);
4092 ram_control_load_hook(f
, RAM_CONTROL_BLOCK_REG
,
4095 error_report("Unknown ramblock \"%s\", cannot "
4096 "accept migration", id
);
4100 total_ram_bytes
-= length
;
4104 case RAM_SAVE_FLAG_ZERO
:
4105 ch
= qemu_get_byte(f
);
4106 ram_handle_compressed(host
, ch
, TARGET_PAGE_SIZE
);
4109 case RAM_SAVE_FLAG_PAGE
:
4110 qemu_get_buffer(f
, host
, TARGET_PAGE_SIZE
);
4113 case RAM_SAVE_FLAG_COMPRESS_PAGE
:
4114 len
= qemu_get_be32(f
);
4115 if (len
< 0 || len
> compressBound(TARGET_PAGE_SIZE
)) {
4116 error_report("Invalid compressed data length: %d", len
);
4120 decompress_data_with_multi_threads(f
, host
, len
);
4123 case RAM_SAVE_FLAG_XBZRLE
:
4124 if (load_xbzrle(f
, addr
, host
) < 0) {
4125 error_report("Failed to decompress XBZRLE page at "
4126 RAM_ADDR_FMT
, addr
);
4131 case RAM_SAVE_FLAG_EOS
:
4133 multifd_recv_sync_main();
4136 if (flags
& RAM_SAVE_FLAG_HOOK
) {
4137 ram_control_load_hook(f
, RAM_CONTROL_HOOK
, NULL
);
4139 error_report("Unknown combination of migration flags: 0x%x",
4145 ret
= qemu_file_get_error(f
);
4147 if (!ret
&& host_bak
) {
4148 memcpy(host_bak
, host
, TARGET_PAGE_SIZE
);
4152 ret
|= wait_for_decompress_done();
4156 static int ram_load(QEMUFile
*f
, void *opaque
, int version_id
)
4159 static uint64_t seq_iter
;
4161 * If system is running in postcopy mode, page inserts to host memory must
4164 bool postcopy_running
= postcopy_is_running();
4168 if (version_id
!= 4) {
4173 * This RCU critical section can be very long running.
4174 * When RCU reclaims in the code start to become numerous,
4175 * it will be necessary to reduce the granularity of this
4178 WITH_RCU_READ_LOCK_GUARD() {
4179 if (postcopy_running
) {
4180 ret
= ram_load_postcopy(f
);
4182 ret
= ram_load_precopy(f
);
4185 trace_ram_load_complete(ret
, seq_iter
);
4190 static bool ram_has_postcopy(void *opaque
)
4193 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
4194 if (ramblock_is_pmem(rb
)) {
4195 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4196 "is not supported now!", rb
->idstr
, rb
->host
);
4201 return migrate_postcopy_ram();
4204 /* Sync all the dirty bitmap with destination VM. */
4205 static int ram_dirty_bitmap_sync_all(MigrationState
*s
, RAMState
*rs
)
4208 QEMUFile
*file
= s
->to_dst_file
;
4209 int ramblock_count
= 0;
4211 trace_ram_dirty_bitmap_sync_start();
4213 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
4214 qemu_savevm_send_recv_bitmap(file
, block
->idstr
);
4215 trace_ram_dirty_bitmap_request(block
->idstr
);
4219 trace_ram_dirty_bitmap_sync_wait();
4221 /* Wait until all the ramblocks' dirty bitmap synced */
4222 while (ramblock_count
--) {
4223 qemu_sem_wait(&s
->rp_state
.rp_sem
);
4226 trace_ram_dirty_bitmap_sync_complete();
4231 static void ram_dirty_bitmap_reload_notify(MigrationState
*s
)
4233 qemu_sem_post(&s
->rp_state
.rp_sem
);
4237 * Read the received bitmap, revert it as the initial dirty bitmap.
4238 * This is only used when the postcopy migration is paused but wants
4239 * to resume from a middle point.
4241 int ram_dirty_bitmap_reload(MigrationState
*s
, RAMBlock
*block
)
4244 /* from_dst_file is always valid because we're within rp_thread */
4245 QEMUFile
*file
= s
->rp_state
.from_dst_file
;
4246 unsigned long *le_bitmap
, nbits
= block
->used_length
>> TARGET_PAGE_BITS
;
4247 uint64_t local_size
= DIV_ROUND_UP(nbits
, 8);
4248 uint64_t size
, end_mark
;
4250 trace_ram_dirty_bitmap_reload_begin(block
->idstr
);
4252 if (s
->state
!= MIGRATION_STATUS_POSTCOPY_RECOVER
) {
4253 error_report("%s: incorrect state %s", __func__
,
4254 MigrationStatus_str(s
->state
));
4259 * Note: see comments in ramblock_recv_bitmap_send() on why we
4260 * need the endianness conversion, and the paddings.
4262 local_size
= ROUND_UP(local_size
, 8);
4265 le_bitmap
= bitmap_new(nbits
+ BITS_PER_LONG
);
4267 size
= qemu_get_be64(file
);
4269 /* The size of the bitmap should match with our ramblock */
4270 if (size
!= local_size
) {
4271 error_report("%s: ramblock '%s' bitmap size mismatch "
4272 "(0x%"PRIx64
" != 0x%"PRIx64
")", __func__
,
4273 block
->idstr
, size
, local_size
);
4278 size
= qemu_get_buffer(file
, (uint8_t *)le_bitmap
, local_size
);
4279 end_mark
= qemu_get_be64(file
);
4281 ret
= qemu_file_get_error(file
);
4282 if (ret
|| size
!= local_size
) {
4283 error_report("%s: read bitmap failed for ramblock '%s': %d"
4284 " (size 0x%"PRIx64
", got: 0x%"PRIx64
")",
4285 __func__
, block
->idstr
, ret
, local_size
, size
);
4290 if (end_mark
!= RAMBLOCK_RECV_BITMAP_ENDING
) {
4291 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64
,
4292 __func__
, block
->idstr
, end_mark
);
4298 * Endianness conversion. We are during postcopy (though paused).
4299 * The dirty bitmap won't change. We can directly modify it.
4301 bitmap_from_le(block
->bmap
, le_bitmap
, nbits
);
4304 * What we received is "received bitmap". Revert it as the initial
4305 * dirty bitmap for this ramblock.
4307 bitmap_complement(block
->bmap
, block
->bmap
, nbits
);
4309 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4310 ramblock_dirty_bitmap_clear_discarded_pages(block
);
4312 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4313 trace_ram_dirty_bitmap_reload_complete(block
->idstr
);
4316 * We succeeded to sync bitmap for current ramblock. If this is
4317 * the last one to sync, we need to notify the main send thread.
4319 ram_dirty_bitmap_reload_notify(s
);
4327 static int ram_resume_prepare(MigrationState
*s
, void *opaque
)
4329 RAMState
*rs
= *(RAMState
**)opaque
;
4332 ret
= ram_dirty_bitmap_sync_all(s
, rs
);
4337 ram_state_resume_prepare(rs
, s
->to_dst_file
);
4342 static SaveVMHandlers savevm_ram_handlers
= {
4343 .save_setup
= ram_save_setup
,
4344 .save_live_iterate
= ram_save_iterate
,
4345 .save_live_complete_postcopy
= ram_save_complete
,
4346 .save_live_complete_precopy
= ram_save_complete
,
4347 .has_postcopy
= ram_has_postcopy
,
4348 .save_live_pending
= ram_save_pending
,
4349 .load_state
= ram_load
,
4350 .save_cleanup
= ram_save_cleanup
,
4351 .load_setup
= ram_load_setup
,
4352 .load_cleanup
= ram_load_cleanup
,
4353 .resume_prepare
= ram_resume_prepare
,
4356 static void ram_mig_ram_block_resized(RAMBlockNotifier
*n
, void *host
,
4357 size_t old_size
, size_t new_size
)
4359 PostcopyState ps
= postcopy_state_get();
4361 RAMBlock
*rb
= qemu_ram_block_from_host(host
, false, &offset
);
4364 if (ramblock_is_ignored(rb
)) {
4368 if (!migration_is_idle()) {
4370 * Precopy code on the source cannot deal with the size of RAM blocks
4371 * changing at random points in time - especially after sending the
4372 * RAM block sizes in the migration stream, they must no longer change.
4373 * Abort and indicate a proper reason.
4375 error_setg(&err
, "RAM block '%s' resized during precopy.", rb
->idstr
);
4376 migration_cancel(err
);
4381 case POSTCOPY_INCOMING_ADVISE
:
4383 * Update what ram_postcopy_incoming_init()->init_range() does at the
4384 * time postcopy was advised. Syncing RAM blocks with the source will
4385 * result in RAM resizes.
4387 if (old_size
< new_size
) {
4388 if (ram_discard_range(rb
->idstr
, old_size
, new_size
- old_size
)) {
4389 error_report("RAM block '%s' discard of resized RAM failed",
4393 rb
->postcopy_length
= new_size
;
4395 case POSTCOPY_INCOMING_NONE
:
4396 case POSTCOPY_INCOMING_RUNNING
:
4397 case POSTCOPY_INCOMING_END
:
4399 * Once our guest is running, postcopy does no longer care about
4400 * resizes. When growing, the new memory was not available on the
4401 * source, no handler needed.
4405 error_report("RAM block '%s' resized during postcopy state: %d",
4411 static RAMBlockNotifier ram_mig_ram_notifier
= {
4412 .ram_block_resized
= ram_mig_ram_block_resized
,
4415 void ram_mig_init(void)
4417 qemu_mutex_init(&XBZRLE
.lock
);
4418 register_savevm_live("ram", 0, 4, &savevm_ram_handlers
, &ram_state
);
4419 ram_block_notifier_add(&ram_mig_ram_notifier
);