/*
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <sys/types.h>

#include "monitor/monitor.h"
#include "sysemu/sysemu.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "hw/i386/pc.h"
#include "hw/pci/pci.h"
#include "hw/audio/audio.h"
#include "migration/migration.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qmp-commands.h"
#include "exec/cpu-all.h"
#include "exec/ram_addr.h"
#include "qemu/host-utils.h"
#include "qemu/rcu_queue.h"
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
static bool mig_throttle_on;
static int dirty_rate_high_cnt;
static void check_guest_throttling(void);

static uint64_t bitmap_sync_count;
/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
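
/* Illustrative sketch, not part of the original code: the flags above are
 * carried in the low bits of the 64-bit address word written to the wire.
 * This works because page offsets are TARGET_PAGE_SIZE aligned, so the low
 * TARGET_PAGE_BITS bits are free. ram_load() below splits incoming words
 * the same way; the example_* names here are hypothetical. */
static inline uint64_t example_encode_addr(ram_addr_t offset, uint64_t flags)
{
    return offset | flags;              /* offset is page aligned */
}

static inline void example_decode_addr(uint64_t addr, ram_addr_t *offset,
                                       uint64_t *flags)
{
    *flags = addr & ~TARGET_PAGE_MASK;  /* low bits hold the flags */
    *offset = addr & TARGET_PAGE_MASK;  /* high bits hold the page address */
}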
static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_find_nonzero_offset(p, size) == size;
}
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}
/*
 * called from qmp_migrate_set_cache_size in main thread, possibly while
 * a migration is in progress.
 * A running migration may be using the cache and might finish during this
 * call, hence changes to the cache are protected by XBZRLE.lock().
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}
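
/* Illustrative usage, not in the original source: the effective size is
 * the request rounded down to a power of two, so a hypothetical caller
 * asking for 5 MiB of cache ends up with 4 MiB. */
static inline void example_cache_resize_usage(void)
{
    int64_t effective = xbzrle_cache_resize(5 * 1024 * 1024);
    /* effective == 4 * 1024 * 1024, i.e. pow2floor(5 MiB), unless the
     * reallocation failed and -1 was returned */
    (void)effective;
}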
/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    double xbzrle_cache_miss_rate;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;
static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}
uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}
/* This is the last block that we have visited searching for dirty pages */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static unsigned long *migration_bitmap;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;
struct CompressParam {
    bool start;
    bool done;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool start;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex *comp_done_lock;
static QemuCond *comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static bool quit_comp_thread;
static bool quit_decomp_thread;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static uint8_t *compressed_data_buf;
static int do_compress_ram_page(CompressParam *param);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;

    while (!quit_comp_thread) {
        qemu_mutex_lock(&param->mutex);
        /* Re-check quit_comp_thread in case terminate_compression_threads()
         * was called just before qemu_mutex_lock(&param->mutex) and after
         * while (!quit_comp_thread); re-checking here makes sure the
         * compression thread terminates as expected.
         */
        while (!param->start && !quit_comp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        if (!quit_comp_thread) {
            do_compress_ram_page(param);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);

        qemu_mutex_lock(comp_done_lock);
        param->done = true;
        qemu_cond_signal(comp_done_cond);
        qemu_mutex_unlock(comp_done_lock);
    }

    return NULL;
}
static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    quit_comp_thread = true;
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(comp_done_lock);
    qemu_cond_destroy(comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    g_free(comp_done_cond);
    g_free(comp_done_lock);
    compress_threads = NULL;
    comp_param = NULL;
    comp_done_cond = NULL;
    comp_done_lock = NULL;
}
void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    quit_comp_thread = false;
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    comp_done_cond = g_new0(QemuCond, 1);
    comp_done_lock = g_new0(QemuMutex, 1);
    qemu_cond_init(comp_done_cond);
    qemu_mutex_init(comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
/**
 * save_page_header: Write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr,
                        strlen(block->idstr));
        size += 1 + strlen(block->idstr);
    }
    return size;
}
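
/* Illustrative counterpart, not part of the original source: a reader for
 * the header written above; ram_load() below is the real consumer. The
 * CONTINUE flag elides the block name for consecutive pages from the same
 * block. The name example_load_page_header is hypothetical. */
static inline uint64_t example_load_page_header(QEMUFile *f, char *idstr)
{
    uint64_t addr = qemu_get_be64(f);       /* page offset | flag bits */

    if (!(addr & RAM_SAVE_FLAG_CONTINUE)) {
        int len = qemu_get_byte(f);         /* 1-byte block-name length */
        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';                  /* caller provides >= 256 bytes */
    }
    return addr;
}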
/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    if (ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 bitmap_sync_count);
}
#define ENCODING_FLAG_XBZRLE 0x1
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @f: QEMUFile where to send the data
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}
static inline
ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
                                                 ram_addr_t start)
{
    unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
    unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);

    unsigned long next;

    if (ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(migration_bitmap, size, nr);
    }

    if (next < size) {
        clear_bit(next, migration_bitmap);
        migration_dirty_pages--;
    }
    return (next - base) << TARGET_PAGE_BITS;
}
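
/* Illustrative mapping, not in the original source: bit (base + n) of
 * migration_bitmap covers the page at block offset (n << TARGET_PAGE_BITS).
 * For a hypothetical block whose MemoryRegion starts at ram_addr 0x100000
 * on a 4 KiB-page target, base = 0x100 and the page at block offset
 * 0x3000 is tracked by bit 0x103. */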
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
    migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(migration_bitmap, start, length);
}
/* Fix me: there are too many global variables used in migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
static int64_t num_dirty_pages_period;
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;

static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}
/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    address_space_sync_dirty_bitmap(&address_space_memory);

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
    }
    rcu_read_unlock();

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens >N times (for now N==4)
               we turn on the throttle down logic */
            bytes_xfer_now = ram_bytes_transferred();
            if (s->dirty_pages_rate &&
                (num_dirty_pages_period * TARGET_PAGE_SIZE >
                 (bytes_xfer_now - bytes_xfer_prev) / 2) &&
                (dirty_rate_high_cnt++ > 4)) {
                trace_migration_throttle();
                mig_throttle_on = true;
                dirty_rate_high_cnt = 0;
            }
            bytes_xfer_prev = bytes_xfer_now;
        } else {
            mig_throttle_on = false;
        }
        if (migrate_use_xbzrle()) {
            if (iterations_prev != acct_info.iterations) {
                acct_info.xbzrle_cache_miss_rate =
                    (double)(acct_info.xbzrle_cache_miss -
                             xbzrle_cache_miss_prev) /
                    (acct_info.iterations - iterations_prev);
            }
            iterations_prev = acct_info.iterations;
            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
        }
        s->dirty_pages_rate = num_dirty_pages_period * 1000
            / (end_time - start_time);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        start_time = end_time;
        num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = bitmap_sync_count;
}
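
/* Worked example with illustrative numbers, not from the source: with
 * 4 KiB target pages, if 51,200 pages were dirtied in a ~1000 ms window
 * (200 MiB) while only 256 MiB were transferred in the same window, then
 * num_dirty_pages_period * TARGET_PAGE_SIZE = 200 MiB exceeds
 * (bytes_xfer_now - bytes_xfer_prev) / 2 = 128 MiB, so the
 * high-dirty-rate counter advances; once it exceeds 4 such windows,
 * mig_throttle_on is set. In that window, dirty_pages_rate would be
 * 51200 * 1000 / 1000 = 51200 pages/s. */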
/**
 * save_zero_page: Send the zero page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p, uint64_t *bytes_transferred)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        acct_info.dup_pages++;
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        *bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}
/**
 * ram_save_page: Send the given page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    MemoryRegion *mr = block->mr;
    uint8_t *p;
    int ret;
    bool send_async = true;

    p = memory_region_get_ram_ptr(mr) + offset;

    /* In doubt send page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
static int do_compress_ram_page(CompressParam *param)
{
    int bytes_sent, blen;
    uint8_t *p;
    RAMBlock *block = param->block;
    ram_addr_t offset = param->offset;

    p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(param->file, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    bytes_sent += blen;

    return bytes_sent;
}
static inline void start_compression(CompressParam *param)
{
    param->done = false;
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}

static inline void start_decompression(DecompressParam *param)
{
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}
static uint64_t bytes_transferred;
static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        if (!comp_param[idx].done) {
            qemu_mutex_lock(comp_done_lock);
            while (!comp_param[idx].done && !quit_comp_thread) {
                qemu_cond_wait(comp_done_cond, comp_done_lock);
            }
            qemu_mutex_unlock(comp_done_lock);
        }
        if (!quit_comp_thread) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            bytes_transferred += len;
        }
    }
}
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                set_compress_params(&comp_param[idx], block, offset);
                start_compression(&comp_param[idx]);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(comp_done_cond, comp_done_lock);
        }
    }
    qemu_mutex_unlock(comp_done_lock);

    return pages;
}
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
                                    ram_addr_t offset, bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    MemoryRegion *mr = block->mr;
    uint8_t *p;
    int ret;

    p = memory_region_get_ram_ptr(mr) + offset;

    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out; keeping this order is important because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                set_compress_params(&comp_param[0], block, offset);
                /* Use the qemu thread to compress the data to make sure the
                 * first page is sent out before other pages
                 */
                bytes_xmit = do_compress_ram_page(&comp_param[0]);
                acct_info.norm_pages++;
                qemu_put_qemu_file(f, comp_param[0].file);
                *bytes_transferred += bytes_xmit;
                pages = 1;
            }
        } else {
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}
/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns: The number of pages written
 *          0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                   uint64_t *bytes_transferred)
{
    RAMBlock *block = last_seen_block;
    ram_addr_t offset = last_offset;
    bool complete_round = false;
    int pages = 0;
    MemoryRegion *mr;

    if (!block) {
        block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    while (true) {
        mr = block->mr;
        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
        if (complete_round && block == last_seen_block &&
            offset >= last_offset) {
            break;
        }
        if (offset >= block->used_length) {
            offset = 0;
            block = QLIST_NEXT_RCU(block, next);
            if (!block) {
                block = QLIST_FIRST_RCU(&ram_list.blocks);
                complete_round = true;
                ram_bulk_stage = false;
                if (migrate_use_xbzrle()) {
                    /* If xbzrle is on, stop using the data compression at this
                     * point. In theory, xbzrle can do better than compression.
                     */
                    flush_compressed_data(f);
                    compression_switch = false;
                }
            }
        } else {
            if (compression_switch && migrate_use_compression()) {
                pages = ram_save_compressed_page(f, block, offset, last_stage,
                                                 bytes_transferred);
            } else {
                pages = ram_save_page(f, block, offset, last_stage,
                                      bytes_transferred);
            }

            /* if page is unmodified, continue to the next */
            if (pages > 0) {
                last_sent_block = block;
                break;
            }
        }
    }

    last_seen_block = block;
    last_offset = offset;

    return pages;
}
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    if (zero) {
        acct_info.dup_pages += pages;
    } else {
        acct_info.norm_pages += pages;
        bytes_transferred += size;
        qemu_update_position(f, size);
    }
}
static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}

uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}
uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}
static void migration_end(void)
{
    if (migration_bitmap) {
        memory_global_dirty_log_stop();
        g_free(migration_bitmap);
        migration_bitmap = NULL;
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}
static void ram_migration_cancel(void *opaque)
{
    migration_end();
}

static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */


/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    mig_throttle_on = false;
    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* iothread lock needed for ram_list.dirty_memory[] */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    migration_bitmap = bitmap_new(ram_bitmap_pages);
    bitmap_set(migration_bitmap, 0, ram_bitmap_pages);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int pages_sent = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            break;
        }
        pages_sent += pages;
        acct_info.iterations++;
        check_guest_throttling();
        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                        t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return pages_sent;
}
/* Called with iothread lock */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    migration_bitmap_sync();

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, true, &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    migration_end();

    rcu_read_unlock();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }
    return remaining_size;
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
static inline void *host_from_stream_offset(QEMUFile *f,
                                            ram_addr_t offset,
                                            int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block || block->max_length <= offset) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }

        return memory_region_get_ram_ptr(block->mr) + offset;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (!strncmp(id, block->idstr, sizeof(id)) &&
            block->max_length > offset) {
            return memory_region_get_ram_ptr(block->mr) + offset;
        }
    }

    error_report("Can't find block %s!", id);
    return NULL;
}
/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
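
/* Illustrative usage, not from the original source: the receive path reads
 * the single fill byte that save_zero_page() wrote and only touches the
 * destination when it is not already zero, so untouched (still-zero) guest
 * pages stay unfaulted on the destination:
 *
 *     uint8_t ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 */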
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;

    while (!quit_decomp_thread) {
        qemu_mutex_lock(&param->mutex);
        while (!param->start && !quit_decomp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        pagesize = TARGET_PAGE_SIZE;
        if (!quit_decomp_thread) {
            /* uncompress() can fail in some cases, especially when the
             * page is dirtied while being compressed. That's not a
             * problem, because the dirty page will be retransferred and
             * uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)param->des, &pagesize,
                       (const Bytef *)param->compbuf, param->len);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);
    }

    return NULL;
}
void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
    quit_decomp_thread = false;
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    quit_decomp_thread = true;
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    g_free(compressed_data_buf);
    decompress_threads = NULL;
    decomp_param = NULL;
    compressed_data_buf = NULL;
}
static void decompress_data_with_multi_threads(uint8_t *compbuf,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (!decomp_param[idx].start) {
                memcpy(decomp_param[idx].compbuf, compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                start_decompression(&decomp_param[idx]);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        }
    }
}
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();
    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
                    if (!strncmp(id, block->idstr, sizeof(id))) {
                        if (length != block->used_length) {
                            Error *local_err = NULL;

                            ret = qemu_ram_resize(block->offset, length, &local_err);
                            if (local_err) {
                                error_report_err(local_err);
                            }
                        }
                        break;
                    }
                }

                if (!block) {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            qemu_get_buffer(f, compressed_data_buf, len);
            decompress_data_with_multi_threads(compressed_data_buf, host, len);
            break;
        case RAM_SAVE_FLAG_XBZRLE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, flags);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    rcu_read_unlock();
    DPRINTF("Completed load of VM with exit code %d seq iteration "
            "%" PRIu64 "\n", ret, seq_iter);
    return ret;
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cancel = ram_migration_cancel,
};
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}
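
/* Illustrative call order - a schematic of how the migration core drives
 * the handlers above, not the real caller:
 *
 *     ram_save_setup(f, NULL);                    // once, writes block list
 *     while (ram_save_pending(f, NULL, max) > max) {
 *         ram_save_iterate(f, NULL);              // dirty pages, rate limited
 *     }
 *     ram_save_complete(f, NULL);                 // final pass, VM stopped
 *
 * On the destination, ram_load() consumes the stream until it sees a
 * RAM_SAVE_FLAG_EOS marker for the final section. */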
/* Stub function that gets run on the vcpu when it's brought out of the
   VM to run inside qemu via async_run_on_cpu() */
static void mig_sleep_cpu(void *opq)
{
    qemu_mutex_unlock_iothread();
    g_usleep(30*1000);
    qemu_mutex_lock_iothread();
}
/* To reduce the dirty rate explicitly disallow the VCPUs from spending
   much time in the VM.  The migration thread will try to catch up.
   Workload will experience a performance drop.
*/
static void mig_throttle_guest_down(void)
{
    CPUState *cpu;

    qemu_mutex_lock_iothread();
    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
    }
    qemu_mutex_unlock_iothread();
}
static void check_guest_throttling(void)
{
    static int64_t t0;
    int64_t        t1;

    if (!mig_throttle_on) {
        return;
    }

    if (!t0)  {
        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        return;
    }

    t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    /* If it has been more than 40 ms since the last time the guest
     * was throttled then do it again.
     */
    if (40 < (t1-t0)/1000000) {
        mig_throttle_guest_down();
        t0 = t1;
    }
}