ram: Move dup_pages into RAMState
migration/ram.c  (qemu/kevin.git)
blob cdd56b7c3394407880142ffa6a0cb3c136cbc6b1
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
48 /***********************************************************/
49 /* ram save/restore */
51 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52 #define RAM_SAVE_FLAG_COMPRESS 0x02
53 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
54 #define RAM_SAVE_FLAG_PAGE 0x08
55 #define RAM_SAVE_FLAG_EOS 0x10
56 #define RAM_SAVE_FLAG_CONTINUE 0x20
57 #define RAM_SAVE_FLAG_XBZRLE 0x40
58 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
59 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
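
/*
 * A minimal sketch (not part of ram.c itself) of how the flags above travel
 * on the wire: save_page_header() below ORs them into the low bits of the
 * target-page-aligned page offset and emits a single be64 word, and the
 * receiver splits them apart again.  The example_* helpers are hypothetical
 * names used only for illustration.
 */
static inline uint64_t example_pack_offset(ram_addr_t offset, uint64_t flags)
{
    /* the low bits of a page-aligned offset are free, so flags fit there */
    return (offset & TARGET_PAGE_MASK) | flags;
}

static inline void example_unpack_offset(uint64_t wire, ram_addr_t *offset,
                                         uint64_t *flags)
{
    *flags  = wire & ~TARGET_PAGE_MASK;
    *offset = wire & TARGET_PAGE_MASK;
}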
61 static uint8_t *ZERO_TARGET_PAGE;
63 static inline bool is_zero_range(uint8_t *p, uint64_t size)
65 return buffer_is_zero(p, size);
68 /* This struct contains the XBZRLE cache and a static page
69 used by the compression */
70 static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78 } XBZRLE;
80 /* buffer used for XBZRLE decoding */
81 static uint8_t *xbzrle_decoded_buf;
83 static void XBZRLE_cache_lock(void)
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
89 static void XBZRLE_cache_unlock(void)
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
95 /**
96 * xbzrle_cache_resize: resize the xbzrle cache
98 * This function is called from qmp_migrate_set_cache_size in the main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock().
103 * Returns the new_size or negative in case of error.
105 * @new_size: new cache size
107 int64_t xbzrle_cache_resize(int64_t new_size)
109 PageCache *new_cache;
110 int64_t ret;
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
116 XBZRLE_cache_lock();
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
134 out_new_size:
135 ret = pow2floor(new_size);
136 out:
137 XBZRLE_cache_unlock();
138 return ret;
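
/*
 * Usage sketch, assuming a caller similar to qmp_migrate_set_cache_size():
 * the returned value is the size actually used (rounded down to a power of
 * two), or negative on failure.  example_resize_cache() is a hypothetical
 * wrapper, not a real QEMU function.
 */
static int64_t example_resize_cache(int64_t requested_bytes)
{
    int64_t actual = xbzrle_cache_resize(requested_bytes);

    if (actual < 0) {
        error_report("XBZRLE cache resize to %" PRId64 " bytes failed",
                     requested_bytes);
    }
    return actual;   /* e.g. a request of 5 MiB ends up as 4 MiB */
}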
141 /* State of RAM for migration */
142 struct RAMState {
143 /* Last block that we have visited searching for dirty pages */
144 RAMBlock *last_seen_block;
145 /* Last block from where we have sent data */
146 RAMBlock *last_sent_block;
147 /* Last offset we have sent data from */
148 ram_addr_t last_offset;
149 /* last ram version we have seen */
150 uint32_t last_version;
151 /* We are in the first round */
152 bool ram_bulk_stage;
153 /* How many times we have dirty too many pages */
154 int dirty_rate_high_cnt;
155 /* How many times we have synchronized the bitmap */
156 uint64_t bitmap_sync_count;
157 /* these variables are used for bitmap sync */
158 /* last time we did a full bitmap_sync */
159 int64_t time_last_bitmap_sync;
160 /* bytes transferred at start_time */
161 uint64_t bytes_xfer_prev;
162 /* number of dirty pages since start_time */
163 uint64_t num_dirty_pages_period;
164 /* xbzrle misses since the beginning of the period */
165 uint64_t xbzrle_cache_miss_prev;
166 /* number of iterations at the beginning of period */
167 uint64_t iterations_prev;
168 /* Accounting fields */
169 /* number of zero pages. It used to be pages filled by the same char. */
170 uint64_t zero_pages;
172 typedef struct RAMState RAMState;
174 static RAMState ram_state;
176 /* accounting for migration statistics */
177 typedef struct AccountingInfo {
178 uint64_t skipped_pages;
179 uint64_t norm_pages;
180 uint64_t iterations;
181 uint64_t xbzrle_bytes;
182 uint64_t xbzrle_pages;
183 uint64_t xbzrle_cache_miss;
184 double xbzrle_cache_miss_rate;
185 uint64_t xbzrle_overflows;
186 } AccountingInfo;
188 static AccountingInfo acct_info;
190 static void acct_clear(void)
192 memset(&acct_info, 0, sizeof(acct_info));
195 uint64_t dup_mig_bytes_transferred(void)
197 return ram_state.zero_pages * TARGET_PAGE_SIZE;
200 uint64_t dup_mig_pages_transferred(void)
202 return ram_state.zero_pages;
205 uint64_t skipped_mig_bytes_transferred(void)
207 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
210 uint64_t skipped_mig_pages_transferred(void)
212 return acct_info.skipped_pages;
215 uint64_t norm_mig_bytes_transferred(void)
217 return acct_info.norm_pages * TARGET_PAGE_SIZE;
220 uint64_t norm_mig_pages_transferred(void)
222 return acct_info.norm_pages;
225 uint64_t xbzrle_mig_bytes_transferred(void)
227 return acct_info.xbzrle_bytes;
230 uint64_t xbzrle_mig_pages_transferred(void)
232 return acct_info.xbzrle_pages;
235 uint64_t xbzrle_mig_pages_cache_miss(void)
237 return acct_info.xbzrle_cache_miss;
240 double xbzrle_mig_cache_miss_rate(void)
242 return acct_info.xbzrle_cache_miss_rate;
245 uint64_t xbzrle_mig_pages_overflow(void)
247 return acct_info.xbzrle_overflows;
250 static QemuMutex migration_bitmap_mutex;
251 static uint64_t migration_dirty_pages;
253 /* used by the search for pages to send */
254 struct PageSearchStatus {
255 /* Current block being searched */
256 RAMBlock *block;
257 /* Current offset to search from */
258 ram_addr_t offset;
259 /* Set once we wrap around */
260 bool complete_round;
262 typedef struct PageSearchStatus PageSearchStatus;
264 static struct BitmapRcu {
265 struct rcu_head rcu;
266 /* Main migration bitmap */
267 unsigned long *bmap;
268 /* bitmap of pages that haven't been sent even once
269 * only maintained and used in postcopy at the moment
270 * where it's used to send the dirtymap at the start
271 * of the postcopy phase
273 unsigned long *unsentmap;
274 } *migration_bitmap_rcu;
276 struct CompressParam {
277 bool done;
278 bool quit;
279 QEMUFile *file;
280 QemuMutex mutex;
281 QemuCond cond;
282 RAMBlock *block;
283 ram_addr_t offset;
285 typedef struct CompressParam CompressParam;
287 struct DecompressParam {
288 bool done;
289 bool quit;
290 QemuMutex mutex;
291 QemuCond cond;
292 void *des;
293 uint8_t *compbuf;
294 int len;
296 typedef struct DecompressParam DecompressParam;
298 static CompressParam *comp_param;
299 static QemuThread *compress_threads;
300 /* comp_done_cond is used to wake up the migration thread when
301 * one of the compression threads has finished the compression.
302 * comp_done_lock is used together with comp_done_cond.
304 static QemuMutex comp_done_lock;
305 static QemuCond comp_done_cond;
306 /* The empty QEMUFileOps will be used by file in CompressParam */
307 static const QEMUFileOps empty_ops = { };
309 static bool compression_switch;
310 static DecompressParam *decomp_param;
311 static QemuThread *decompress_threads;
312 static QemuMutex decomp_done_lock;
313 static QemuCond decomp_done_cond;
315 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
316 ram_addr_t offset);
318 static void *do_data_compress(void *opaque)
320 CompressParam *param = opaque;
321 RAMBlock *block;
322 ram_addr_t offset;
324 qemu_mutex_lock(&param->mutex);
325 while (!param->quit) {
326 if (param->block) {
327 block = param->block;
328 offset = param->offset;
329 param->block = NULL;
330 qemu_mutex_unlock(&param->mutex);
332 do_compress_ram_page(param->file, block, offset);
334 qemu_mutex_lock(&comp_done_lock);
335 param->done = true;
336 qemu_cond_signal(&comp_done_cond);
337 qemu_mutex_unlock(&comp_done_lock);
339 qemu_mutex_lock(&param->mutex);
340 } else {
341 qemu_cond_wait(&param->cond, &param->mutex);
344 qemu_mutex_unlock(&param->mutex);
346 return NULL;
349 static inline void terminate_compression_threads(void)
351 int idx, thread_count;
353 thread_count = migrate_compress_threads();
355 for (idx = 0; idx < thread_count; idx++) {
356 qemu_mutex_lock(&comp_param[idx].mutex);
357 comp_param[idx].quit = true;
358 qemu_cond_signal(&comp_param[idx].cond);
359 qemu_mutex_unlock(&comp_param[idx].mutex);
363 void migrate_compress_threads_join(void)
365 int i, thread_count;
367 if (!migrate_use_compression()) {
368 return;
370 terminate_compression_threads();
371 thread_count = migrate_compress_threads();
372 for (i = 0; i < thread_count; i++) {
373 qemu_thread_join(compress_threads + i);
374 qemu_fclose(comp_param[i].file);
375 qemu_mutex_destroy(&comp_param[i].mutex);
376 qemu_cond_destroy(&comp_param[i].cond);
378 qemu_mutex_destroy(&comp_done_lock);
379 qemu_cond_destroy(&comp_done_cond);
380 g_free(compress_threads);
381 g_free(comp_param);
382 compress_threads = NULL;
383 comp_param = NULL;
386 void migrate_compress_threads_create(void)
388 int i, thread_count;
390 if (!migrate_use_compression()) {
391 return;
393 compression_switch = true;
394 thread_count = migrate_compress_threads();
395 compress_threads = g_new0(QemuThread, thread_count);
396 comp_param = g_new0(CompressParam, thread_count);
397 qemu_cond_init(&comp_done_cond);
398 qemu_mutex_init(&comp_done_lock);
399 for (i = 0; i < thread_count; i++) {
400 /* comp_param[i].file is just used as a dummy buffer to save data,
401 * set its ops to empty.
403 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
404 comp_param[i].done = true;
405 comp_param[i].quit = false;
406 qemu_mutex_init(&comp_param[i].mutex);
407 qemu_cond_init(&comp_param[i].cond);
408 qemu_thread_create(compress_threads + i, "compress",
409 do_data_compress, comp_param + i,
410 QEMU_THREAD_JOINABLE);
415 * save_page_header: write page header to wire
417 * If this is the first page sent from its block, it also writes the block identification
419 * Returns the number of bytes written
421 * @f: QEMUFile where to send the data
422 * @block: block that contains the page we want to send
423 * @offset: offset inside the block for the page
424 * (the lower bits contain the RAM_SAVE_FLAG_* flags)
426 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
428 size_t size, len;
430 qemu_put_be64(f, offset);
431 size = 8;
433 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
434 len = strlen(block->idstr);
435 qemu_put_byte(f, len);
436 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
437 size += 1 + len;
439 return size;
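
/*
 * Sketch of the matching reader (hypothetical and simplified): first the
 * be64 offset-plus-flags word, then, when RAM_SAVE_FLAG_CONTINUE is absent,
 * a one-byte length followed by the RAMBlock idstr.
 */
static void example_read_page_header(QEMUFile *f, uint64_t *addr,
                                     char *idstr)
{
    *addr = qemu_get_be64(f);

    if (!(*addr & RAM_SAVE_FLAG_CONTINUE)) {
        size_t len = qemu_get_byte(f);

        /* idstr is assumed to hold at least 256 bytes */
        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';
    }
}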
443 * mig_throttle_guest_down: throttle down the guest
445 * Reduce amount of guest cpu execution to hopefully slow down memory
446 * writes. If guest dirty memory rate is reduced below the rate at
447 * which we can transfer pages to the destination then we should be
448 * able to complete migration. Some workloads dirty memory way too
449 * fast and will not effectively converge, even with auto-converge.
451 static void mig_throttle_guest_down(void)
453 MigrationState *s = migrate_get_current();
454 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
455 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
457 /* We have not started throttling yet. Let's start it. */
458 if (!cpu_throttle_active()) {
459 cpu_throttle_set(pct_initial);
460 } else {
461 /* Throttling already on, just increase the rate */
462 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
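
/*
 * Worked example, assuming the usual defaults rather than values taken from
 * this file: with cpu_throttle_initial = 20 and cpu_throttle_increment = 10,
 * successive calls throttle the vCPUs at 20%, 30%, 40%, ... until the dirty
 * rate drops below the transfer rate or migration completes.  A hypothetical
 * stand-alone model of that progression:
 */
static int example_throttle_pct(int ncalls)
{
    /* assumes initial = 20, increment = 10; example_throttle_pct(3) == 40 */
    return ncalls <= 0 ? 0 : 20 + (ncalls - 1) * 10;
}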
467 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
469 * @rs: current RAM state
470 * @current_addr: address for the zero page
472 * Update the xbzrle cache to reflect a page that's been sent as all 0.
473 * The important thing is that a stale (not-yet-0'd) page be replaced
474 * by the new data.
475 * As a bonus, if the page wasn't in the cache it gets added so that
476 * when a small write is made into the 0'd page it gets XBZRLE sent.
478 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
480 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
481 return;
484 /* We don't care if this fails to allocate a new cache page
485 * as long as it updated an old one */
486 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
487 rs->bitmap_sync_count);
490 #define ENCODING_FLAG_XBZRLE 0x1
493 * save_xbzrle_page: compress and send current page
495 * Returns: 1 means that we wrote the page
496 * 0 means that page is identical to the one already sent
497 * -1 means that xbzrle would be longer than normal
499 * @rs: current RAM state
500 * @f: QEMUFile where to send the data
501 * @current_data: pointer to the address of the page contents
502 * @current_addr: addr of the page
503 * @block: block that contains the page we want to send
504 * @offset: offset inside the block for the page
505 * @last_stage: if we are at the completion stage
506 * @bytes_transferred: increase it with the number of transferred bytes
508 static int save_xbzrle_page(RAMState *rs, QEMUFile *f, uint8_t **current_data,
509 ram_addr_t current_addr, RAMBlock *block,
510 ram_addr_t offset, bool last_stage,
511 uint64_t *bytes_transferred)
513 int encoded_len = 0, bytes_xbzrle;
514 uint8_t *prev_cached_page;
516 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
517 acct_info.xbzrle_cache_miss++;
518 if (!last_stage) {
519 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
520 rs->bitmap_sync_count) == -1) {
521 return -1;
522 } else {
523 /* update *current_data when the page has been
524 inserted into cache */
525 *current_data = get_cached_data(XBZRLE.cache, current_addr);
528 return -1;
531 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
533 /* save current buffer into memory */
534 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
536 /* XBZRLE encoding (if there is no overflow) */
537 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
538 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
539 TARGET_PAGE_SIZE);
540 if (encoded_len == 0) {
541 trace_save_xbzrle_page_skipping();
542 return 0;
543 } else if (encoded_len == -1) {
544 trace_save_xbzrle_page_overflow();
545 acct_info.xbzrle_overflows++;
546 /* update data in the cache */
547 if (!last_stage) {
548 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
549 *current_data = prev_cached_page;
551 return -1;
554 /* we need to update the data in the cache, in order to get the same data */
555 if (!last_stage) {
556 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
559 /* Send XBZRLE based compressed page */
560 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
561 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
562 qemu_put_be16(f, encoded_len);
563 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
564 bytes_xbzrle += encoded_len + 1 + 2;
565 acct_info.xbzrle_pages++;
566 acct_info.xbzrle_bytes += bytes_xbzrle;
567 *bytes_transferred += bytes_xbzrle;
569 return 1;
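
/*
 * Sketch of what save_xbzrle_page() puts on the wire (the reader below is
 * hypothetical and simplified): the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, one ENCODING_FLAG_XBZRLE byte, a be16 encoded
 * length, and then encoded_len bytes of XBZRLE data, which is where the
 * "encoded_len + 1 + 2" accounting above comes from.
 */
static int example_read_xbzrle_payload(QEMUFile *f, uint8_t *buf,
                                       size_t bufsize)
{
    size_t encoded_len;

    if (qemu_get_byte(f) != ENCODING_FLAG_XBZRLE) {
        return -1;                     /* unexpected sub-encoding */
    }
    encoded_len = qemu_get_be16(f);
    if (encoded_len > bufsize) {
        return -1;
    }
    qemu_get_buffer(f, buf, encoded_len);
    return encoded_len;
}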
573 * migration_bitmap_find_dirty: find the next dirty page from start
575 * Called with rcu_read_lock() to protect migration_bitmap
577 * Returns the byte offset within memory region of the start of a dirty page
579 * @rs: current RAM state
580 * @rb: RAMBlock where to search for dirty pages
581 * @start: starting address (typically so we can continue from previous page)
582 * @ram_addr_abs: pointer into which to store the address of the dirty page
583 * within the global ram_addr space
585 static inline
586 ram_addr_t migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
587 ram_addr_t start,
588 ram_addr_t *ram_addr_abs)
590 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
591 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
592 uint64_t rb_size = rb->used_length;
593 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
594 unsigned long *bitmap;
596 unsigned long next;
598 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
599 if (rs->ram_bulk_stage && nr > base) {
600 next = nr + 1;
601 } else {
602 next = find_next_bit(bitmap, size, nr);
605 *ram_addr_abs = next << TARGET_PAGE_BITS;
606 return (next - base) << TARGET_PAGE_BITS;
609 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
611 bool ret;
612 int nr = addr >> TARGET_PAGE_BITS;
613 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
615 ret = test_and_clear_bit(nr, bitmap);
617 if (ret) {
618 migration_dirty_pages--;
620 return ret;
623 static void migration_bitmap_sync_range(RAMState *rs, ram_addr_t start,
624 ram_addr_t length)
626 unsigned long *bitmap;
627 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
628 migration_dirty_pages += cpu_physical_memory_sync_dirty_bitmap(bitmap,
629 start, length, &rs->num_dirty_pages_period);
632 static void migration_bitmap_sync_init(RAMState *rs)
634 rs->time_last_bitmap_sync = 0;
635 rs->bytes_xfer_prev = 0;
636 rs->num_dirty_pages_period = 0;
637 rs->xbzrle_cache_miss_prev = 0;
638 rs->iterations_prev = 0;
642 * ram_pagesize_summary: calculate all the pagesizes of a VM
644 * Returns a summary bitmap of the page sizes of all RAMBlocks
646 * For VMs with just normal pages this is equivalent to the host page
647 * size. If it's got some huge pages then it's the OR of all the
648 * different page sizes.
650 uint64_t ram_pagesize_summary(void)
652 RAMBlock *block;
653 uint64_t summary = 0;
655 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
656 summary |= block->page_size;
659 return summary;
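
/*
 * Worked example: a guest whose RAMBlocks all use 4 KiB pages except for one
 * block backed by 2 MiB huge pages yields
 *     summary = 0x1000 | 0x200000 = 0x201000
 * so each distinct page size (always a power of two) shows up as its own bit.
 */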
662 static void migration_bitmap_sync(RAMState *rs)
664 RAMBlock *block;
665 MigrationState *s = migrate_get_current();
666 int64_t end_time;
667 uint64_t bytes_xfer_now;
669 rs->bitmap_sync_count++;
671 if (!rs->bytes_xfer_prev) {
672 rs->bytes_xfer_prev = ram_bytes_transferred();
675 if (!rs->time_last_bitmap_sync) {
676 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
679 trace_migration_bitmap_sync_start();
680 memory_global_dirty_log_sync();
682 qemu_mutex_lock(&migration_bitmap_mutex);
683 rcu_read_lock();
684 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
685 migration_bitmap_sync_range(rs, block->offset, block->used_length);
687 rcu_read_unlock();
688 qemu_mutex_unlock(&migration_bitmap_mutex);
690 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
692 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
694 /* more than 1 second = 1000 milliseconds */
695 if (end_time > rs->time_last_bitmap_sync + 1000) {
696 if (migrate_auto_converge()) {
697 /* The following detection logic can be refined later. For now:
698 Check to see if the dirtied bytes are 50% more than the approx.
699 amount of bytes that just got transferred since the last time we
700 were in this routine. If that happens twice, start or increase
701 throttling */
702 bytes_xfer_now = ram_bytes_transferred();
704 if (s->dirty_pages_rate &&
705 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
706 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
707 (rs->dirty_rate_high_cnt++ >= 2)) {
708 trace_migration_throttle();
709 rs->dirty_rate_high_cnt = 0;
710 mig_throttle_guest_down();
712 rs->bytes_xfer_prev = bytes_xfer_now;
715 if (migrate_use_xbzrle()) {
716 if (rs->iterations_prev != acct_info.iterations) {
717 acct_info.xbzrle_cache_miss_rate =
718 (double)(acct_info.xbzrle_cache_miss -
719 rs->xbzrle_cache_miss_prev) /
720 (acct_info.iterations - rs->iterations_prev);
722 rs->iterations_prev = acct_info.iterations;
723 rs->xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
725 s->dirty_pages_rate = rs->num_dirty_pages_period * 1000
726 / (end_time - rs->time_last_bitmap_sync);
727 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
728 rs->time_last_bitmap_sync = end_time;
729 rs->num_dirty_pages_period = 0;
731 s->dirty_sync_count = rs->bitmap_sync_count;
732 if (migrate_use_events()) {
733 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
738 * save_zero_page: send the zero page to the stream
740 * Returns the number of pages written.
742 * @rs: current RAM state
743 * @f: QEMUFile where to send the data
744 * @block: block that contains the page we want to send
745 * @offset: offset inside the block for the page
746 * @p: pointer to the page
747 * @bytes_transferred: increase it with the number of transferred bytes
749 static int save_zero_page(RAMState *rs, QEMUFile *f, RAMBlock *block,
750 ram_addr_t offset,
751 uint8_t *p, uint64_t *bytes_transferred)
753 int pages = -1;
755 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
756 rs->zero_pages++;
757 *bytes_transferred += save_page_header(f, block,
758 offset | RAM_SAVE_FLAG_COMPRESS);
759 qemu_put_byte(f, 0);
760 *bytes_transferred += 1;
761 pages = 1;
764 return pages;
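
/*
 * On the wire a zero page is therefore just the page header with
 * RAM_SAVE_FLAG_COMPRESS set plus a single fill byte (0).  A hypothetical,
 * much simplified receiver could reconstruct the page like this:
 */
static void example_load_zero_page(QEMUFile *f, uint8_t *host_page)
{
    uint8_t fill = qemu_get_byte(f);

    memset(host_page, fill, TARGET_PAGE_SIZE);
}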
767 static void ram_release_pages(MigrationState *ms, const char *rbname,
768 uint64_t offset, int pages)
770 if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
771 return;
774 ram_discard_range(NULL, rbname, offset, pages << TARGET_PAGE_BITS);
778 * ram_save_page: send the given page to the stream
780 * Returns the number of pages written.
781 * < 0 - error
782 * >=0 - Number of pages written - this might legally be 0
783 * if xbzrle noticed the page was the same.
785 * @rs: current RAM state
786 * @ms: current migration state
787 * @f: QEMUFile where to send the data
788 * @block: block that contains the page we want to send
789 * @offset: offset inside the block for the page
790 * @last_stage: if we are at the completion stage
791 * @bytes_transferred: increase it with the number of transferred bytes
793 static int ram_save_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
794 PageSearchStatus *pss, bool last_stage,
795 uint64_t *bytes_transferred)
797 int pages = -1;
798 uint64_t bytes_xmit;
799 ram_addr_t current_addr;
800 uint8_t *p;
801 int ret;
802 bool send_async = true;
803 RAMBlock *block = pss->block;
804 ram_addr_t offset = pss->offset;
806 p = block->host + offset;
808 /* When in doubt, send the page as a normal page */
809 bytes_xmit = 0;
810 ret = ram_control_save_page(f, block->offset,
811 offset, TARGET_PAGE_SIZE, &bytes_xmit);
812 if (bytes_xmit) {
813 *bytes_transferred += bytes_xmit;
814 pages = 1;
817 XBZRLE_cache_lock();
819 current_addr = block->offset + offset;
821 if (block == rs->last_sent_block) {
822 offset |= RAM_SAVE_FLAG_CONTINUE;
824 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
825 if (ret != RAM_SAVE_CONTROL_DELAYED) {
826 if (bytes_xmit > 0) {
827 acct_info.norm_pages++;
828 } else if (bytes_xmit == 0) {
829 rs->zero_pages++;
832 } else {
833 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
834 if (pages > 0) {
835 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
836 * page would be stale
838 xbzrle_cache_zero_page(rs, current_addr);
839 ram_release_pages(ms, block->idstr, pss->offset, pages);
840 } else if (!rs->ram_bulk_stage &&
841 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
842 pages = save_xbzrle_page(rs, f, &p, current_addr, block,
843 offset, last_stage, bytes_transferred);
844 if (!last_stage) {
845 /* Can't send this cached data async, since the cache page
846 * might get updated before it gets to the wire
848 send_async = false;
853 /* XBZRLE overflow or normal page */
854 if (pages == -1) {
855 *bytes_transferred += save_page_header(f, block,
856 offset | RAM_SAVE_FLAG_PAGE);
857 if (send_async) {
858 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
859 migrate_release_ram() &
860 migration_in_postcopy(ms));
861 } else {
862 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
864 *bytes_transferred += TARGET_PAGE_SIZE;
865 pages = 1;
866 acct_info.norm_pages++;
869 XBZRLE_cache_unlock();
871 return pages;
874 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
875 ram_addr_t offset)
877 int bytes_sent, blen;
878 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
880 bytes_sent = save_page_header(f, block, offset |
881 RAM_SAVE_FLAG_COMPRESS_PAGE);
882 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
883 migrate_compress_level());
884 if (blen < 0) {
885 bytes_sent = 0;
886 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
887 error_report("compressed data failed!");
888 } else {
889 bytes_sent += blen;
890 ram_release_pages(migrate_get_current(), block->idstr,
891 offset & TARGET_PAGE_MASK, 1);
894 return bytes_sent;
897 static uint64_t bytes_transferred;
899 static void flush_compressed_data(QEMUFile *f)
901 int idx, len, thread_count;
903 if (!migrate_use_compression()) {
904 return;
906 thread_count = migrate_compress_threads();
908 qemu_mutex_lock(&comp_done_lock);
909 for (idx = 0; idx < thread_count; idx++) {
910 while (!comp_param[idx].done) {
911 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
914 qemu_mutex_unlock(&comp_done_lock);
916 for (idx = 0; idx < thread_count; idx++) {
917 qemu_mutex_lock(&comp_param[idx].mutex);
918 if (!comp_param[idx].quit) {
919 len = qemu_put_qemu_file(f, comp_param[idx].file);
920 bytes_transferred += len;
922 qemu_mutex_unlock(&comp_param[idx].mutex);
926 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
927 ram_addr_t offset)
929 param->block = block;
930 param->offset = offset;
933 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
934 ram_addr_t offset,
935 uint64_t *bytes_transferred)
937 int idx, thread_count, bytes_xmit = -1, pages = -1;
939 thread_count = migrate_compress_threads();
940 qemu_mutex_lock(&comp_done_lock);
941 while (true) {
942 for (idx = 0; idx < thread_count; idx++) {
943 if (comp_param[idx].done) {
944 comp_param[idx].done = false;
945 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
946 qemu_mutex_lock(&comp_param[idx].mutex);
947 set_compress_params(&comp_param[idx], block, offset);
948 qemu_cond_signal(&comp_param[idx].cond);
949 qemu_mutex_unlock(&comp_param[idx].mutex);
950 pages = 1;
951 acct_info.norm_pages++;
952 *bytes_transferred += bytes_xmit;
953 break;
956 if (pages > 0) {
957 break;
958 } else {
959 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
962 qemu_mutex_unlock(&comp_done_lock);
964 return pages;
968 * ram_save_compressed_page: compress the given page and send it to the stream
970 * Returns the number of pages written.
972 * @rs: current RAM state
973 * @ms: current migration state
974 * @f: QEMUFile where to send the data
975 * @block: block that contains the page we want to send
976 * @offset: offset inside the block for the page
977 * @last_stage: if we are at the completion stage
978 * @bytes_transferred: increase it with the number of transferred bytes
980 static int ram_save_compressed_page(RAMState *rs, MigrationState *ms,
981 QEMUFile *f,
982 PageSearchStatus *pss, bool last_stage,
983 uint64_t *bytes_transferred)
985 int pages = -1;
986 uint64_t bytes_xmit = 0;
987 uint8_t *p;
988 int ret, blen;
989 RAMBlock *block = pss->block;
990 ram_addr_t offset = pss->offset;
992 p = block->host + offset;
994 ret = ram_control_save_page(f, block->offset,
995 offset, TARGET_PAGE_SIZE, &bytes_xmit);
996 if (bytes_xmit) {
997 *bytes_transferred += bytes_xmit;
998 pages = 1;
1000 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1001 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1002 if (bytes_xmit > 0) {
1003 acct_info.norm_pages++;
1004 } else if (bytes_xmit == 0) {
1005 rs->zero_pages++;
1008 } else {
1009 /* When starting the process of a new block, the first page of
1010 * the block should be sent out before other pages in the same
1011 * block, and all the pages in last block should have been sent
1012 * out, keeping this order is important, because the 'cont' flag
1013 * is used to avoid resending the block name.
1015 if (block != rs->last_sent_block) {
1016 flush_compressed_data(f);
1017 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
1018 if (pages == -1) {
1019 /* Make sure the first page is sent out before other pages */
1020 bytes_xmit = save_page_header(f, block, offset |
1021 RAM_SAVE_FLAG_COMPRESS_PAGE);
1022 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1023 migrate_compress_level());
1024 if (blen > 0) {
1025 *bytes_transferred += bytes_xmit + blen;
1026 acct_info.norm_pages++;
1027 pages = 1;
1028 } else {
1029 qemu_file_set_error(f, blen);
1030 error_report("compressed data failed!");
1033 if (pages > 0) {
1034 ram_release_pages(ms, block->idstr, pss->offset, pages);
1036 } else {
1037 offset |= RAM_SAVE_FLAG_CONTINUE;
1038 pages = save_zero_page(rs, f, block, offset, p, bytes_transferred);
1039 if (pages == -1) {
1040 pages = compress_page_with_multi_thread(f, block, offset,
1041 bytes_transferred);
1042 } else {
1043 ram_release_pages(ms, block->idstr, pss->offset, pages);
1048 return pages;
1052 * find_dirty_block: find the next dirty page and update any state
1053 * associated with the search process.
1055 * Returns whether a page was found
1057 * @rs: current RAM state
1058 * @f: QEMUFile where to send the data
1059 * @pss: data about the state of the current dirty page scan
1060 * @again: set to false if the search has scanned the whole of RAM
1061 * @ram_addr_abs: pointer into which to store the address of the dirty page
1062 * within the global ram_addr space
1064 static bool find_dirty_block(RAMState *rs, QEMUFile *f, PageSearchStatus *pss,
1065 bool *again, ram_addr_t *ram_addr_abs)
1067 pss->offset = migration_bitmap_find_dirty(rs, pss->block, pss->offset,
1068 ram_addr_abs);
1069 if (pss->complete_round && pss->block == rs->last_seen_block &&
1070 pss->offset >= rs->last_offset) {
1072 * We've been once around the RAM and haven't found anything.
1073 * Give up.
1075 *again = false;
1076 return false;
1078 if (pss->offset >= pss->block->used_length) {
1079 /* Didn't find anything in this RAM Block */
1080 pss->offset = 0;
1081 pss->block = QLIST_NEXT_RCU(pss->block, next);
1082 if (!pss->block) {
1083 /* Hit the end of the list */
1084 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1085 /* Flag that we've looped */
1086 pss->complete_round = true;
1087 rs->ram_bulk_stage = false;
1088 if (migrate_use_xbzrle()) {
1089 /* If xbzrle is on, stop using the data compression at this
1090 * point. In theory, xbzrle can do better than compression.
1092 flush_compressed_data(f);
1093 compression_switch = false;
1096 /* Didn't find anything this time, but try again on the new block */
1097 *again = true;
1098 return false;
1099 } else {
1100 /* Can go around again, but... */
1101 *again = true;
1102 /* We've found something so probably don't need to */
1103 return true;
1108 * unqueue_page: gets a page off the queue
1110 * Helper for 'get_queued_page' - gets a page off the queue
1112 * Returns the block of the page (or NULL if none available)
1114 * @ms: current migration state
1115 * @offset: used to return the offset within the RAMBlock
1116 * @ram_addr_abs: pointer into which to store the address of the dirty page
1117 * within the global ram_addr space
1119 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1120 ram_addr_t *ram_addr_abs)
1122 RAMBlock *block = NULL;
1124 qemu_mutex_lock(&ms->src_page_req_mutex);
1125 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1126 struct MigrationSrcPageRequest *entry =
1127 QSIMPLEQ_FIRST(&ms->src_page_requests);
1128 block = entry->rb;
1129 *offset = entry->offset;
1130 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1131 TARGET_PAGE_MASK;
1133 if (entry->len > TARGET_PAGE_SIZE) {
1134 entry->len -= TARGET_PAGE_SIZE;
1135 entry->offset += TARGET_PAGE_SIZE;
1136 } else {
1137 memory_region_unref(block->mr);
1138 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1139 g_free(entry);
1142 qemu_mutex_unlock(&ms->src_page_req_mutex);
1144 return block;
1148 * get_queued_page: unqueue a page from the postocpy requests
1150 * Skips pages that are already sent (!dirty)
1152 * Returns whether a queued page was found
1154 * @rs: current RAM state
1155 * @ms: current migration state
1156 * @pss: data about the state of the current dirty page scan
1157 * @ram_addr_abs: pointer into which to store the address of the dirty page
1158 * within the global ram_addr space
1160 static bool get_queued_page(RAMState *rs, MigrationState *ms,
1161 PageSearchStatus *pss,
1162 ram_addr_t *ram_addr_abs)
1164 RAMBlock *block;
1165 ram_addr_t offset;
1166 bool dirty;
1168 do {
1169 block = unqueue_page(ms, &offset, ram_addr_abs);
1171 * We're sending this page, and since it's postcopy nothing else
1172 * will dirty it, and we must make sure it doesn't get sent again
1173 * even if this queue request was received after the background
1174 * search already sent it.
1176 if (block) {
1177 unsigned long *bitmap;
1178 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1179 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1180 if (!dirty) {
1181 trace_get_queued_page_not_dirty(
1182 block->idstr, (uint64_t)offset,
1183 (uint64_t)*ram_addr_abs,
1184 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1185 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1186 } else {
1187 trace_get_queued_page(block->idstr,
1188 (uint64_t)offset,
1189 (uint64_t)*ram_addr_abs);
1193 } while (block && !dirty);
1195 if (block) {
1197 * As soon as we start servicing pages out of order, then we have
1198 * to kill the bulk stage, since the bulk stage assumes
1199 * in (migration_bitmap_find_dirty) that every page is
1200 * dirty, that's no longer true.
1202 rs->ram_bulk_stage = false;
1205 * We want the background search to continue from the queued page
1206 * since the guest is likely to want other pages near to the page
1207 * it just requested.
1209 pss->block = block;
1210 pss->offset = offset;
1213 return !!block;
1217 * migration_page_queue_free: drop any remaining pages in the ram
1218 * request queue
1220 * It should be empty at the end anyway, but in error cases there may
1221 * be some left; in that case any remaining pages are dropped here.
1223 * @ms: current migration state
1225 void migration_page_queue_free(MigrationState *ms)
1227 struct MigrationSrcPageRequest *mspr, *next_mspr;
1228 /* This queue generally should be empty - but in the case of a failed
1229 * migration it may have some entries left behind.
1231 rcu_read_lock();
1232 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1233 memory_region_unref(mspr->rb->mr);
1234 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1235 g_free(mspr);
1237 rcu_read_unlock();
1241 * ram_save_queue_pages: queue the page for transmission
1243 * A request from postcopy destination for example.
1245 * Returns zero on success or negative on error
1247 * @ms: current migration state
1248 * @rbname: Name of the RAMBlock of the request. NULL means the
1249 * same as the last one.
1250 * @start: starting address from the start of the RAMBlock
1251 * @len: length (in bytes) to send
1253 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1254 ram_addr_t start, ram_addr_t len)
1256 RAMBlock *ramblock;
1258 ms->postcopy_requests++;
1259 rcu_read_lock();
1260 if (!rbname) {
1261 /* Reuse last RAMBlock */
1262 ramblock = ms->last_req_rb;
1264 if (!ramblock) {
1266 * Shouldn't happen, we can't reuse the last RAMBlock if
1267 * it's the 1st request.
1269 error_report("ram_save_queue_pages no previous block");
1270 goto err;
1272 } else {
1273 ramblock = qemu_ram_block_by_name(rbname);
1275 if (!ramblock) {
1276 /* We shouldn't be asked for a non-existent RAMBlock */
1277 error_report("ram_save_queue_pages no block '%s'", rbname);
1278 goto err;
1280 ms->last_req_rb = ramblock;
1282 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1283 if (start+len > ramblock->used_length) {
1284 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1285 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1286 __func__, start, len, ramblock->used_length);
1287 goto err;
1290 struct MigrationSrcPageRequest *new_entry =
1291 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1292 new_entry->rb = ramblock;
1293 new_entry->offset = start;
1294 new_entry->len = len;
1296 memory_region_ref(ramblock->mr);
1297 qemu_mutex_lock(&ms->src_page_req_mutex);
1298 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1299 qemu_mutex_unlock(&ms->src_page_req_mutex);
1300 rcu_read_unlock();
1302 return 0;
1304 err:
1305 rcu_read_unlock();
1306 return -1;
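
/*
 * Usage sketch: on the source side, a postcopy page request from the
 * destination ends up queued through this function.  The block name and
 * offset below are made up for illustration.
 */
static int example_queue_requested_page(MigrationState *ms)
{
    /* queue one target page at offset 0x200000 of the block "pc.ram" */
    return ram_save_queue_pages(ms, "pc.ram", 0x200000, TARGET_PAGE_SIZE);
}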
1310 * ram_save_target_page: save one target page
1312 * Returns the number of pages written
1314 * @rs: current RAM state
1315 * @ms: current migration state
1316 * @f: QEMUFile where to send the data
1317 * @pss: data about the page we want to send
1318 * @last_stage: if we are at the completion stage
1319 * @bytes_transferred: increase it with the number of transferred bytes
1320 * @dirty_ram_abs: address of the start of the dirty page in ram_addr_t space
1322 static int ram_save_target_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1323 PageSearchStatus *pss,
1324 bool last_stage,
1325 uint64_t *bytes_transferred,
1326 ram_addr_t dirty_ram_abs)
1328 int res = 0;
1330 /* Check whether the page is dirty and, if it is, send it */
1331 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1332 unsigned long *unsentmap;
1333 if (compression_switch && migrate_use_compression()) {
1334 res = ram_save_compressed_page(rs, ms, f, pss,
1335 last_stage,
1336 bytes_transferred);
1337 } else {
1338 res = ram_save_page(rs, ms, f, pss, last_stage,
1339 bytes_transferred);
1342 if (res < 0) {
1343 return res;
1345 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1346 if (unsentmap) {
1347 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1349 /* Only update last_sent_block if a block was actually sent; xbzrle
1350 * might have decided the page was identical so didn't bother writing
1351 * to the stream.
1353 if (res > 0) {
1354 rs->last_sent_block = pss->block;
1358 return res;
1362 * ram_save_host_page: save a whole host page
1364 * Starting at *offset send pages up to the end of the current host
1365 * page. It's valid for the initial offset to point into the middle of
1366 * a host page in which case the remainder of the hostpage is sent.
1367 * Only dirty target pages are sent. Note that the host page size may
1368 * be a huge page for this block.
1370 * Returns the number of pages written or negative on error
1372 * @rs: current RAM state
1373 * @ms: current migration state
1374 * @f: QEMUFile where to send the data
1375 * @pss: data about the page we want to send
1376 * @last_stage: if we are at the completion stage
1377 * @bytes_transferred: increase it with the number of transferred bytes
1378 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1380 static int ram_save_host_page(RAMState *rs, MigrationState *ms, QEMUFile *f,
1381 PageSearchStatus *pss,
1382 bool last_stage,
1383 uint64_t *bytes_transferred,
1384 ram_addr_t dirty_ram_abs)
1386 int tmppages, pages = 0;
1387 size_t pagesize = qemu_ram_pagesize(pss->block);
1389 do {
1390 tmppages = ram_save_target_page(rs, ms, f, pss, last_stage,
1391 bytes_transferred, dirty_ram_abs);
1392 if (tmppages < 0) {
1393 return tmppages;
1396 pages += tmppages;
1397 pss->offset += TARGET_PAGE_SIZE;
1398 dirty_ram_abs += TARGET_PAGE_SIZE;
1399 } while (pss->offset & (pagesize - 1));
1401 /* The offset we leave with is the last one we looked at */
1402 pss->offset -= TARGET_PAGE_SIZE;
1403 return pages;
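
/*
 * Worked example: for a RAMBlock backed by 2 MiB huge pages with 4 KiB
 * target pages, pagesize is 0x200000, so the loop above can call
 * ram_save_target_page() up to 512 times before pss->offset reaches the
 * next host-page boundary; for ordinary blocks (pagesize == TARGET_PAGE_SIZE)
 * it runs exactly once.
 */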
1407 * ram_find_and_save_block: finds a dirty page and sends it to f
1409 * Called within an RCU critical section.
1411 * Returns the number of pages written where zero means no dirty pages
1413 * @rs: current RAM state
1414 * @f: QEMUFile where to send the data
1415 * @last_stage: if we are at the completion stage
1416 * @bytes_transferred: increase it with the number of transferred bytes
1418 * On systems where host-page-size > target-page-size it will send all the
1419 * pages in a host page that are dirty.
1422 static int ram_find_and_save_block(RAMState *rs, QEMUFile *f, bool last_stage,
1423 uint64_t *bytes_transferred)
1425 PageSearchStatus pss;
1426 MigrationState *ms = migrate_get_current();
1427 int pages = 0;
1428 bool again, found;
1429 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1430 ram_addr_t space */
1432 /* No dirty page as there is zero RAM */
1433 if (!ram_bytes_total()) {
1434 return pages;
1437 pss.block = rs->last_seen_block;
1438 pss.offset = rs->last_offset;
1439 pss.complete_round = false;
1441 if (!pss.block) {
1442 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1445 do {
1446 again = true;
1447 found = get_queued_page(rs, ms, &pss, &dirty_ram_abs);
1449 if (!found) {
1450 /* priority queue empty, so just search for something dirty */
1451 found = find_dirty_block(rs, f, &pss, &again, &dirty_ram_abs);
1454 if (found) {
1455 pages = ram_save_host_page(rs, ms, f, &pss,
1456 last_stage, bytes_transferred,
1457 dirty_ram_abs);
1459 } while (!pages && again);
1461 rs->last_seen_block = pss.block;
1462 rs->last_offset = pss.offset;
1464 return pages;
1467 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1469 uint64_t pages = size / TARGET_PAGE_SIZE;
1470 RAMState *rs = &ram_state;
1472 if (zero) {
1473 rs->zero_pages += pages;
1474 } else {
1475 acct_info.norm_pages += pages;
1476 bytes_transferred += size;
1477 qemu_update_position(f, size);
1481 static ram_addr_t ram_save_remaining(void)
1483 return migration_dirty_pages;
1486 uint64_t ram_bytes_remaining(void)
1488 return ram_save_remaining() * TARGET_PAGE_SIZE;
1491 uint64_t ram_bytes_transferred(void)
1493 return bytes_transferred;
1496 uint64_t ram_bytes_total(void)
1498 RAMBlock *block;
1499 uint64_t total = 0;
1501 rcu_read_lock();
1502 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1503 total += block->used_length;
1504 rcu_read_unlock();
1505 return total;
1508 void free_xbzrle_decoded_buf(void)
1510 g_free(xbzrle_decoded_buf);
1511 xbzrle_decoded_buf = NULL;
1514 static void migration_bitmap_free(struct BitmapRcu *bmap)
1516 g_free(bmap->bmap);
1517 g_free(bmap->unsentmap);
1518 g_free(bmap);
1521 static void ram_migration_cleanup(void *opaque)
1523 /* the caller must hold the iothread lock or be in a bh, so there is
1524 * no write race against this migration_bitmap
1526 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1527 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1528 if (bitmap) {
1529 memory_global_dirty_log_stop();
1530 call_rcu(bitmap, migration_bitmap_free, rcu);
1533 XBZRLE_cache_lock();
1534 if (XBZRLE.cache) {
1535 cache_fini(XBZRLE.cache);
1536 g_free(XBZRLE.encoded_buf);
1537 g_free(XBZRLE.current_buf);
1538 g_free(ZERO_TARGET_PAGE);
1539 XBZRLE.cache = NULL;
1540 XBZRLE.encoded_buf = NULL;
1541 XBZRLE.current_buf = NULL;
1543 XBZRLE_cache_unlock();
1546 static void ram_state_reset(RAMState *rs)
1548 rs->last_seen_block = NULL;
1549 rs->last_sent_block = NULL;
1550 rs->last_offset = 0;
1551 rs->last_version = ram_list.version;
1552 rs->ram_bulk_stage = true;
1555 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1557 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1559 /* called in qemu main thread, so there is
1560 * no writing race against this migration_bitmap
1562 if (migration_bitmap_rcu) {
1563 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1564 bitmap = g_new(struct BitmapRcu, 1);
1565 bitmap->bmap = bitmap_new(new);
1567 /* prevent bits in the migration_bitmap from being set
1568 * by migration_bitmap_sync_range() at the same time.
1569 * It is safe for migration if bits in the migration_bitmap are only
1570 * cleared at the same time.
1572 qemu_mutex_lock(&migration_bitmap_mutex);
1573 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1574 bitmap_set(bitmap->bmap, old, new - old);
1576 /* We don't have a way to safely extend the unsentmap
1577 * with RCU, so mark it as missing; entry to postcopy
1578 * will then fail.
1580 bitmap->unsentmap = NULL;
1582 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1583 qemu_mutex_unlock(&migration_bitmap_mutex);
1584 migration_dirty_pages += new - old;
1585 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1590 * 'expected' is the value you expect the bitmap mostly to be full
1591 * of; it won't bother printing lines that are all this value.
1592 * If 'todump' is null the migration bitmap is dumped.
1594 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1596 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1598 int64_t cur;
1599 int64_t linelen = 128;
1600 char linebuf[129];
1602 if (!todump) {
1603 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1606 for (cur = 0; cur < ram_pages; cur += linelen) {
1607 int64_t curb;
1608 bool found = false;
1610 * Last line; catch the case where the line length
1611 * is longer than remaining ram
1613 if (cur + linelen > ram_pages) {
1614 linelen = ram_pages - cur;
1616 for (curb = 0; curb < linelen; curb++) {
1617 bool thisbit = test_bit(cur + curb, todump);
1618 linebuf[curb] = thisbit ? '1' : '.';
1619 found = found || (thisbit != expected);
1621 if (found) {
1622 linebuf[curb] = '\0';
1623 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1628 /* **** functions for postcopy ***** */
1630 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1632 struct RAMBlock *block;
1633 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1635 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1636 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1637 unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1638 unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1640 while (run_start < range) {
1641 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1642 ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1643 (run_end - run_start) << TARGET_PAGE_BITS);
1644 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1650 * postcopy_send_discard_bm_ram: discard a RAMBlock
1652 * Returns zero on success
1654 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1655 * Note: At this point the 'unsentmap' is the processed bitmap combined
1656 * with the dirtymap; so a '1' means it's either dirty or unsent.
1658 * @ms: current migration state
1659 * @pds: state for postcopy
1660 * @start: RAMBlock starting page
1661 * @length: RAMBlock size
1663 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1664 PostcopyDiscardState *pds,
1665 unsigned long start,
1666 unsigned long length)
1668 unsigned long end = start + length; /* one after the end */
1669 unsigned long current;
1670 unsigned long *unsentmap;
1672 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1673 for (current = start; current < end; ) {
1674 unsigned long one = find_next_bit(unsentmap, end, current);
1676 if (one <= end) {
1677 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1678 unsigned long discard_length;
1680 if (zero >= end) {
1681 discard_length = end - one;
1682 } else {
1683 discard_length = zero - one;
1685 if (discard_length) {
1686 postcopy_discard_send_range(ms, pds, one, discard_length);
1688 current = one + discard_length;
1689 } else {
1690 current = one;
1694 return 0;
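
/*
 * Worked example: if the unsentmap bits relative to 'start' are
 * 0 1 1 1 0 0 1 0, the loop above first finds the run at bits 1-3 and sends
 * one discard covering pages start+1 .. start+3, then finds bit 6 and sends
 * a second discard for the single page start+6.
 */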
1698 * postcopy_each_ram_send_discard: discard all RAMBlocks
1700 * Returns 0 for success or negative for error
1702 * Utility for the outgoing postcopy code.
1703 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1704 * passing it bitmap indexes and name.
1705 * (qemu_ram_foreach_block ends up passing unscaled lengths
1706 * which would mean postcopy code would have to deal with target page)
1708 * @ms: current migration state
1710 static int postcopy_each_ram_send_discard(MigrationState *ms)
1712 struct RAMBlock *block;
1713 int ret;
1715 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1716 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1717 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1718 first,
1719 block->idstr);
1722 * Postcopy sends chunks of bitmap over the wire, but it
1723 * just needs indexes at this point, avoids it having
1724 * target page specific code.
1726 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1727 block->used_length >> TARGET_PAGE_BITS);
1728 postcopy_discard_send_finish(ms, pds);
1729 if (ret) {
1730 return ret;
1734 return 0;
1738 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1740 * Helper for postcopy_chunk_hostpages; it's called twice to
1741 * canonicalize the two bitmaps, that are similar, but one is
1742 * inverted.
1744 * Postcopy requires that all target pages in a hostpage are dirty or
1745 * clean, not a mix. This function canonicalizes the bitmaps.
1747 * @ms: current migration state
1748 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1749 * otherwise we need to canonicalize partially dirty host pages
1750 * @block: block that contains the page we want to canonicalize
1751 * @pds: state for postcopy
1753 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1754 RAMBlock *block,
1755 PostcopyDiscardState *pds)
1757 unsigned long *bitmap;
1758 unsigned long *unsentmap;
1759 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1760 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1761 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1762 unsigned long last = first + (len - 1);
1763 unsigned long run_start;
1765 if (block->page_size == TARGET_PAGE_SIZE) {
1766 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1767 return;
1770 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1771 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1773 if (unsent_pass) {
1774 /* Find a sent page */
1775 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1776 } else {
1777 /* Find a dirty page */
1778 run_start = find_next_bit(bitmap, last + 1, first);
1781 while (run_start <= last) {
1782 bool do_fixup = false;
1783 unsigned long fixup_start_addr;
1784 unsigned long host_offset;
1787 * If the start of this run of pages is in the middle of a host
1788 * page, then we need to fixup this host page.
1790 host_offset = run_start % host_ratio;
1791 if (host_offset) {
1792 do_fixup = true;
1793 run_start -= host_offset;
1794 fixup_start_addr = run_start;
1795 /* For the next pass */
1796 run_start = run_start + host_ratio;
1797 } else {
1798 /* Find the end of this run */
1799 unsigned long run_end;
1800 if (unsent_pass) {
1801 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1802 } else {
1803 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1806 * If the end isn't at the start of a host page, then the
1807 * run doesn't finish at the end of a host page
1808 * and we need to discard.
1810 host_offset = run_end % host_ratio;
1811 if (host_offset) {
1812 do_fixup = true;
1813 fixup_start_addr = run_end - host_offset;
1815 * This host page has gone, the next loop iteration starts
1816 * from after the fixup
1818 run_start = fixup_start_addr + host_ratio;
1819 } else {
1821 * No discards on this iteration, next loop starts from
1822 * next sent/dirty page
1824 run_start = run_end + 1;
1828 if (do_fixup) {
1829 unsigned long page;
1831 /* Tell the destination to discard this page */
1832 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1833 /* For the unsent_pass we:
1834 * discard partially sent pages
1835 * For the !unsent_pass (dirty) we:
1836 * discard partially dirty pages that were sent
1837 * (any partially sent pages were already discarded
1838 * by the previous unsent_pass)
1840 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1841 host_ratio);
1844 /* Clean up the bitmap */
1845 for (page = fixup_start_addr;
1846 page < fixup_start_addr + host_ratio; page++) {
1847 /* All pages in this host page are now not sent */
1848 set_bit(page, unsentmap);
1851 * Remark them as dirty, updating the count for any pages
1852 * that weren't previously dirty.
1854 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1858 if (unsent_pass) {
1859 /* Find the next sent page for the next iteration */
1860 run_start = find_next_zero_bit(unsentmap, last + 1,
1861 run_start);
1862 } else {
1863 /* Find the next dirty page for the next iteration */
1864 run_start = find_next_bit(bitmap, last + 1, run_start);
1870 * postcopy_chunk_hostpages: discard any partially sent host page
1872 * Utility for the outgoing postcopy code.
1874 * Discard any partially sent host-page size chunks, mark any partially
1875 * dirty host-page size chunks as all dirty. In this case the host-page
1876 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1878 * Returns zero on success
1880 * @ms: current migration state
1882 static int postcopy_chunk_hostpages(MigrationState *ms)
1884 RAMState *rs = &ram_state;
1885 struct RAMBlock *block;
1887 /* Easiest way to make sure we don't resume in the middle of a host-page */
1888 rs->last_seen_block = NULL;
1889 rs->last_sent_block = NULL;
1890 rs->last_offset = 0;
1892 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1893 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1895 PostcopyDiscardState *pds =
1896 postcopy_discard_send_init(ms, first, block->idstr);
1898 /* First pass: Discard all partially sent host pages */
1899 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1901 * Second pass: Ensure that all partially dirty host pages are made
1902 * fully dirty.
1904 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1906 postcopy_discard_send_finish(ms, pds);
1907 } /* ram_list loop */
1909 return 0;
1913 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1915 * Returns zero on success
1917 * Transmit the set of pages to be discarded after precopy to the target
1918 * these are pages that:
1919 * a) Have been previously transmitted but are now dirty again
1920 * b) Pages that have never been transmitted, this ensures that
1921 * any pages on the destination that have been mapped by background
1922 * tasks get discarded (transparent huge pages is the specific concern)
1923 * Hopefully this is pretty sparse
1925 * @ms: current migration state
1927 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1929 int ret;
1930 unsigned long *bitmap, *unsentmap;
1932 rcu_read_lock();
1934 /* This should be our last sync, the src is now paused */
1935 migration_bitmap_sync(&ram_state);
1937 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1938 if (!unsentmap) {
1939 /* We don't have a safe way to resize the unsentmap, so
1940 * if the bitmap was resized it will be NULL at this
1941 * point.
1943 error_report("migration ram resized during precopy phase");
1944 rcu_read_unlock();
1945 return -EINVAL;
1948 /* Deal with TPS != HPS and huge pages */
1949 ret = postcopy_chunk_hostpages(ms);
1950 if (ret) {
1951 rcu_read_unlock();
1952 return ret;
1956 * Update the unsentmap to be unsentmap = unsentmap | dirty
1958 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1959 bitmap_or(unsentmap, unsentmap, bitmap,
1960 last_ram_offset() >> TARGET_PAGE_BITS);
1963 trace_ram_postcopy_send_discard_bitmap();
1964 #ifdef DEBUG_POSTCOPY
1965 ram_debug_dump_bitmap(unsentmap, true);
1966 #endif
1968 ret = postcopy_each_ram_send_discard(ms);
1969 rcu_read_unlock();
1971 return ret;
1975 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1977 * Returns zero on success
1979 * @mis: current migration incoming state
1980 * @rbname: name of the RAMBlock of the request. NULL means the
1981 * same as the last one.
1982 * @start: RAMBlock starting page
1983 * @length: RAMBlock size
1985 int ram_discard_range(MigrationIncomingState *mis,
1986 const char *rbname,
1987 uint64_t start, size_t length)
1989 int ret = -1;
1991 trace_ram_discard_range(rbname, start, length);
1993 rcu_read_lock();
1994 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1996 if (!rb) {
1997 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1998 goto err;
2001 ret = ram_block_discard_range(rb, start, length);
2003 err:
2004 rcu_read_unlock();
2006 return ret;
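/*
 * A hypothetical call site, dropping a single host page from a block named
 * "pc.ram"; the block name and 'offset' are made up for illustration only:
 *
 *     if (ram_discard_range(mis, "pc.ram", offset, qemu_host_page_size)) {
 *         error_report("postcopy: discard of one host page failed");
 *     }
 */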
2009 static int ram_save_init_globals(RAMState *rs)
2011 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
2013 rs->dirty_rate_high_cnt = 0;
2014 rs->bitmap_sync_count = 0;
2015 rs->zero_pages = 0;
2016 migration_bitmap_sync_init(rs);
2017 qemu_mutex_init(&migration_bitmap_mutex);
2019 if (migrate_use_xbzrle()) {
2020 XBZRLE_cache_lock();
2021 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
2022 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2023 TARGET_PAGE_SIZE,
2024 TARGET_PAGE_SIZE);
2025 if (!XBZRLE.cache) {
2026 XBZRLE_cache_unlock();
2027 error_report("Error creating cache");
2028 return -1;
2030 XBZRLE_cache_unlock();
2032 /* We prefer not to abort if there is no memory */
2033 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2034 if (!XBZRLE.encoded_buf) {
2035 error_report("Error allocating encoded_buf");
2036 return -1;
2039 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2040 if (!XBZRLE.current_buf) {
2041 error_report("Error allocating current_buf");
2042 g_free(XBZRLE.encoded_buf);
2043 XBZRLE.encoded_buf = NULL;
2044 return -1;
2047 acct_clear();
2050 /* For memory_global_dirty_log_start below. */
2051 qemu_mutex_lock_iothread();
2053 qemu_mutex_lock_ramlist();
2054 rcu_read_lock();
2055 bytes_transferred = 0;
2056 ram_state_reset(rs);
2058 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
2059 /* Skip setting bitmap if there is no RAM */
2060 if (ram_bytes_total()) {
2061 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2062 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
2063 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
2065 if (migrate_postcopy_ram()) {
2066 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
2067 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
2072 * Count the total number of pages used by ram blocks not including any
2073 * gaps due to alignment or unplugs.
2075 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2077 memory_global_dirty_log_start();
2078 migration_bitmap_sync(rs);
2079 qemu_mutex_unlock_ramlist();
2080 qemu_mutex_unlock_iothread();
2081 rcu_read_unlock();
2083 return 0;
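/*
 * Rough sizing of the XBZRLE cache created above, using example numbers
 * only (a 64 MiB cache with 4 KiB target pages):
 *
 *     size_t cache_bytes = 64 * 1024 * 1024;   // migrate_xbzrle_cache_size()
 *     size_t cache_slots = cache_bytes / 4096; // 16384 page-sized slots
 *     // cache_init(cache_slots, TARGET_PAGE_SIZE) returns NULL on failure,
 *     // which is treated as a fatal setup error here.
 */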
2087 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2088  * a long-running RCU critical section. When RCU reclaims in the code
2089  * start to become numerous, it will be necessary to reduce the
2090 * granularity of these critical sections.
2094 * ram_save_setup: Setup RAM for migration
2096 * Returns zero to indicate success and negative for error
2098 * @f: QEMUFile where to send the data
2099 * @opaque: RAMState pointer
2101 static int ram_save_setup(QEMUFile *f, void *opaque)
2103 RAMState *rs = opaque;
2104 RAMBlock *block;
2106  /* migration has already set up the bitmap, reuse it. */
2107 if (!migration_in_colo_state()) {
2108 if (ram_save_init_globals(rs) < 0) {
2109 return -1;
2113 rcu_read_lock();
2115 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2117 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2118 qemu_put_byte(f, strlen(block->idstr));
2119 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2120 qemu_put_be64(f, block->used_length);
2121 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2122 qemu_put_be64(f, block->page_size);
2126 rcu_read_unlock();
2128 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2129 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2131 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2133 return 0;
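/*
 * Wire layout produced by the setup above, written here as the destination
 * reads it back (cf. the RAM_SAVE_FLAG_MEM_SIZE branch of ram_load());
 * a summary only:
 *
 *     total = qemu_get_be64(f);                 // ram_bytes_total() | MEM_SIZE
 *     for each block:
 *         len  = qemu_get_byte(f);              // strlen(block->idstr)
 *         qemu_get_buffer(f, (uint8_t *)id, len);
 *         used = qemu_get_be64(f);              // block->used_length
 *         // plus one extra be64 page size when postcopy is enabled and the
 *         // block's page size differs from qemu_host_page_size
 *     eos = qemu_get_be64(f);                   // RAM_SAVE_FLAG_EOS
 */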
2137 * ram_save_iterate: iterative stage for migration
2139 * Returns zero to indicate success and negative for error
2141 * @f: QEMUFile where to send the data
2142 * @opaque: RAMState pointer
2144 static int ram_save_iterate(QEMUFile *f, void *opaque)
2146 RAMState *rs = opaque;
2147 int ret;
2148 int i;
2149 int64_t t0;
2150 int done = 0;
2152 rcu_read_lock();
2153 if (ram_list.version != rs->last_version) {
2154 ram_state_reset(rs);
2157 /* Read version before ram_list.blocks */
2158 smp_rmb();
2160 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2162 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2163 i = 0;
2164 while ((ret = qemu_file_rate_limit(f)) == 0) {
2165 int pages;
2167 pages = ram_find_and_save_block(rs, f, false, &bytes_transferred);
2168  /* no more pages to send */
2169 if (pages == 0) {
2170 done = 1;
2171 break;
2173 acct_info.iterations++;
2175 /* we want to check in the 1st loop, just in case it was the 1st time
2176 and we had to sync the dirty bitmap.
2177  qemu_clock_get_ns() is a bit expensive, so we only check once every
2178  few iterations
2180 if ((i & 63) == 0) {
2181 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2182 if (t1 > MAX_WAIT) {
2183 trace_ram_save_iterate_big_wait(t1, i);
2184 break;
2187 i++;
2189 flush_compressed_data(f);
2190 rcu_read_unlock();
2193 * Must occur before EOS (or any QEMUFile operation)
2194 * because of RDMA protocol.
2196 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2198 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2199 bytes_transferred += 8;
2201 ret = qemu_file_get_error(f);
2202 if (ret < 0) {
2203 return ret;
2206 return done;
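/*
 * A hypothetical driver for the iterative stage, showing how the return
 * values above are meant to be consumed; the real caller is the generic
 * savevm code, and handle_error() is a placeholder:
 *
 *     do {
 *         ret = ram_save_iterate(f, rs);   // 0: rate limit hit, more to do
 *     } while (ret == 0);
 *     if (ret < 0) {
 *         handle_error(ret);               // QEMUFile or stream error
 *     }                                    // ret == 1: no dirty pages were left
 */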
2210  * ram_save_complete: function called to send the remaining amount of RAM
2212 * Returns zero to indicate success
2214  * Called with the iothread lock held
2216 * @f: QEMUFile where to send the data
2217 * @opaque: RAMState pointer
2219 static int ram_save_complete(QEMUFile *f, void *opaque)
2221 RAMState *rs = opaque;
2223 rcu_read_lock();
2225 if (!migration_in_postcopy(migrate_get_current())) {
2226 migration_bitmap_sync(rs);
2229 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2231 /* try transferring iterative blocks of memory */
2233 /* flush all remaining blocks regardless of rate limiting */
2234 while (true) {
2235 int pages;
2237 pages = ram_find_and_save_block(rs, f, !migration_in_colo_state(),
2238 &bytes_transferred);
2239  /* no more blocks to send */
2240 if (pages == 0) {
2241 break;
2245 flush_compressed_data(f);
2246 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2248 rcu_read_unlock();
2250 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2252 return 0;
2255 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2256 uint64_t *non_postcopiable_pending,
2257 uint64_t *postcopiable_pending)
2259 RAMState *rs = opaque;
2260 uint64_t remaining_size;
2262 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2264 if (!migration_in_postcopy(migrate_get_current()) &&
2265 remaining_size < max_size) {
2266 qemu_mutex_lock_iothread();
2267 rcu_read_lock();
2268 migration_bitmap_sync(rs);
2269 rcu_read_unlock();
2270 qemu_mutex_unlock_iothread();
2271 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2274 /* We can do postcopy, and all the data is postcopiable */
2275 *postcopiable_pending += remaining_size;
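/*
 * Sketch of how the two counters filled in above are usually interpreted by
 * the migration core (an assumption based on the split here; the
 * authoritative logic lives in migration.c):
 *
 *     if (non_postcopiable_pending + postcopiable_pending <= max_size) {
 *         // small enough: stop the guest and run ram_save_complete()
 *     } else if (postcopy_allowed && non_postcopiable_pending <= max_size) {
 *         // only postcopiable data is left: postcopy may start now
 *     }
 */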
2278 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2280 unsigned int xh_len;
2281 int xh_flags;
2282 uint8_t *loaded_data;
2284 if (!xbzrle_decoded_buf) {
2285 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2287 loaded_data = xbzrle_decoded_buf;
2289 /* extract RLE header */
2290 xh_flags = qemu_get_byte(f);
2291 xh_len = qemu_get_be16(f);
2293 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2294 error_report("Failed to load XBZRLE page - wrong compression!");
2295 return -1;
2298 if (xh_len > TARGET_PAGE_SIZE) {
2299 error_report("Failed to load XBZRLE page - len overflow!");
2300 return -1;
2302 /* load data and decode */
2303 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2305 /* decode RLE */
2306 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2307 TARGET_PAGE_SIZE) == -1) {
2308 error_report("Failed to load XBZRLE page - decode error!");
2309 return -1;
2312 return 0;
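/*
 * On-the-wire layout consumed above for one XBZRLE page (a restatement of
 * the reads in this function, not a separate specification):
 *
 *     uint8_t  xh_flags;       // must be ENCODING_FLAG_XBZRLE
 *     uint16_t xh_len;         // big-endian, at most TARGET_PAGE_SIZE
 *     uint8_t  data[xh_len];   // delta stream, decoded against 'host'
 */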
2316 * ram_block_from_stream: read a RAMBlock id from the migration stream
2318  * Must be called from within an RCU critical section.
2320 * Returns a pointer from within the RCU-protected ram_list.
2322 * @f: QEMUFile where to read the data from
2323 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2325 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2327 static RAMBlock *block = NULL;
2328 char id[256];
2329 uint8_t len;
2331 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2332 if (!block) {
2333 error_report("Ack, bad migration stream!");
2334 return NULL;
2336 return block;
2339 len = qemu_get_byte(f);
2340 qemu_get_buffer(f, (uint8_t *)id, len);
2341 id[len] = 0;
2343 block = qemu_ram_block_by_name(id);
2344 if (!block) {
2345 error_report("Can't find block %s", id);
2346 return NULL;
2349 return block;
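/*
 * Block addressing on the wire, as read above: with RAM_SAVE_FLAG_CONTINUE
 * set, the page belongs to the same block as the previous one and no name
 * is sent; otherwise the name follows the page header:
 *
 *     uint8_t len;        // strlen() of the block idstr, at most 255
 *     char    id[len];    // idstr bytes; the reader appends the NUL itself
 */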
2352 static inline void *host_from_ram_block_offset(RAMBlock *block,
2353 ram_addr_t offset)
2355 if (!offset_in_ramblock(block, offset)) {
2356 return NULL;
2359 return block->host + offset;
2363 * ram_handle_compressed: handle the zero page case
2365 * If a page (or a whole RDMA chunk) has been
2366 * determined to be zero, then zap it.
2368 * @host: host address for the zero page
2369  * @ch: the byte the page is filled with; only zero is supported
2370 * @size: size of the zero page
2372 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2374 if (ch != 0 || !is_zero_range(host, size)) {
2375 memset(host, ch, size);
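/*
 * The guard above avoids writing to pages that are already zero, so the
 * destination does not needlessly dirty (and hence allocate) them. The
 * usual caller is the RAM_SAVE_FLAG_COMPRESS path of ram_load():
 *
 *     ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 */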
2379 static void *do_data_decompress(void *opaque)
2381 DecompressParam *param = opaque;
2382 unsigned long pagesize;
2383 uint8_t *des;
2384 int len;
2386 qemu_mutex_lock(&param->mutex);
2387 while (!param->quit) {
2388 if (param->des) {
2389 des = param->des;
2390 len = param->len;
2391 param->des = 0;
2392 qemu_mutex_unlock(&param->mutex);
2394 pagesize = TARGET_PAGE_SIZE;
2395  /* uncompress() can fail in some cases, especially when the
2396  * page was dirtied while being compressed. That is not a
2397  * problem because the dirty page will be retransferred and
2398  * uncompress() won't break the data in other pages.
2400 uncompress((Bytef *)des, &pagesize,
2401 (const Bytef *)param->compbuf, len);
2403 qemu_mutex_lock(&decomp_done_lock);
2404 param->done = true;
2405 qemu_cond_signal(&decomp_done_cond);
2406 qemu_mutex_unlock(&decomp_done_lock);
2408 qemu_mutex_lock(&param->mutex);
2409 } else {
2410 qemu_cond_wait(&param->cond, &param->mutex);
2413 qemu_mutex_unlock(&param->mutex);
2415 return NULL;
2418 static void wait_for_decompress_done(void)
2420 int idx, thread_count;
2422 if (!migrate_use_compression()) {
2423 return;
2426 thread_count = migrate_decompress_threads();
2427 qemu_mutex_lock(&decomp_done_lock);
2428 for (idx = 0; idx < thread_count; idx++) {
2429 while (!decomp_param[idx].done) {
2430 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2433 qemu_mutex_unlock(&decomp_done_lock);
2436 void migrate_decompress_threads_create(void)
2438 int i, thread_count;
2440 thread_count = migrate_decompress_threads();
2441 decompress_threads = g_new0(QemuThread, thread_count);
2442 decomp_param = g_new0(DecompressParam, thread_count);
2443 qemu_mutex_init(&decomp_done_lock);
2444 qemu_cond_init(&decomp_done_cond);
2445 for (i = 0; i < thread_count; i++) {
2446 qemu_mutex_init(&decomp_param[i].mutex);
2447 qemu_cond_init(&decomp_param[i].cond);
2448 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2449 decomp_param[i].done = true;
2450 decomp_param[i].quit = false;
2451 qemu_thread_create(decompress_threads + i, "decompress",
2452 do_data_decompress, decomp_param + i,
2453 QEMU_THREAD_JOINABLE);
2457 void migrate_decompress_threads_join(void)
2459 int i, thread_count;
2461 thread_count = migrate_decompress_threads();
2462 for (i = 0; i < thread_count; i++) {
2463 qemu_mutex_lock(&decomp_param[i].mutex);
2464 decomp_param[i].quit = true;
2465 qemu_cond_signal(&decomp_param[i].cond);
2466 qemu_mutex_unlock(&decomp_param[i].mutex);
2468 for (i = 0; i < thread_count; i++) {
2469 qemu_thread_join(decompress_threads + i);
2470 qemu_mutex_destroy(&decomp_param[i].mutex);
2471 qemu_cond_destroy(&decomp_param[i].cond);
2472 g_free(decomp_param[i].compbuf);
2474 g_free(decompress_threads);
2475 g_free(decomp_param);
2476 decompress_threads = NULL;
2477 decomp_param = NULL;
2480 static void decompress_data_with_multi_threads(QEMUFile *f,
2481 void *host, int len)
2483 int idx, thread_count;
2485 thread_count = migrate_decompress_threads();
2486 qemu_mutex_lock(&decomp_done_lock);
2487 while (true) {
2488 for (idx = 0; idx < thread_count; idx++) {
2489 if (decomp_param[idx].done) {
2490 decomp_param[idx].done = false;
2491 qemu_mutex_lock(&decomp_param[idx].mutex);
2492 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2493 decomp_param[idx].des = host;
2494 decomp_param[idx].len = len;
2495 qemu_cond_signal(&decomp_param[idx].cond);
2496 qemu_mutex_unlock(&decomp_param[idx].mutex);
2497 break;
2500 if (idx < thread_count) {
2501 break;
2502 } else {
2503 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2506 qemu_mutex_unlock(&decomp_done_lock);
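/*
 * Hand-off protocol between the feeder above and do_data_decompress(),
 * written out step by step (a summary of the locking in the code, not new
 * behaviour):
 *
 *     feeder:  done = false; lock(param->mutex); read compbuf from stream;
 *              des = host; len = n; signal(param->cond); unlock(param->mutex);
 *     worker:  wake on param->cond; uncompress(des, ..., compbuf, len);
 *              lock(decomp_done_lock); done = true;
 *              signal(decomp_done_cond); unlock(decomp_done_lock);
 */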
2510 * ram_postcopy_incoming_init: allocate postcopy data structures
2512  * Returns 0 for success and negative on error
2514 * @mis: current migration incoming state
2516  * Allocate the data structures etc. needed by incoming migration with
2517  * postcopy-ram. postcopy-ram's similarly named
2518 * postcopy_ram_incoming_init does the work.
2520 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2522 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2524 return postcopy_ram_incoming_init(mis, ram_pages);
2528 * ram_load_postcopy: load a page in postcopy case
2530 * Returns 0 for success or -errno in case of error
2532 * Called in postcopy mode by ram_load().
2533 * rcu_read_lock is taken prior to this being called.
2535  * @f: QEMUFile to read the data from
2537 static int ram_load_postcopy(QEMUFile *f)
2539 int flags = 0, ret = 0;
2540 bool place_needed = false;
2541 bool matching_page_sizes = false;
2542 MigrationIncomingState *mis = migration_incoming_get_current();
2543 /* Temporary page that is later 'placed' */
2544 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2545 void *last_host = NULL;
2546 bool all_zero = false;
2548 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2549 ram_addr_t addr;
2550 void *host = NULL;
2551 void *page_buffer = NULL;
2552 void *place_source = NULL;
2553 RAMBlock *block = NULL;
2554 uint8_t ch;
2556 addr = qemu_get_be64(f);
2557 flags = addr & ~TARGET_PAGE_MASK;
2558 addr &= TARGET_PAGE_MASK;
2560 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2561 place_needed = false;
2562 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2563 block = ram_block_from_stream(f, flags);
2565 host = host_from_ram_block_offset(block, addr);
2566 if (!host) {
2567 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2568 ret = -EINVAL;
2569 break;
2571 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2573 * Postcopy requires that we place whole host pages atomically;
2574 * these may be huge pages for RAMBlocks that are backed by
2575 * hugetlbfs.
2576 * To make it atomic, the data is read into a temporary page
2577 * that's moved into place later.
2578  * The migration protocol uses, possibly smaller, target pages;
2579  * however, the source ensures it always sends all the components
2580 * of a host page in order.
2582 page_buffer = postcopy_host_page +
2583 ((uintptr_t)host & (block->page_size - 1));
2584 /* If all TP are zero then we can optimise the place */
2585 if (!((uintptr_t)host & (block->page_size - 1))) {
2586 all_zero = true;
2587 } else {
2588 /* not the 1st TP within the HP */
2589 if (host != (last_host + TARGET_PAGE_SIZE)) {
2590 error_report("Non-sequential target page %p/%p",
2591 host, last_host);
2592 ret = -EINVAL;
2593 break;
2599 * If it's the last part of a host page then we place the host
2600 * page
2602 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2603 (block->page_size - 1)) == 0;
2604 place_source = postcopy_host_page;
2606 last_host = host;
2608 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2609 case RAM_SAVE_FLAG_COMPRESS:
2610 ch = qemu_get_byte(f);
2611 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2612 if (ch) {
2613 all_zero = false;
2615 break;
2617 case RAM_SAVE_FLAG_PAGE:
2618 all_zero = false;
2619 if (!place_needed || !matching_page_sizes) {
2620 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2621 } else {
2622  /* Avoid the extra qemu_file copy, since postcopy is going
2623  * to copy the page again when it is placed; this only works
2624  * when we read the whole page in one go (matching page sizes)
2626 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2627 TARGET_PAGE_SIZE);
2629 break;
2630 case RAM_SAVE_FLAG_EOS:
2631 /* normal exit */
2632 break;
2633 default:
2634 error_report("Unknown combination of migration flags: %#x"
2635 " (postcopy mode)", flags);
2636 ret = -EINVAL;
2639 if (place_needed) {
2640 /* This gets called at the last target page in the host page */
2641 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2643 if (all_zero) {
2644 ret = postcopy_place_page_zero(mis, place_dest,
2645 block->page_size);
2646 } else {
2647 ret = postcopy_place_page(mis, place_dest,
2648 place_source, block->page_size);
2651 if (!ret) {
2652 ret = qemu_file_get_error(f);
2656 return ret;
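/*
 * Offset arithmetic used above to assemble a (possibly huge) host page from
 * target-page sized pieces, written out explicitly; the local names below
 * are introduced for illustration only:
 *
 *     size_t in_hp_off  = (uintptr_t)host & (block->page_size - 1);
 *     void  *page_buf   = postcopy_host_page + in_hp_off;   // staging slot
 *     bool   first_tp   = (in_hp_off == 0);                 // may be all zero
 *     bool   last_tp    = (in_hp_off + TARGET_PAGE_SIZE) == block->page_size;
 *     void  *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
 *     // last_tp triggers postcopy_place_page{,_zero}() on place_dest
 */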
2659 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2661 int flags = 0, ret = 0;
2662 static uint64_t seq_iter;
2663 int len = 0;
2665  * If the system is running in postcopy mode, page inserts into host
2666  * memory must be atomic
2668 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2669  /* ADVISE is earlier; it shows the source has the postcopy capability on */
2670 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2672 seq_iter++;
2674 if (version_id != 4) {
2675 ret = -EINVAL;
2678 /* This RCU critical section can be very long running.
2679 * When RCU reclaims in the code start to become numerous,
2680 * it will be necessary to reduce the granularity of this
2681 * critical section.
2683 rcu_read_lock();
2685 if (postcopy_running) {
2686 ret = ram_load_postcopy(f);
2689 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2690 ram_addr_t addr, total_ram_bytes;
2691 void *host = NULL;
2692 uint8_t ch;
2694 addr = qemu_get_be64(f);
2695 flags = addr & ~TARGET_PAGE_MASK;
2696 addr &= TARGET_PAGE_MASK;
2698 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2699 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2700 RAMBlock *block = ram_block_from_stream(f, flags);
2702 host = host_from_ram_block_offset(block, addr);
2703 if (!host) {
2704 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2705 ret = -EINVAL;
2706 break;
2710 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2711 case RAM_SAVE_FLAG_MEM_SIZE:
2712 /* Synchronize RAM block list */
2713 total_ram_bytes = addr;
2714 while (!ret && total_ram_bytes) {
2715 RAMBlock *block;
2716 char id[256];
2717 ram_addr_t length;
2719 len = qemu_get_byte(f);
2720 qemu_get_buffer(f, (uint8_t *)id, len);
2721 id[len] = 0;
2722 length = qemu_get_be64(f);
2724 block = qemu_ram_block_by_name(id);
2725 if (block) {
2726 if (length != block->used_length) {
2727 Error *local_err = NULL;
2729 ret = qemu_ram_resize(block, length,
2730 &local_err);
2731 if (local_err) {
2732 error_report_err(local_err);
2735 /* For postcopy we need to check hugepage sizes match */
2736 if (postcopy_advised &&
2737 block->page_size != qemu_host_page_size) {
2738 uint64_t remote_page_size = qemu_get_be64(f);
2739 if (remote_page_size != block->page_size) {
2740 error_report("Mismatched RAM page size %s "
2741 "(local) %zd != %" PRId64,
2742 id, block->page_size,
2743 remote_page_size);
2744 ret = -EINVAL;
2747 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2748 block->idstr);
2749 } else {
2750 error_report("Unknown ramblock \"%s\", cannot "
2751 "accept migration", id);
2752 ret = -EINVAL;
2755 total_ram_bytes -= length;
2757 break;
2759 case RAM_SAVE_FLAG_COMPRESS:
2760 ch = qemu_get_byte(f);
2761 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2762 break;
2764 case RAM_SAVE_FLAG_PAGE:
2765 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2766 break;
2768 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2769 len = qemu_get_be32(f);
2770 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2771 error_report("Invalid compressed data length: %d", len);
2772 ret = -EINVAL;
2773 break;
2775 decompress_data_with_multi_threads(f, host, len);
2776 break;
2778 case RAM_SAVE_FLAG_XBZRLE:
2779 if (load_xbzrle(f, addr, host) < 0) {
2780 error_report("Failed to decompress XBZRLE page at "
2781 RAM_ADDR_FMT, addr);
2782 ret = -EINVAL;
2783 break;
2785 break;
2786 case RAM_SAVE_FLAG_EOS:
2787 /* normal exit */
2788 break;
2789 default:
2790 if (flags & RAM_SAVE_FLAG_HOOK) {
2791 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2792 } else {
2793 error_report("Unknown combination of migration flags: %#x",
2794 flags);
2795 ret = -EINVAL;
2798 if (!ret) {
2799 ret = qemu_file_get_error(f);
2803 wait_for_decompress_done();
2804 rcu_read_unlock();
2805 trace_ram_load_complete(ret, seq_iter);
2806 return ret;
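/*
 * Shape of one record in the precopy stream, as parsed by the loop above
 * (a restatement of the reads in this function):
 *
 *     be64 addr|flags;              // flags live in the sub-page bits
 *     [u8 len, id[len]]             // only for page-carrying flags and only
 *                                   // when RAM_SAVE_FLAG_CONTINUE is clear
 *     payload by flag:
 *       MEM_SIZE       -> per-block idstr/used_length records (see above)
 *       COMPRESS       -> one fill byte
 *       PAGE           -> TARGET_PAGE_SIZE raw bytes
 *       COMPRESS_PAGE  -> be32 len followed by len zlib-compressed bytes
 *       XBZRLE         -> see load_xbzrle()
 *       EOS            -> nothing; ends the section
 */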
2809 static SaveVMHandlers savevm_ram_handlers = {
2810 .save_live_setup = ram_save_setup,
2811 .save_live_iterate = ram_save_iterate,
2812 .save_live_complete_postcopy = ram_save_complete,
2813 .save_live_complete_precopy = ram_save_complete,
2814 .save_live_pending = ram_save_pending,
2815 .load_state = ram_load,
2816 .cleanup = ram_migration_cleanup,
2819 void ram_mig_init(void)
2821 qemu_mutex_init(&XBZRLE.lock);
2822 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);